diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp
index 65726f06dedb47..4690590e8f9eed 100644
--- a/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -238,8 +238,38 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
 }
 
 void SlotIndexes::packIndexes() {
-  for (auto [Index, Entry] : enumerate(indexList))
-    Entry.setIndex(Index * SlotIndex::InstrDist);
+  unsigned Index = 0;
+  // Check that the dummy entry for the start of the first block does not need
+  // updating. It should always be 0.
+  assert(idx2MBBMap[0].second->getNumber() == 0 &&
+         "First MBB should be number 0!");
+  assert(MBBRanges[0].first.getIndex() == Index && "First index should be 0!");
+  Index += SlotIndex::InstrDist;
+  // Iterate over basic blocks in slot index order.
+  for (MachineBasicBlock *MBB : make_second_range(idx2MBBMap)) {
+    auto [MBBStartIdx, MBBEndIdx] = MBBRanges[MBB->getNumber()];
+    auto Start = MBBStartIdx.listEntry()->getIterator();
+    auto End = MBBEndIdx.listEntry()->getIterator();
+    // Update entries for each instruction in the block.
+    for (auto &I : make_early_inc_range(make_range(std::next(Start), End))) {
+      if (I.getInstr()) {
+        I.setIndex(Index);
+        Index += SlotIndex::InstrDist;
+      } else {
+        // Remove entries for deleted instructions.
+        // FIXME: Eventually we want to remove them in
+        // removeMachineInstrFromMaps but that is not currently possible because
+        // some SlotIndexes API functions are called in a transiently broken
+        // state where some live ranges still refer to indexes of deleted
+        // instructions.
+        // TODO: Add removed entries to a free list so they can be reused?
+        indexList.remove(I);
+      }
+    }
+    // Update the dummy entry for the end of the block.
+    End->setIndex(Index);
+    Index += SlotIndex::InstrDist;
+  }
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
index fb4bef33d9b4ff..348528f02d9321 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
@@ -236,8 +236,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 ; -O0: stxp w8, x0, x1, [x9]
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_unordered:
-; -O1: ldxp x0, x1, [x8]
-; -O1: stxp w9, x0, x1, [x8]
+; -O1: ldxp x8, x1, [x0]
+; -O1: stxp w9, x8, x1, [x0]
 %r = load atomic i128, ptr %ptr unordered, align 16
 ret i128 %r
 }
@@ -251,8 +251,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt
 ; -O0: stxp w8, x0, x1, [x9]
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_unordered_const:
-; -O1: ldxp x0, x1, [x8]
-; -O1: stxp w9, x0, x1, [x8]
+; -O1: ldxp x8, x1, [x0]
+; -O1: stxp w9, x8, x1, [x0]
 %r = load atomic i128, ptr %ptr unordered, align 16
 ret i128 %r
 }
@@ -266,8 +266,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 ; -O0: stxp w8, x0, x1, [x9]
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_monotonic:
-; -O1: ldxp x0, x1, [x8]
-; -O1: stxp w9, x0, x1, [x8]
+; -O1: ldxp x8, x1, [x0]
+; -O1: stxp w9, x8, x1, [x0]
 %r = load atomic i128, ptr %ptr monotonic, align 16
 ret i128 %r
 }
@@ -281,8 +281,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt
 ; -O0: stxp w8, x0, x1, [x9]
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const:
-; -O1: ldxp x0, x1, [x8]
-; -O1: stxp w9, x0, x1, [x8]
+; -O1: ldxp x8, x1, [x0]
+; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -296,8 +296,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) { ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -311,8 +311,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire_const: -; -O1: ldaxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -326,8 +326,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) { ; -O0: stlxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: stlxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stlxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } @@ -341,8 +341,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) ; -O0: stlxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const: -; -O1: ldaxp x0, x1, [x8] -; -O1: stlxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stlxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc.ll index 373b040ebec65d..c5c03cbb076311 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc.ll @@ -236,8 +236,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) { ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_unordered: -; -O1: ldxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -251,8 +251,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_unordered_const: -; -O1: ldxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -266,8 +266,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) { ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -281,8 +281,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const: -; -O1: ldxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -296,8 +296,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) { ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load 
atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -311,8 +311,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire_const: -; -O1: ldaxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -326,8 +326,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) { ; -O0: stlxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: stlxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stlxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } @@ -341,8 +341,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) ; -O0: stlxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const: -; -O1: ldaxp x0, x1, [x8] -; -O1: stlxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stlxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-v8a.ll index 045e080983d5f8..0368ec909e5363 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-v8a.ll @@ -236,8 +236,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) { ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_unordered: -; -O1: ldxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -251,8 +251,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_unordered_const: -; -O1: ldxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -266,8 +266,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) { ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -281,8 +281,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const: -; -O1: ldxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -296,8 +296,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) { ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -311,8 +311,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) ; -O0: stxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire_const: -; -O1: ldaxp x0, x1, [x8] -; -O1: stxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr acquire, 
align 16 ret i128 %r } @@ -326,8 +326,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) { ; -O0: stlxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: stlxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stlxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } @@ -341,8 +341,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) ; -O0: stlxp w8, x0, x1, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const: -; -O1: ldaxp x0, x1, [x8] -; -O1: stlxp w9, x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: stlxp w9, x8, x1, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll index 0c52a8a683e3a0..55d48f1bd6226b 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll @@ -156,8 +156,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_monotonic(ptr %ptr, i32 %value) ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_monotonic: -; -O1: ldxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value monotonic, align 4 ret i32 %r } @@ -170,8 +170,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acquire: -; -O1: ldaxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acquire, align 4 ret i32 %r } @@ -184,8 +184,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_release: -; -O1: ldxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value release, align 4 ret i32 %r } @@ -198,8 +198,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acq_rel: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acq_rel, align 4 ret i32 %r } @@ -212,8 +212,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_seq_cst: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value seq_cst, align 4 ret i32 %r } @@ -226,8 +226,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -240,8 +240,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -254,8 +254,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_release(ptr %ptr, i64 
%value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -268,8 +268,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -282,8 +282,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -852,9 +852,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -868,9 +868,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -884,9 +884,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -900,9 +900,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -916,9 +916,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -939,9 +939,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -962,9 +962,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp 
w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -985,9 +985,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1008,9 +1008,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1031,9 +1031,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1632,9 +1632,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -1648,9 +1648,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -1664,9 +1664,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -1680,9 +1680,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -1696,9 +1696,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -1718,9 +1718,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: 
atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1740,9 +1740,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1762,9 +1762,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1784,9 +1784,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1806,9 +1806,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2402,9 +2402,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2418,9 +2418,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2434,9 +2434,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2450,9 +2450,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2466,9 +2466,9 @@ define dso_local 
i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2488,10 +2488,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2511,10 +2511,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2534,10 +2534,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -2557,10 +2557,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2580,10 +2580,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3213,10 +3213,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -3231,10 +3231,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -3249,10 +3249,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: 
atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -3267,10 +3267,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -3285,10 +3285,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -3310,12 +3310,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3337,12 +3337,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3364,12 +3364,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3391,12 +3391,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3418,12 +3418,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4082,9 +4082,9 @@ define dso_local i64 
@atomicrmw_or_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4098,9 +4098,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4114,9 +4114,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4130,9 +4130,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4146,9 +4146,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4168,10 +4168,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4191,10 +4191,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4214,10 +4214,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -4237,10 +4237,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 
+; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4260,10 +4260,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4862,9 +4862,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4878,9 +4878,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4894,9 +4894,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4910,9 +4910,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4926,9 +4926,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4948,10 +4948,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4971,10 +4971,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4994,10 +4994,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: 
atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5017,10 +5017,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5040,10 +5040,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5725,10 +5725,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5745,10 +5745,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5765,10 +5765,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5785,10 +5785,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5805,10 +5805,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5838,11 +5838,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: 
atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5872,11 +5872,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5906,11 +5906,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5940,11 +5940,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5974,11 +5974,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6815,10 +6815,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6835,10 +6835,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6855,10 +6855,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le 
+; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6875,10 +6875,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6895,10 +6895,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6928,11 +6928,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6962,11 +6962,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6996,11 +6996,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -7030,11 +7030,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -7064,11 +7064,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -7900,10 +7900,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 
%value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -7920,10 +7920,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -7940,10 +7940,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -7960,10 +7960,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -7980,10 +7980,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -8013,11 +8013,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -8047,11 +8047,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -8081,11 +8081,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = 
atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -8115,11 +8115,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -8149,11 +8149,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -8980,10 +8980,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -9000,10 +9000,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -9020,10 +9020,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -9040,10 +9040,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -9060,10 +9060,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -9093,11 +9093,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp 
x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -9127,11 +9127,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -9161,11 +9161,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -9195,11 +9195,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -9229,11 +9229,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll index 89d22c59e630b4..7925d9e661a6cf 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll @@ -569,9 +569,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -592,9 +592,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -615,9 +615,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: 
atomicrmw_add_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -638,9 +638,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -661,9 +661,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1193,9 +1193,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1215,9 +1215,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1237,9 +1237,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1259,9 +1259,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1281,9 +1281,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1848,10 +1848,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, 
[x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1871,10 +1871,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1894,10 +1894,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1917,10 +1917,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1940,10 +1940,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2548,10 +2548,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2566,10 +2566,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2584,10 +2584,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2602,10 +2602,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2620,10 +2620,10 @@ define dso_local i64 
@atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2645,12 +2645,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2672,12 +2672,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2699,12 +2699,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -2726,12 +2726,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2753,12 +2753,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3318,10 +3318,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3341,10 +3341,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: 
stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3364,10 +3364,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3387,10 +3387,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3410,10 +3410,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3863,10 +3863,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3886,10 +3886,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3909,10 +3909,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3932,10 +3932,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3955,10 +3955,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = 
atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4590,10 +4590,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4610,10 +4610,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4630,10 +4630,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4650,10 +4650,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4670,10 +4670,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4703,11 +4703,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4737,11 +4737,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4771,11 +4771,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, 
x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -4805,11 +4805,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4839,11 +4839,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5680,10 +5680,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5700,10 +5700,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5720,10 +5720,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5740,10 +5740,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5760,10 +5760,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5793,11 +5793,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: 
atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5827,11 +5827,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5861,11 +5861,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5895,11 +5895,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5929,11 +5929,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6765,10 +6765,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6785,10 +6785,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6805,10 +6805,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, 
x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6825,10 +6825,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6845,10 +6845,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6878,11 +6878,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6912,11 +6912,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6946,11 +6946,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -6980,11 +6980,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -7014,11 +7014,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -7845,10 +7845,10 @@ define dso_local i64 
@atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -7865,10 +7865,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -7885,10 +7885,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -7905,10 +7905,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -7925,10 +7925,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -7958,11 +7958,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -7992,11 +7992,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -8026,11 +8026,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, 
x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -8060,11 +8060,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -8094,11 +8094,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll index bb6163f5bc3875..4cadb98edc716d 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll @@ -156,8 +156,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_monotonic(ptr %ptr, i32 %value) ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_monotonic: -; -O1: ldxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value monotonic, align 4 ret i32 %r } @@ -170,8 +170,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acquire: -; -O1: ldaxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acquire, align 4 ret i32 %r } @@ -184,8 +184,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_release: -; -O1: ldxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value release, align 4 ret i32 %r } @@ -198,8 +198,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acq_rel: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acq_rel, align 4 ret i32 %r } @@ -212,8 +212,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_seq_cst: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value seq_cst, align 4 ret i32 %r } @@ -226,8 +226,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -240,8 +240,8 @@ define dso_local i64 
@atomicrmw_xchg_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -254,8 +254,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -268,8 +268,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -282,8 +282,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -852,9 +852,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -868,9 +868,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -884,9 +884,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -900,9 +900,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -916,9 +916,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -939,9 +939,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; 
-O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -962,9 +962,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -985,9 +985,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1008,9 +1008,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1031,9 +1031,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1632,9 +1632,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -1648,9 +1648,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -1664,9 +1664,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -1680,9 +1680,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -1696,9 +1696,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: 
atomicrmw_sub_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -1718,9 +1718,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1740,9 +1740,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1762,9 +1762,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1784,9 +1784,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1806,9 +1806,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2402,9 +2402,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2418,9 +2418,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2434,9 +2434,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2450,9 +2450,9 @@ define dso_local 
i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2466,9 +2466,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2488,10 +2488,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2511,10 +2511,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2534,10 +2534,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -2557,10 +2557,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2580,10 +2580,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3213,10 +3213,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -3231,10 +3231,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; 
-O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -3249,10 +3249,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -3267,10 +3267,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -3285,10 +3285,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -3310,12 +3310,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3337,12 +3337,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3364,12 +3364,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3391,12 +3391,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3418,12 +3418,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: 
atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4082,9 +4082,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4098,9 +4098,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4114,9 +4114,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4130,9 +4130,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4146,9 +4146,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4168,10 +4168,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4191,10 +4191,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4214,10 +4214,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr 
%ptr, i128 %value release, align 16 ret i128 %r } @@ -4237,10 +4237,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4260,10 +4260,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4862,9 +4862,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4878,9 +4878,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4894,9 +4894,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4910,9 +4910,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4926,9 +4926,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4948,10 +4948,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4971,10 +4971,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, 
x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4994,10 +4994,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5017,10 +5017,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5040,10 +5040,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5725,10 +5725,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5745,10 +5745,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5765,10 +5765,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5785,10 +5785,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5805,10 +5805,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt 
-; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5838,11 +5838,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5872,11 +5872,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5906,11 +5906,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5940,11 +5940,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5974,11 +5974,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6815,10 +6815,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6835,10 +6835,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ 
-6855,10 +6855,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6875,10 +6875,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6895,10 +6895,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6928,11 +6928,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6962,11 +6962,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6996,11 +6996,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -7030,11 +7030,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -7064,11 +7064,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; 
-O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -7900,10 +7900,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -7920,10 +7920,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -7940,10 +7940,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -7960,10 +7960,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -7980,10 +7980,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -8013,11 +8013,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -8047,11 +8047,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -8081,11 +8081,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, 
i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -8115,11 +8115,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -8149,11 +8149,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -8980,10 +8980,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -9000,10 +9000,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -9020,10 +9020,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -9040,10 +9040,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -9060,10 +9060,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r 
= atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -9093,11 +9093,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -9127,11 +9127,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -9161,11 +9161,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -9195,11 +9195,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -9229,11 +9229,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll index 635620bb5ae11b..681333a7254898 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll @@ -156,8 +156,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_monotonic(ptr %ptr, i32 %value) ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_monotonic: -; -O1: ldxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value monotonic, align 4 ret i32 %r } @@ -170,8 +170,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acquire: -; -O1: ldaxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acquire, align 4 ret i32 %r } 
@@ -184,8 +184,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_release: -; -O1: ldxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value release, align 4 ret i32 %r } @@ -198,8 +198,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acq_rel: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acq_rel, align 4 ret i32 %r } @@ -212,8 +212,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_seq_cst: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value seq_cst, align 4 ret i32 %r } @@ -226,8 +226,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -240,8 +240,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -254,8 +254,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -268,8 +268,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -282,8 +282,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -852,9 +852,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -868,9 +868,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -884,9 
+884,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -900,9 +900,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -916,9 +916,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -939,9 +939,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -962,9 +962,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -985,9 +985,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1008,9 +1008,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1031,9 +1031,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1632,9 +1632,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, 
x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -1648,9 +1648,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -1664,9 +1664,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -1680,9 +1680,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -1696,9 +1696,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -1718,9 +1718,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1740,9 +1740,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1762,9 +1762,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1784,9 +1784,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1806,9 +1806,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: 
stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2402,9 +2402,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2418,9 +2418,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2434,9 +2434,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2450,9 +2450,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2466,9 +2466,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2488,10 +2488,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2511,10 +2511,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2534,10 +2534,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -2557,10 +2557,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr 
%ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2580,10 +2580,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3213,10 +3213,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -3231,10 +3231,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -3249,10 +3249,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -3267,10 +3267,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -3285,10 +3285,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -3310,12 +3310,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3337,12 +3337,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x0, 
x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3364,12 +3364,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3391,12 +3391,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3418,12 +3418,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4082,9 +4082,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4098,9 +4098,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4114,9 +4114,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4130,9 +4130,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4146,9 +4146,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; 
-O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4168,10 +4168,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4191,10 +4191,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4214,10 +4214,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -4237,10 +4237,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4260,10 +4260,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4862,9 +4862,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4878,9 +4878,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4894,9 +4894,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4910,9 +4910,9 @@ define dso_local i64 
@atomicrmw_xor_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4926,9 +4926,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4948,10 +4948,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4971,10 +4971,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4994,10 +4994,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5017,10 +5017,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5040,10 +5040,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5725,10 +5725,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5745,10 +5745,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: 
atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5765,10 +5765,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5785,10 +5785,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5805,10 +5805,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5838,11 +5838,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5872,11 +5872,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5906,11 +5906,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5940,11 +5940,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr 
%ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5974,11 +5974,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6815,10 +6815,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6835,10 +6835,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6855,10 +6855,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6875,10 +6875,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6895,10 +6895,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6928,11 +6928,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6962,11 +6962,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; 
-O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6996,11 +6996,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -7030,11 +7030,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -7064,11 +7064,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -7900,10 +7900,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -7920,10 +7920,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -7940,10 +7940,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -7960,10 +7960,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -7980,10 +7980,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 
%value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -8013,11 +8013,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -8047,11 +8047,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -8081,11 +8081,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -8115,11 +8115,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -8149,11 +8149,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -8980,10 +8980,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -9000,10 +9000,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] 
+; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -9020,10 +9020,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -9040,10 +9040,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -9060,10 +9060,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -9093,11 +9093,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -9127,11 +9127,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -9161,11 +9161,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -9195,11 +9195,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -9229,11 +9229,11 @@ define dso_local i128 
@atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll index 7fc733a13bf07f..3bc81f82427f41 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll @@ -152,10 +152,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: -; -O1: ldp x4, x5, [x0] -; -O1: casp x4, x5, x2, x3, [x0] -; -O1: cmp x5, x7 -; -O1: ccmp x4, x6, #0, eq +; -O1: casp x0, x1, x2, x3, [x8] +; -O1: cmp x1, x5 +; -O1: ccmp x0, x4, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -169,10 +168,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: -; -O1: ldp x4, x5, [x0] -; -O1: caspa x4, x5, x2, x3, [x0] -; -O1: cmp x5, x7 -; -O1: ccmp x4, x6, #0, eq +; -O1: caspa x0, x1, x2, x3, [x8] +; -O1: cmp x1, x5 +; -O1: ccmp x0, x4, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -186,10 +184,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: -; -O1: ldp x4, x5, [x0] -; -O1: caspl x4, x5, x2, x3, [x0] -; -O1: cmp x5, x7 -; -O1: ccmp x4, x6, #0, eq +; -O1: caspl x0, x1, x2, x3, [x8] +; -O1: cmp x1, x5 +; -O1: ccmp x0, x4, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -203,10 +200,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: -; -O1: ldp x4, x5, [x0] -; -O1: caspal x4, x5, x2, x3, [x0] -; -O1: cmp x5, x7 -; -O1: ccmp x4, x6, #0, eq +; -O1: caspal x0, x1, x2, x3, [x8] +; -O1: cmp x1, x5 +; -O1: ccmp x0, x4, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -220,10 +216,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: -; -O1: ldp x4, x5, [x0] -; -O1: caspal x4, x5, x2, x3, [x0] -; -O1: cmp x5, x7 -; -O1: ccmp x4, x6, #0, eq +; -O1: caspal x0, x1, x2, x3, [x8] +; -O1: cmp x1, x5 +; -O1: ccmp x0, x4, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll index 0ea04d18788f68..d7773593365788 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll @@ -156,8 +156,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_monotonic(ptr %ptr, i32 %value) ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_monotonic: -; -O1: ldxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r 
= atomicrmw xchg ptr %ptr, i32 %value monotonic, align 4 ret i32 %r } @@ -170,8 +170,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acquire: -; -O1: ldaxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acquire, align 4 ret i32 %r } @@ -184,8 +184,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_release: -; -O1: ldxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value release, align 4 ret i32 %r } @@ -198,8 +198,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acq_rel: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acq_rel, align 4 ret i32 %r } @@ -212,8 +212,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0: subs w8, w9, w8 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_seq_cst: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value seq_cst, align 4 ret i32 %r } @@ -226,8 +226,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -240,8 +240,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -254,8 +254,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -268,8 +268,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -282,8 +282,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -852,9 +852,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 8 
ret i64 %r } @@ -868,9 +868,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -884,9 +884,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -900,9 +900,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -916,9 +916,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -939,9 +939,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -962,9 +962,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -985,9 +985,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1008,9 +1008,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1031,9 +1031,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: adds x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: adds x9, x8, x2 +; -O1: 
stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1632,9 +1632,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -1648,9 +1648,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -1664,9 +1664,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -1680,9 +1680,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -1696,9 +1696,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -1718,9 +1718,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1740,9 +1740,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1762,9 +1762,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1784,9 +1784,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp 
w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1806,9 +1806,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: subs x9, x0, x2 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: ldaxp x8, x1, [x0] +; -O1: subs x9, x8, x2 +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2402,9 +2402,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2418,9 +2418,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2434,9 +2434,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2450,9 +2450,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2466,9 +2466,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2488,10 +2488,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2511,10 +2511,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2534,10 +2534,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs 
x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -2557,10 +2557,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2580,10 +2580,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: and x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3213,10 +3213,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -3231,10 +3231,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -3249,10 +3249,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -3267,10 +3267,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -3285,10 +3285,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -3310,12 +3310,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and 
x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3337,12 +3337,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3364,12 +3364,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3391,12 +3391,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3418,12 +3418,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x8, x1, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4082,9 +4082,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4098,9 +4098,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4114,9 +4114,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4130,9 +4130,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: 
ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4146,9 +4146,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4168,10 +4168,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4191,10 +4191,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4214,10 +4214,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -4237,10 +4237,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4260,10 +4260,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: orr x9, x1, x3 -; -O1: orr x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: orr x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4862,9 +4862,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4878,9 +4878,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4894,9 +4894,9 @@ define dso_local i64 
@atomicrmw_xor_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4910,9 +4910,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4926,9 +4926,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4948,10 +4948,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4971,10 +4971,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4994,10 +4994,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] +; -O1: ldxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5017,10 +5017,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5040,10 +5040,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] +; -O1: ldaxp x8, x1, [x0] ; -O1: eor x9, x1, x3 -; -O1: eor x10, x0, x2 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: eor x10, x8, x2 +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5725,10 +5725,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp 
x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5745,10 +5745,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5765,10 +5765,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5785,10 +5785,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5805,10 +5805,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5838,11 +5838,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5872,11 +5872,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5906,11 +5906,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5940,11 +5940,11 @@ define dso_local i128 
@atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5974,11 +5974,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lt -; -O1: csel x10, x0, x2, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lt +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6815,10 +6815,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6835,10 +6835,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6855,10 +6855,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6875,10 +6875,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6895,10 +6895,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6928,11 +6928,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: 
stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6962,11 +6962,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6996,11 +6996,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -7030,11 +7030,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -7064,11 +7064,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, ge -; -O1: csel x10, x0, x2, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, ge +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -7900,10 +7900,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -7920,10 +7920,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -7940,10 +7940,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -7960,10 +7960,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: 
atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -7980,10 +7980,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -8013,11 +8013,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -8047,11 +8047,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -8081,11 +8081,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -8115,11 +8115,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -8149,11 +8149,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, lo -; -O1: csel x10, x0, x2, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, lo +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -8980,10 +8980,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; 
-O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -9000,10 +9000,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -9020,10 +9020,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -9040,10 +9040,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -9060,10 +9060,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs x8, x9, x8 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -9093,11 +9093,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -9127,11 +9127,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -9161,11 +9161,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -9195,11 +9195,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: 
atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -9229,11 +9229,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: subs x8, x8, #0 ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp x0, x1, [x8] -; -O1: cmp x2, x0 +; -O1: ldaxp x8, x1, [x0] +; -O1: cmp x2, x8 ; -O1: csel x9, x1, x3, hs -; -O1: csel x10, x0, x2, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: csel x10, x8, x2, hs +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-outline_atomics.ll index 7a2dfd84dcadc4..619316831068c8 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-outline_atomics.ll @@ -232,8 +232,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) { ; -O0: bl __aarch64_cas16_relax ; ; -O1-LABEL: load_atomic_i128_aligned_unordered: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -243,8 +243,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt ; -O0: bl __aarch64_cas16_relax ; ; -O1-LABEL: load_atomic_i128_aligned_unordered_const: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -254,8 +254,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) { ; -O0: bl __aarch64_cas16_relax ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -265,8 +265,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt ; -O0: bl __aarch64_cas16_relax ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -276,8 +276,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) { ; -O0: bl __aarch64_cas16_acq ; ; -O1-LABEL: load_atomic_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -287,8 +287,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) ; -O0: bl __aarch64_cas16_acq ; ; -O1-LABEL: load_atomic_i128_aligned_acquire_const: -; -O1: ldaxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -298,8 +298,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) { ; -O0: bl __aarch64_cas16_acq_rel ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst: -; -O1: 
ldaxp x1, x0, [x8] -; -O1: stlxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stlxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } @@ -309,8 +309,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) ; -O0: bl __aarch64_cas16_acq_rel ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const: -; -O1: ldaxp x1, x0, [x8] -; -O1: stlxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stlxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc.ll index 6703827be14672..23be1549a5469a 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc.ll @@ -236,8 +236,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) { ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_unordered: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -251,8 +251,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_unordered_const: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -266,8 +266,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) { ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -281,8 +281,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -296,8 +296,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) { ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -311,8 +311,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire_const: -; -O1: ldaxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -326,8 +326,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) { ; -O0: stlxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: stlxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stlxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } @@ -341,8 +341,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) ; -O0: stlxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const: -; -O1: ldaxp 
x1, x0, [x8] -; -O1: stlxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stlxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-v8a.ll index 06caa68f35565c..c824a642f64b1a 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-v8a.ll @@ -236,8 +236,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) { ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_unordered: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -251,8 +251,8 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_unordered_const: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr unordered, align 16 ret i128 %r } @@ -266,8 +266,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) { ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -281,8 +281,8 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const: -; -O1: ldxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr monotonic, align 16 ret i128 %r } @@ -296,8 +296,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) { ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -311,8 +311,8 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) ; -O0: stxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_acquire_const: -; -O1: ldaxp x1, x0, [x8] -; -O1: stxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr acquire, align 16 ret i128 %r } @@ -326,8 +326,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) { ; -O0: stlxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: stlxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stlxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } @@ -341,8 +341,8 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) ; -O0: stlxp w8, x1, x0, [x9] ; ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const: -; -O1: ldaxp x1, x0, [x8] -; -O1: stlxp w9, x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] +; -O1: stlxp w9, x1, x8, [x0] %r = load atomic i128, ptr %ptr seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll index 01317e09028c35..69285ecb00723a 100644 --- 
a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll @@ -162,8 +162,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_monotonic(ptr %ptr, i32 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_monotonic: -; -O1: ldxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value monotonic, align 4 ret i32 %r } @@ -177,8 +177,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acquire: -; -O1: ldaxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acquire, align 4 ret i32 %r } @@ -192,8 +192,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_release: -; -O1: ldxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value release, align 4 ret i32 %r } @@ -207,8 +207,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acq_rel: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acq_rel, align 4 ret i32 %r } @@ -222,8 +222,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_seq_cst: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value seq_cst, align 4 ret i32 %r } @@ -237,8 +237,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -252,8 +252,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -267,8 +267,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -282,8 +282,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -297,8 +297,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 
%value seq_cst, align 8 ret i64 %r } @@ -868,9 +868,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -885,9 +885,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -902,9 +902,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -919,9 +919,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -936,9 +936,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -955,9 +955,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -974,9 +974,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -993,9 +993,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1012,9 +1012,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, 
align 16 ret i128 %r } @@ -1031,9 +1031,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1633,9 +1633,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -1650,9 +1650,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -1667,9 +1667,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -1684,9 +1684,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -1701,9 +1701,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -1720,9 +1720,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1739,9 +1739,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1758,9 +1758,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, 
align 16 ret i128 %r } @@ -1777,9 +1777,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1796,9 +1796,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2398,9 +2398,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2415,9 +2415,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2432,9 +2432,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2449,9 +2449,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2466,9 +2466,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2486,10 +2486,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2507,10 +2507,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stxp w11, 
x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2528,10 +2528,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -2549,10 +2549,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2570,10 +2570,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3214,10 +3214,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -3233,10 +3233,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -3252,10 +3252,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -3271,10 +3271,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -3290,10 +3290,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -3313,12 +3313,12 
@@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3338,12 +3338,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3363,12 +3363,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3388,12 +3388,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3413,12 +3413,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4088,9 +4088,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4105,9 +4105,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4122,9 +4122,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value 
release, align 8 ret i64 %r } @@ -4139,9 +4139,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4156,9 +4156,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4176,10 +4176,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4197,10 +4197,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4218,10 +4218,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -4239,10 +4239,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4260,10 +4260,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4873,9 +4873,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4890,9 +4890,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; 
-O1-LABEL: atomicrmw_xor_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4907,9 +4907,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4924,9 +4924,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4941,9 +4941,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4961,10 +4961,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4982,10 +4982,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5003,10 +5003,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5024,10 +5024,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5045,10 +5045,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, 
x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5709,10 +5709,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5728,10 +5728,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5747,10 +5747,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5766,10 +5766,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5785,10 +5785,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5807,11 +5807,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5830,11 +5830,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5853,11 +5853,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel 
x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5876,11 +5876,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5899,11 +5899,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6634,10 +6634,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6653,10 +6653,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6672,10 +6672,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6691,10 +6691,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6710,10 +6710,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6732,11 +6732,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: 
atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6755,11 +6755,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6778,11 +6778,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -6801,11 +6801,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -6824,11 +6824,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -7559,10 +7559,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -7578,10 +7578,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -7597,10 +7597,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value 
release, align 8 ret i64 %r } @@ -7616,10 +7616,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -7635,10 +7635,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -7657,11 +7657,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -7680,11 +7680,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -7703,11 +7703,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -7726,11 +7726,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -7749,11 +7749,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -8484,10 +8484,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; 
-O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -8503,10 +8503,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -8522,10 +8522,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -8541,10 +8541,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -8560,10 +8560,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -8582,11 +8582,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -8605,11 +8605,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -8628,11 +8628,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -8651,11 +8651,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: 
ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -8674,11 +8674,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll index f9c1a2216dc2c1..f0be54d4edbbde 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll @@ -531,9 +531,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -546,9 +546,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -561,9 +561,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -576,9 +576,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -591,9 +591,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1106,9 +1106,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value 
monotonic, align 16 ret i128 %r } @@ -1121,9 +1121,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1136,9 +1136,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1151,9 +1151,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1166,9 +1166,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1722,10 +1722,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1739,10 +1739,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1756,10 +1756,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1773,10 +1773,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1790,10 +1790,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: 
atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2377,10 +2377,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2394,10 +2394,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2411,10 +2411,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2428,10 +2428,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2445,10 +2445,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2464,12 +2464,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2485,12 +2485,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2506,12 +2506,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] 
+; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -2527,12 +2527,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2548,12 +2548,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3097,10 +3097,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3114,10 +3114,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3131,10 +3131,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3148,10 +3148,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3165,10 +3165,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3612,10 +3612,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 
%valu ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3629,10 +3629,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3646,10 +3646,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3663,10 +3663,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3680,10 +3680,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4262,10 +4262,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4279,10 +4279,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4296,10 +4296,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4313,10 +4313,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; 
-O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4330,10 +4330,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4348,11 +4348,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4367,11 +4367,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4386,11 +4386,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -4405,11 +4405,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4424,11 +4424,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5117,10 +5117,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw 
min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5134,10 +5134,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5151,10 +5151,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5168,10 +5168,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5185,10 +5185,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5203,11 +5203,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5222,11 +5222,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5241,11 +5241,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5260,11 +5260,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, 
x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5279,11 +5279,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5972,10 +5972,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5989,10 +5989,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6006,10 +6006,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6023,10 +6023,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6040,10 +6040,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6058,11 +6058,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6077,11 +6077,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: 
ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6096,11 +6096,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -6115,11 +6115,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -6134,11 +6134,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6827,10 +6827,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6844,10 +6844,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6861,10 +6861,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6878,10 +6878,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6895,10 +6895,10 @@ define dso_local i64 
@atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w8, w8, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6913,11 +6913,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6932,11 +6932,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6951,11 +6951,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -6970,11 +6970,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -6989,11 +6989,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll index 1bead6d694c652..02bda4fe2991b2 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll @@ -162,8 +162,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_monotonic(ptr %ptr, i32 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_monotonic: -; -O1: ldxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value monotonic, align 4 ret i32 %r } @@ -177,8 +177,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acquire(ptr %ptr, 
i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acquire: -; -O1: ldaxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acquire, align 4 ret i32 %r } @@ -192,8 +192,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_release: -; -O1: ldxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value release, align 4 ret i32 %r } @@ -207,8 +207,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acq_rel: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acq_rel, align 4 ret i32 %r } @@ -222,8 +222,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_seq_cst: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value seq_cst, align 4 ret i32 %r } @@ -237,8 +237,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -252,8 +252,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -267,8 +267,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -282,8 +282,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -297,8 +297,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -868,9 +868,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -885,9 +885,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: 
atomicrmw_add_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -902,9 +902,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -919,9 +919,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -936,9 +936,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -955,9 +955,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -974,9 +974,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -993,9 +993,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1012,9 +1012,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1031,9 +1031,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1633,9 +1633,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: 
atomicrmw_sub_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -1650,9 +1650,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -1667,9 +1667,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -1684,9 +1684,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -1701,9 +1701,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -1720,9 +1720,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1739,9 +1739,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1758,9 +1758,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1777,9 +1777,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1796,9 +1796,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; 
-O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2398,9 +2398,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2415,9 +2415,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2432,9 +2432,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2449,9 +2449,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2466,9 +2466,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2486,10 +2486,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2507,10 +2507,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2528,10 +2528,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret 
i128 %r } @@ -2549,10 +2549,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2570,10 +2570,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3214,10 +3214,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -3233,10 +3233,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -3252,10 +3252,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -3271,10 +3271,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -3290,10 +3290,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -3313,12 +3313,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3338,12 +3338,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr 
%ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3363,12 +3363,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3388,12 +3388,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3413,12 +3413,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4088,9 +4088,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4105,9 +4105,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4122,9 +4122,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4139,9 +4139,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4156,9 +4156,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; 
-O1-LABEL: atomicrmw_or_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4176,10 +4176,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4197,10 +4197,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4218,10 +4218,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -4239,10 +4239,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4260,10 +4260,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4873,9 +4873,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4890,9 +4890,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4907,9 +4907,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, 
x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4924,9 +4924,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4941,9 +4941,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4961,10 +4961,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4982,10 +4982,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5003,10 +5003,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5024,10 +5024,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5045,10 +5045,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5709,10 +5709,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret 
i64 %r } @@ -5728,10 +5728,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5747,10 +5747,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5766,10 +5766,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5785,10 +5785,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5807,11 +5807,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5830,11 +5830,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5853,11 +5853,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5876,11 +5876,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt 
-; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5899,11 +5899,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6634,10 +6634,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6653,10 +6653,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6672,10 +6672,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6691,10 +6691,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6710,10 +6710,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6732,11 +6732,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6755,11 +6755,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; 
-O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6778,11 +6778,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -6801,11 +6801,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -6824,11 +6824,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -7559,10 +7559,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -7578,10 +7578,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -7597,10 +7597,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -7616,10 +7616,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -7635,10 +7635,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 
%value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -7657,11 +7657,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -7680,11 +7680,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -7703,11 +7703,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -7726,11 +7726,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -7749,11 +7749,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -8484,10 +8484,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -8503,10 +8503,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; 
-O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -8522,10 +8522,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -8541,10 +8541,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -8560,10 +8560,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -8582,11 +8582,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -8605,11 +8605,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -8628,11 +8628,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -8651,11 +8651,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -8674,11 +8674,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp 
x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll index 51d9766f6a8f92..a8a2194ff197aa 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll @@ -162,8 +162,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_monotonic(ptr %ptr, i32 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_monotonic: -; -O1: ldxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value monotonic, align 4 ret i32 %r } @@ -177,8 +177,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acquire: -; -O1: ldaxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acquire, align 4 ret i32 %r } @@ -192,8 +192,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_release: -; -O1: ldxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value release, align 4 ret i32 %r } @@ -207,8 +207,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acq_rel: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acq_rel, align 4 ret i32 %r } @@ -222,8 +222,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_seq_cst: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value seq_cst, align 4 ret i32 %r } @@ -237,8 +237,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -252,8 +252,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -267,8 +267,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -282,8 +282,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acq_rel: -; -O1: 
ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -297,8 +297,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -868,9 +868,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -885,9 +885,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -902,9 +902,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -919,9 +919,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -936,9 +936,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -955,9 +955,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -974,9 +974,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -993,9 +993,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp 
w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1012,9 +1012,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1031,9 +1031,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1633,9 +1633,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -1650,9 +1650,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -1667,9 +1667,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -1684,9 +1684,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -1701,9 +1701,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -1720,9 +1720,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1739,9 +1739,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp 
w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1758,9 +1758,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1777,9 +1777,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1796,9 +1796,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2398,9 +2398,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2415,9 +2415,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2432,9 +2432,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2449,9 +2449,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2466,9 +2466,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2486,10 +2486,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, 
x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2507,10 +2507,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2528,10 +2528,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -2549,10 +2549,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2570,10 +2570,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3214,10 +3214,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -3233,10 +3233,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -3252,10 +3252,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -3271,10 +3271,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 
%value acq_rel, align 8 ret i64 %r } @@ -3290,10 +3290,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -3313,12 +3313,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3338,12 +3338,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3363,12 +3363,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3388,12 +3388,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3413,12 +3413,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4088,9 +4088,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4105,9 +4105,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; 
-O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4122,9 +4122,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4139,9 +4139,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4156,9 +4156,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4176,10 +4176,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4197,10 +4197,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4218,10 +4218,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -4239,10 +4239,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4260,10 +4260,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4873,9 +4873,9 @@ define dso_local i64 
@atomicrmw_xor_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4890,9 +4890,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4907,9 +4907,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4924,9 +4924,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4941,9 +4941,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4961,10 +4961,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4982,10 +4982,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5003,10 +5003,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5024,10 +5024,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor 
x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5045,10 +5045,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5709,10 +5709,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5728,10 +5728,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5747,10 +5747,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5766,10 +5766,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5785,10 +5785,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5807,11 +5807,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5830,11 +5830,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: 
csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5853,11 +5853,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5876,11 +5876,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5899,11 +5899,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6634,10 +6634,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6653,10 +6653,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6672,10 +6672,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6691,10 +6691,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6710,10 +6710,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; 
-O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6732,11 +6732,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6755,11 +6755,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6778,11 +6778,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -6801,11 +6801,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -6824,11 +6824,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -7559,10 +7559,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -7578,10 +7578,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr 
%ptr, i64 %value acquire, align 8 ret i64 %r } @@ -7597,10 +7597,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -7616,10 +7616,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -7635,10 +7635,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -7657,11 +7657,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -7680,11 +7680,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -7703,11 +7703,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -7726,11 +7726,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -7749,11 +7749,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: 
cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -8484,10 +8484,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -8503,10 +8503,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -8522,10 +8522,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -8541,10 +8541,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -8560,10 +8560,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -8582,11 +8582,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -8605,11 +8605,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -8628,11 +8628,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, 
eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -8651,11 +8651,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -8674,11 +8674,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8_1a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8_1a.ll index 0c3ed9b0f1de0f..07965bc7df986d 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8_1a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8_1a.ll @@ -150,10 +150,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: -; -O1: ldp x4, x5, [x0] -; -O1: casp x4, x5, x2, x3, [x0] -; -O1: cmp x4, x6 -; -O1: ccmp x5, x7, #0, eq +; -O1: casp x0, x1, x2, x3, [x8] +; -O1: cmp x0, x4 +; -O1: ccmp x1, x5, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -165,10 +164,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: -; -O1: ldp x4, x5, [x0] -; -O1: caspa x4, x5, x2, x3, [x0] -; -O1: cmp x4, x6 -; -O1: ccmp x5, x7, #0, eq +; -O1: caspa x0, x1, x2, x3, [x8] +; -O1: cmp x0, x4 +; -O1: ccmp x1, x5, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -180,10 +178,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: -; -O1: ldp x4, x5, [x0] -; -O1: caspl x4, x5, x2, x3, [x0] -; -O1: cmp x4, x6 -; -O1: ccmp x5, x7, #0, eq +; -O1: caspl x0, x1, x2, x3, [x8] +; -O1: cmp x0, x4 +; -O1: ccmp x1, x5, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -195,10 +192,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: -; -O1: ldp x4, x5, [x0] -; -O1: caspal x4, x5, x2, x3, [x0] -; -O1: cmp x4, x6 -; -O1: ccmp x5, x7, #0, eq +; -O1: caspal x0, x1, x2, x3, [x8] +; -O1: cmp x0, x4 +; -O1: ccmp x1, x5, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -210,10 +206,9 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x8, x10, #0, eq 
; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: -; -O1: ldp x4, x5, [x0] -; -O1: caspal x4, x5, x2, x3, [x0] -; -O1: cmp x4, x6 -; -O1: ccmp x5, x7, #0, eq +; -O1: caspal x0, x1, x2, x3, [x8] +; -O1: cmp x0, x4 +; -O1: ccmp x1, x5, #0, eq %r = atomicrmw xchg ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll index a58e5a987bb4c9..9749d85a248ba3 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll @@ -162,8 +162,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_monotonic(ptr %ptr, i32 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_monotonic: -; -O1: ldxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value monotonic, align 4 ret i32 %r } @@ -177,8 +177,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acquire(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acquire: -; -O1: ldaxr w0, [x8] -; -O1: stxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acquire, align 4 ret i32 %r } @@ -192,8 +192,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_release(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_release: -; -O1: ldxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value release, align 4 ret i32 %r } @@ -207,8 +207,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_acq_rel(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_acq_rel: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value acq_rel, align 4 ret i32 %r } @@ -222,8 +222,8 @@ define dso_local i32 @atomicrmw_xchg_i32_aligned_seq_cst(ptr %ptr, i32 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i32_aligned_seq_cst: -; -O1: ldaxr w0, [x8] -; -O1: stlxr w9, w1, [x8] +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w1, [x0] %r = atomicrmw xchg ptr %ptr, i32 %value seq_cst, align 4 ret i32 %r } @@ -237,8 +237,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -252,8 +252,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: stxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -267,8 +267,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -282,8 +282,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: 
atomicrmw_xchg_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -297,8 +297,8 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xchg_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: stlxr w9, x1, [x8] +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x1, [x0] %r = atomicrmw xchg ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -868,9 +868,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -885,9 +885,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -902,9 +902,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -919,9 +919,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -936,9 +936,9 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_add_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: add x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: add x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -955,9 +955,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -974,9 +974,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -993,9 +993,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; 
-O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1012,9 +1012,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1031,9 +1031,9 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: adds x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -1633,9 +1633,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -1650,9 +1650,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -1667,9 +1667,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -1684,9 +1684,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -1701,9 +1701,9 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_sub_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: sub x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: sub x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -1720,9 +1720,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -1739,9 +1739,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 
-; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -1758,9 +1758,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -1777,9 +1777,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -1796,9 +1796,9 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: subs x9, x1, x3 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -2398,9 +2398,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -2415,9 +2415,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -2432,9 +2432,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -2449,9 +2449,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -2466,9 +2466,9 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -2486,10 +2486,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] 
+; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -2507,10 +2507,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -2528,10 +2528,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -2549,10 +2549,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -2570,10 +2570,10 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: and x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: and x9, x8, x2 ; -O1: and x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -3214,10 +3214,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -3233,10 +3233,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stxr w10, x9, [x8] +; -O1: stxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -3252,10 +3252,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -3271,10 +3271,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, 
[x0] %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -3290,10 +3290,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: and x9, x0, x1 +; -O1: ldaxr x8, [x0] +; -O1: and x9, x8, x1 ; -O1: mvn x9, x9 -; -O1: stlxr w10, x9, [x8] +; -O1: stlxr w10, x9, [x0] %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -3313,12 +3313,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -3338,12 +3338,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stxp w11, x9, x10, [x8] +; -O1: stxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -3363,12 +3363,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -3388,12 +3388,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -3413,12 +3413,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: and x9, x1, x3 -; -O1: and x10, x0, x2 +; -O1: and x10, x8, x2 ; -O1: mvn x10, x10 ; -O1: mvn x9, x9 -; -O1: stlxp w11, x9, x10, [x8] +; -O1: stlxp w11, x9, x10, [x0] %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4088,9 +4088,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_monotonic(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4105,9 +4105,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stxr 
w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4122,9 +4122,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4139,9 +4139,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4156,9 +4156,9 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_or_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: orr x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: orr x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4176,10 +4176,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4197,10 +4197,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -4218,10 +4218,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -4239,10 +4239,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -4260,10 +4260,10 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: orr x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: orr x9, x8, x2 ; -O1: orr x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -4873,9 
+4873,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -4890,9 +4890,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -4907,9 +4907,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -4924,9 +4924,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -4941,9 +4941,9 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_xor_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: eor x9, x0, x1 -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: eor x9, x8, x1 +; -O1: stlxr w10, x9, [x0] %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -4961,10 +4961,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -4982,10 +4982,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5003,10 +5003,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5024,10 +5024,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; 
-O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5045,10 +5045,10 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] -; -O1: eor x9, x0, x2 +; -O1: ldaxp x1, x8, [x0] +; -O1: eor x9, x8, x2 ; -O1: eor x10, x1, x3 -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -5709,10 +5709,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -5728,10 +5728,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -5747,10 +5747,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -5766,10 +5766,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -5785,10 +5785,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, gt -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, gt +; -O1: stlxr w10, x9, [x0] %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -5807,11 +5807,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -5830,11 +5830,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, 
[x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -5853,11 +5853,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -5876,11 +5876,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -5899,11 +5899,11 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lt +; -O1: csel x9, x8, x2, lt ; -O1: csel x10, x1, x3, lt -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -6634,10 +6634,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -6653,10 +6653,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -6672,10 +6672,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -6691,10 +6691,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -6710,10 +6710,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { 
; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, le -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, le +; -O1: stlxr w10, x9, [x0] %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -6732,11 +6732,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -6755,11 +6755,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -6778,11 +6778,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -6801,11 +6801,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -6824,11 +6824,11 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, ge +; -O1: csel x9, x8, x2, ge ; -O1: csel x10, x1, x3, ge -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -7559,10 +7559,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -7578,10 +7578,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stxr w10, x9, [x0] 
%r = atomicrmw umax ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -7597,10 +7597,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -7616,10 +7616,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -7635,10 +7635,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, hi -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, hi +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -7657,11 +7657,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -7680,11 +7680,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -7703,11 +7703,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -7726,11 +7726,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -7749,11 +7749,11 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: 
ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, lo +; -O1: csel x9, x8, x2, lo ; -O1: csel x10, x1, x3, lo -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } @@ -8484,10 +8484,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 8 ret i64 %r } @@ -8503,10 +8503,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 8 ret i64 %r } @@ -8522,10 +8522,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_release: -; -O1: ldxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value release, align 8 ret i64 %r } @@ -8541,10 +8541,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 8 ret i64 %r } @@ -8560,10 +8560,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { ; -O0: subs w9, w9, #1 ; ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst: -; -O1: ldaxr x0, [x8] -; -O1: cmp x0, x1 -; -O1: csel x9, x0, x1, ls -; -O1: stlxr w10, x9, [x8] +; -O1: ldaxr x8, [x0] +; -O1: cmp x8, x1 +; -O1: csel x9, x8, x1, ls +; -O1: stlxr w10, x9, [x0] %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 8 ret i64 %r } @@ -8582,11 +8582,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 16 ret i128 %r } @@ -8605,11 +8605,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stxp w11, x10, x9, [x8] +; -O1: stxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 16 ret i128 %r } @@ -8628,11 +8628,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value 
; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: -; -O1: ldxp x1, x0, [x8] +; -O1: ldxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value release, align 16 ret i128 %r } @@ -8651,11 +8651,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 16 ret i128 %r } @@ -8674,11 +8674,11 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value ; -O0: ccmp x10, x11, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O1: ldaxp x1, x0, [x8] +; -O1: ldaxp x1, x8, [x0] ; -O1: cmp x3, x1 -; -O1: csel x9, x0, x2, hs +; -O1: csel x9, x8, x2, hs ; -O1: csel x10, x1, x3, hs -; -O1: stlxp w11, x10, x9, [x8] +; -O1: stlxp w11, x10, x9, [x0] %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 16 ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll index 6d27e4f4d603bd..83ae2c77b3476b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll @@ -372,7 +372,7 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, ptr ; CHECK-LABEL: params_and_return_in_reg ; Store callee saved registers. -; CHECK: stp x28, x0, [sp, #16 +; CHECK: str x28, [sp, #16 ; CHECK: stp x27, x26, [sp ; CHECK: stp x25, x24, [sp ; CHECK: stp x23, x22, [sp @@ -398,8 +398,8 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, ptr ; CHECK: mov x21, xzr ; CHECK: bl _params_in_reg2 ; Store swifterror %error_ptr_ref. -; CHECK: ldr x0, [sp, #24] -; CHECK: stp {{x[0-9]+}}, x21, [sp] +; CHECK: ldr x8, [sp, #8] +; CHECK: str x21, [sp, #24] ; Setup call arguments from original arguments. ; CHECK: mov x1, x20 ; CHECK: mov x2, x22 @@ -410,17 +410,17 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, ptr ; CHECK: mov x7, x27 ; CHECK: mov x21, x28 ; CHECK: bl _params_and_return_in_reg2 -; CHECK: mov x19, x21 -; CHECK: ldr x21, [sp, #8 +; CHECK: mov x28, x21 +; CHECK: ldr x21, [sp, #24 ; Store return values. -; CHECK: mov x20, x0 -; CHECK: mov x22, x1 -; CHECK: mov x23, x2 -; CHECK: mov x24, x3 -; CHECK: mov x25, x4 -; CHECK: mov x26, x5 -; CHECK: mov x27, x6 -; CHECK: mov x28, x7 +; CHECK: mov x19, x0 +; CHECK: mov x20, x1 +; CHECK: mov x22, x2 +; CHECK: mov x23, x3 +; CHECK: mov x24, x4 +; CHECK: mov x25, x5 +; CHECK: mov x26, x6 +; CHECK: mov x27, x7 ; Setup call. ; CHECK: mov w0, #1 ; CHECK: mov w1, #2 @@ -433,15 +433,15 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, ptr ; CHECK: str xzr, [sp] ; CHECK: bl _params_in_reg2 ; Restore return values for return from this function. 
-; CHECK: mov x0, x20 -; CHECK: mov x1, x22 -; CHECK: mov x2, x23 -; CHECK: mov x3, x24 -; CHECK: mov x4, x25 -; CHECK: mov x5, x26 -; CHECK: mov x6, x27 -; CHECK: mov x21, x19 -; CHECK: mov x7, x28 +; CHECK: mov x0, x19 +; CHECK: mov x1, x20 +; CHECK: mov x2, x22 +; CHECK: mov x3, x23 +; CHECK: mov x4, x24 +; CHECK: mov x5, x25 +; CHECK: mov x6, x26 +; CHECK: mov x7, x27 +; CHECK: mov x21, x28 ; CHECK: ldp x29, x30, [sp, #96] ; 16-byte Folded Reload ; CHECK: ldr x28, [sp, #16] ; 8-byte Folded Reload ; CHECK: ldp x20, x19, [sp, #80] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll index a65c5d66677946..d80003f446b38d 100644 --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -218,72 +218,72 @@ define @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: index z1.d, #0, #1 -; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: mov z2.d, x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z3.d, x1 -; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z7.d, x1 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: uqadd z5.d, z0.d, z2.d +; CHECK-NEXT: incd z0.d, all, mul #8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #4 +; CHECK-NEXT: cmphi p1.d, p0/z, z7.d, z5.d +; CHECK-NEXT: uqadd z0.d, z0.d, z2.d ; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: uqadd z25.d, z1.d, z0.d +; CHECK-NEXT: uqadd z24.d, z1.d, z2.d +; CHECK-NEXT: mov z25.d, z1.d +; CHECK-NEXT: mov z27.d, z3.d +; CHECK-NEXT: uqadd z26.d, z3.d, z2.d +; CHECK-NEXT: uqadd z28.d, z6.d, z2.d ; CHECK-NEXT: incd z1.d, all, mul #8 -; CHECK-NEXT: incd z2.d +; CHECK-NEXT: incd z3.d, all, mul #8 +; CHECK-NEXT: incd z6.d, all, mul #8 ; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: incd z6.d, all, mul #4 -; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z25.d -; CHECK-NEXT: uqadd z1.d, z1.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: uqadd z26.d, z2.d, z0.d -; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: incd z25.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z7.d, z24.d +; CHECK-NEXT: incd z27.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, p0/z, z7.d, z26.d +; CHECK-NEXT: cmphi p5.d, p0/z, z7.d, z28.d +; CHECK-NEXT: uqadd z1.d, z1.d, z2.d +; CHECK-NEXT: uqadd z3.d, z3.d, z2.d ; CHECK-NEXT: mov z24.d, z4.d -; CHECK-NEXT: uqadd z27.d, z4.d, z0.d -; CHECK-NEXT: uqadd z28.d, z6.d, z0.d -; CHECK-NEXT: incd z2.d, all, mul #8 +; CHECK-NEXT: uqadd z5.d, z4.d, z2.d +; CHECK-NEXT: uqadd z26.d, z25.d, z2.d ; CHECK-NEXT: incd z4.d, all, mul #8 -; CHECK-NEXT: incd z6.d, all, mul #8 -; CHECK-NEXT: incd z5.d, all, mul #2 -; CHECK-NEXT: incd z7.d, all, mul #4 -; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z26.d -; CHECK-NEXT: incd z24.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z27.d -; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z28.d -; CHECK-NEXT: uqadd z2.d, z2.d, z0.d -; CHECK-NEXT: uqadd z4.d, z4.d, z0.d -; CHECK-NEXT: uqadd z6.d, z6.d, z0.d -; CHECK-NEXT: mov z26.d, z5.d -; CHECK-NEXT: uqadd z25.d, z5.d, z0.d -; CHECK-NEXT: uqadd z27.d, z7.d, z0.d -; CHECK-NEXT: incd z5.d, all, mul #8 -; CHECK-NEXT: incd z7.d, all, mul #8 +; CHECK-NEXT: incd z25.d, all, mul #8 ; 
CHECK-NEXT: uzp1 p1.s, p1.s, p2.s -; CHECK-NEXT: incd z26.d, all, mul #4 -; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z2.d -; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z25.d -; CHECK-NEXT: uqadd z25.d, z24.d, z0.d +; CHECK-NEXT: incd z24.d, all, mul #4 +; CHECK-NEXT: cmphi p8.d, p0/z, z7.d, z1.d +; CHECK-NEXT: cmphi p4.d, p0/z, z7.d, z5.d +; CHECK-NEXT: uqadd z5.d, z27.d, z2.d +; CHECK-NEXT: incd z27.d, all, mul #8 +; CHECK-NEXT: uqadd z4.d, z4.d, z2.d +; CHECK-NEXT: cmphi p6.d, p0/z, z7.d, z26.d +; CHECK-NEXT: uqadd z28.d, z24.d, z2.d ; CHECK-NEXT: incd z24.d, all, mul #8 -; CHECK-NEXT: uqadd z5.d, z5.d, z0.d -; CHECK-NEXT: uqadd z7.d, z7.d, z0.d -; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z27.d -; CHECK-NEXT: uqadd z28.d, z26.d, z0.d -; CHECK-NEXT: incd z26.d, all, mul #8 ; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s -; CHECK-NEXT: uqadd z24.d, z24.d, z0.d -; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z25.d -; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z1.d +; CHECK-NEXT: cmphi p7.d, p0/z, z7.d, z5.d +; CHECK-NEXT: uqadd z5.d, z6.d, z2.d +; CHECK-NEXT: uqadd z6.d, z25.d, z2.d +; CHECK-NEXT: uqadd z25.d, z27.d, z2.d +; CHECK-NEXT: cmphi p4.d, p0/z, z7.d, z0.d ; CHECK-NEXT: uzp1 p5.s, p5.s, p6.s -; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z4.d -; CHECK-NEXT: cmphi p9.d, p0/z, z3.d, z5.d -; CHECK-NEXT: cmphi p10.d, p0/z, z3.d, z7.d -; CHECK-NEXT: uqadd z0.d, z26.d, z0.d -; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z28.d +; CHECK-NEXT: cmphi p6.d, p0/z, z7.d, z3.d +; CHECK-NEXT: cmphi p9.d, p0/z, z7.d, z4.d +; CHECK-NEXT: uqadd z2.d, z24.d, z2.d +; CHECK-NEXT: cmphi p2.d, p0/z, z7.d, z28.d +; CHECK-NEXT: cmphi p10.d, p0/z, z7.d, z6.d ; CHECK-NEXT: uzp1 p4.s, p4.s, p8.s -; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z24.d +; CHECK-NEXT: cmphi p8.d, p0/z, z7.d, z25.d ; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h ; CHECK-NEXT: uzp1 p2.s, p7.s, p2.s -; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z6.d -; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z0.d +; CHECK-NEXT: cmphi p7.d, p0/z, z7.d, z5.d +; CHECK-NEXT: cmphi p0.d, p0/z, z7.d, z2.d +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h ; CHECK-NEXT: uzp1 p7.s, p7.s, p10.s ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: uzp1 p0.s, p8.s, p0.s diff --git a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll index 37c61d0a4a0fb6..a4c223fb13691e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll +++ b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -692,30 +692,30 @@ define void @fetch_and_umax(ptr %p, i128 %bits) { define i128 @atomic_load_seq_cst(ptr %p) { ; NOOUTLINE-LABEL: atomic_load_seq_cst: ; NOOUTLINE: // %bb.0: -; NOOUTLINE-NEXT: mov x8, x0 ; NOOUTLINE-NEXT: .LBB12_1: // %atomicrmw.start ; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 -; NOOUTLINE-NEXT: ldaxp x0, x1, [x8] -; NOOUTLINE-NEXT: stlxp w9, x0, x1, [x8] +; NOOUTLINE-NEXT: ldaxp x8, x1, [x0] +; NOOUTLINE-NEXT: stlxp w9, x8, x1, [x0] ; NOOUTLINE-NEXT: cbnz w9, .LBB12_1 ; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: mov x0, x8 ; NOOUTLINE-NEXT: ret ; ; OUTLINE-LABEL: atomic_load_seq_cst: ; OUTLINE: // %bb.0: -; OUTLINE-NEXT: mov x8, x0 ; OUTLINE-NEXT: .LBB12_1: // %atomicrmw.start ; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 -; OUTLINE-NEXT: ldaxp x0, x1, [x8] -; OUTLINE-NEXT: stlxp w9, x0, x1, [x8] +; OUTLINE-NEXT: ldaxp x8, x1, [x0] +; OUTLINE-NEXT: stlxp w9, x8, x1, [x0] ; OUTLINE-NEXT: cbnz w9, .LBB12_1 ; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: mov 
x0, x8 ; OUTLINE-NEXT: ret ; ; LSE-LABEL: atomic_load_seq_cst: ; LSE: // %bb.0: -; LSE-NEXT: mov x2, #0 -; LSE-NEXT: mov x3, #0 +; LSE-NEXT: mov x2, #0 // =0x0 +; LSE-NEXT: mov x3, #0 // =0x0 ; LSE-NEXT: caspal x2, x3, x2, x3, [x0] ; LSE-NEXT: mov x0, x2 ; LSE-NEXT: mov x1, x3 @@ -747,8 +747,8 @@ define i128 @atomic_load_relaxed(i64, i64, ptr %p) { ; ; LSE-LABEL: atomic_load_relaxed: ; LSE: // %bb.0: -; LSE-NEXT: mov x0, #0 -; LSE-NEXT: mov x1, #0 +; LSE-NEXT: mov x0, #0 // =0x0 +; LSE-NEXT: mov x1, #0 // =0x0 ; LSE-NEXT: casp x0, x1, x0, x1, [x2] ; LSE-NEXT: ret %r = load atomic i128, ptr %p monotonic, align 16 diff --git a/llvm/test/CodeGen/AArch64/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/arm64-atomic.ll index 739fc8bbcaf072..e8d9c7e18df69e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/arm64-atomic.ll @@ -23,18 +23,17 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) #0 { define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) #0 { ; OUTLINE-ATOMICS: bl __aarch64_cas4_acq ; CHECK-LABEL: val_compare_and_swap_from_load: +; CHECK-NEXT: mov x[[RESULT:[0-9]+]], x0 ; CHECK-NEXT: ldr [[NEW:w[0-9]+]], [x2] ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: -; CHECK-NEXT: ldaxr w[[RESULT:[0-9]+]], [x0] -; CHECK-NEXT: cmp w[[RESULT]], w1 +; CHECK-NEXT: ldaxr w0, [x[[RESULT]]] +; CHECK-NEXT: cmp w0, w1 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] -; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0] +; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x[[RESULT]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: mov x0, x[[RESULT]] ; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: mov x0, x[[RESULT]] ; CHECK-NEXT: ret %new = load i32, ptr %pnew %pair = cmpxchg ptr %p, i32 %cmp, i32 %new acquire acquire @@ -96,14 +95,13 @@ define i32 @fetch_and_nand(ptr %p) #0 { define i64 @fetch_and_nand_64(ptr %p) #0 { ; CHECK-LABEL: fetch_and_nand_64: -; CHECK: mov x[[ADDR:[0-9]+]], x0 ; CHECK: [[TRYBB:.?LBB[0-9_]+]]: -; CHECK: ldaxr x[[DEST_REG:[0-9]+]], [x[[ADDR]]] +; CHECK: ldaxr x[[DEST_REG:[0-9]+]], [x0] ; CHECK: mvn w[[TMP_REG:[0-9]+]], w[[DEST_REG]] -; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], x[[TMP_REG]], #0xfffffffffffffff8 -; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] +; CHECK: orr x[[TMP_REG]], x[[TMP_REG]], #0xfffffffffffffff8 +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], x[[TMP_REG]], [x0] ; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] - +; CHECK: mov x0, x[[DEST_REG]] %val = atomicrmw nand ptr %p, i64 7 acq_rel ret i64 %val } @@ -126,12 +124,12 @@ define i32 @fetch_and_or(ptr %p) #0 { define i64 @fetch_and_or_64(ptr %p) #0 { ; OUTLINE-ATOMICS: bl __aarch64_ldset8_relax ; CHECK: fetch_and_or_64: -; CHECK: mov x[[ADDR:[0-9]+]], x0 ; CHECK: [[TRYBB:.?LBB[0-9_]+]]: -; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] +; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x0] ; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7 -; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] ; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK: mov x0, [[DEST_REG]] %val = atomicrmw or ptr %p, i64 7 monotonic ret i64 %val } diff --git a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll index 2816f91df44ddc..3b0f9572c04acb 100644 --- a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll +++ b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll @@ -12,7 +12,6 @@ 
; YAML: - INST_add: '2' ; YAML: - INST_b.: '1' ; YAML: - INST_ldr: '1' -; YAML: - INST_orr: '1' ; YAML: - INST_sub: '1' ; YAML: - INST_subs: '1' @@ -27,20 +26,19 @@ define i32 @foo(ptr %ptr, i32 %x, i64 %y) !dbg !3 { ; CHECK-LABEL: foo: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: add w0, w9, w1 -; CHECK-NEXT: add x9, x0, x2 +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: add w8, w8, w1 +; CHECK-NEXT: add x9, x8, x2 ; CHECK-NEXT: sub x9, x9, #244, lsl #12 ; =999424 ; CHECK-NEXT: cmp x9, #575 ; CHECK-NEXT: b.eq LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %else -; CHECK-NEXT: mul w9, w0, w1 -; CHECK-NEXT: mul w0, w9, w1 +; CHECK-NEXT: mul w8, w8, w1 ; CHECK-NEXT: mov w9, #10 ; =0xa -; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: str w9, [x0] +; CHECK-NEXT: mul w8, w8, w1 ; CHECK-NEXT: LBB0_2: ; %common.ret -; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret entry: %l = load i32, ptr %ptr, !dbg !4 diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll index 7039cccdf9393c..68b98cad582c83 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll @@ -1482,16 +1482,16 @@ define <2 x float> @frem2f32(<2 x float> %A, <2 x float> %B) { ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-NEXT: stp q0, q1, [sp, #16] // 32-byte Folded Spill ; CHECK-NEXT: mov s0, v0.s[1] ; CHECK-NEXT: mov s1, v1.s[1] ; CHECK-NEXT: bl fmodf -; CHECK-NEXT: str d0, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-NEXT: bl fmodf -; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.s[1], v1.s[0] @@ -1565,16 +1565,16 @@ define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) { ; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-NEXT: stp q0, q1, [sp, #16] // 32-byte Folded Spill ; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: bl fmod -; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-NEXT: bl fmod -; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], v1.d[0] diff --git a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll index 5806bcf0dacf16..94d6d7ecab842f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ 
b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -1058,16 +1058,16 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; ENABLE-NEXT: .cfi_offset w26, -80 ; ENABLE-NEXT: .cfi_offset w27, -88 ; ENABLE-NEXT: .cfi_offset w28, -96 -; ENABLE-NEXT: lsl w8, w1, w0 -; ENABLE-NEXT: lsr w9, w0, w1 -; ENABLE-NEXT: lsl w14, w0, w1 -; ENABLE-NEXT: lsr w11, w1, w0 +; ENABLE-NEXT: lsl w11, w1, w0 +; ENABLE-NEXT: lsr w12, w0, w1 +; ENABLE-NEXT: lsl w17, w0, w1 +; ENABLE-NEXT: lsr w14, w1, w0 ; ENABLE-NEXT: add w15, w1, w0 -; ENABLE-NEXT: sub w10, w8, w9 -; ENABLE-NEXT: subs w17, w1, w0 -; ENABLE-NEXT: add w16, w14, w8 -; ENABLE-NEXT: add w12, w9, w11 -; ENABLE-NEXT: add w13, w11, w15 +; ENABLE-NEXT: sub w8, w11, w12 +; ENABLE-NEXT: subs w16, w1, w0 +; ENABLE-NEXT: add w13, w17, w11 +; ENABLE-NEXT: add w9, w12, w14 +; ENABLE-NEXT: add w10, w14, w15 ; ENABLE-NEXT: b.le LBB14_2 ; ENABLE-NEXT: ; %bb.1: ; %true ; ENABLE-NEXT: str w0, [sp] @@ -1075,15 +1075,15 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; ENABLE-NEXT: nop ; ENABLE-NEXT: ; InlineAsm End ; ENABLE-NEXT: LBB14_2: ; %false -; ENABLE-NEXT: str w14, [x2] -; ENABLE-NEXT: str w8, [x3] -; ENABLE-NEXT: str w9, [x4] -; ENABLE-NEXT: str w11, [x5] +; ENABLE-NEXT: str w17, [x2] +; ENABLE-NEXT: str w11, [x3] +; ENABLE-NEXT: str w12, [x4] +; ENABLE-NEXT: str w14, [x5] ; ENABLE-NEXT: str w15, [x6] -; ENABLE-NEXT: str w17, [x7] +; ENABLE-NEXT: str w16, [x7] ; ENABLE-NEXT: stp w0, w1, [x2, #4] -; ENABLE-NEXT: stp w16, w10, [x2, #12] -; ENABLE-NEXT: stp w12, w13, [x2, #20] +; ENABLE-NEXT: stp w13, w8, [x2, #12] +; ENABLE-NEXT: stp w9, w10, [x2, #20] ; ENABLE-NEXT: sub sp, x29, #80 ; ENABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload ; ENABLE-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload @@ -1117,16 +1117,16 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; DISABLE-NEXT: .cfi_offset w26, -80 ; DISABLE-NEXT: .cfi_offset w27, -88 ; DISABLE-NEXT: .cfi_offset w28, -96 -; DISABLE-NEXT: lsl w8, w1, w0 -; DISABLE-NEXT: lsr w9, w0, w1 -; DISABLE-NEXT: lsl w14, w0, w1 -; DISABLE-NEXT: lsr w11, w1, w0 +; DISABLE-NEXT: lsl w11, w1, w0 +; DISABLE-NEXT: lsr w12, w0, w1 +; DISABLE-NEXT: lsl w17, w0, w1 +; DISABLE-NEXT: lsr w14, w1, w0 ; DISABLE-NEXT: add w15, w1, w0 -; DISABLE-NEXT: sub w10, w8, w9 -; DISABLE-NEXT: subs w17, w1, w0 -; DISABLE-NEXT: add w16, w14, w8 -; DISABLE-NEXT: add w12, w9, w11 -; DISABLE-NEXT: add w13, w11, w15 +; DISABLE-NEXT: sub w8, w11, w12 +; DISABLE-NEXT: subs w16, w1, w0 +; DISABLE-NEXT: add w13, w17, w11 +; DISABLE-NEXT: add w9, w12, w14 +; DISABLE-NEXT: add w10, w14, w15 ; DISABLE-NEXT: b.le LBB14_2 ; DISABLE-NEXT: ; %bb.1: ; %true ; DISABLE-NEXT: str w0, [sp] @@ -1134,15 +1134,15 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; DISABLE-NEXT: nop ; DISABLE-NEXT: ; InlineAsm End ; DISABLE-NEXT: LBB14_2: ; %false -; DISABLE-NEXT: str w14, [x2] -; DISABLE-NEXT: str w8, [x3] -; DISABLE-NEXT: str w9, [x4] -; DISABLE-NEXT: str w11, [x5] +; DISABLE-NEXT: str w17, [x2] +; DISABLE-NEXT: str w11, [x3] +; DISABLE-NEXT: str w12, [x4] +; DISABLE-NEXT: str w14, [x5] ; DISABLE-NEXT: str w15, [x6] -; DISABLE-NEXT: str w17, [x7] +; DISABLE-NEXT: str w16, [x7] ; DISABLE-NEXT: stp w0, w1, [x2, #4] -; DISABLE-NEXT: stp w16, w10, [x2, #12] -; DISABLE-NEXT: stp w12, w13, [x2, #20] +; DISABLE-NEXT: stp w13, w8, [x2, #12] +; DISABLE-NEXT: stp w9, w10, [x2, #20] ; DISABLE-NEXT: sub sp, x29, #80 ; 
DISABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload ; DISABLE-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll index fdb14606d463b7..89dfd7f2992c04 100644 --- a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll +++ b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll @@ -136,16 +136,16 @@ define dso_local i32 @test_atomic_load_sub_i32(i32 %offset) nounwind { define dso_local i64 @test_atomic_load_sub_i64(i64 %offset) nounwind { ; CHECK-LABEL: test_atomic_load_sub_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, x0 ; CHECK-NEXT: adrp x9, var64 ; CHECK-NEXT: add x9, x9, :lo12:var64 ; CHECK-NEXT: .LBB7_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxr x0, [x9] -; CHECK-NEXT: sub x10, x0, x8 +; CHECK-NEXT: ldaxr x8, [x9] +; CHECK-NEXT: sub x10, x8, x0 ; CHECK-NEXT: stlxr w11, x10, [x9] ; CHECK-NEXT: cbnz w11, .LBB7_1 ; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: dmb ish ; CHECK-NEXT: ret %old = atomicrmw sub ptr @var64, i64 %offset seq_cst @@ -411,16 +411,15 @@ define dso_local i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind { define dso_local i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind { ; CHECK-LABEL: test_atomic_load_xchg_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: adrp x9, var32 ; CHECK-NEXT: add x9, x9, :lo12:var32 ; CHECK-NEXT: .LBB22_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldxr w0, [x9] -; CHECK-NEXT: stlxr w10, w8, [x9] +; CHECK-NEXT: ldxr w8, [x9] +; CHECK-NEXT: stlxr w10, w0, [x9] ; CHECK-NEXT: cbnz w10, .LBB22_1 ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %old = atomicrmw xchg ptr @var32, i32 %offset release ret i32 %old @@ -506,17 +505,17 @@ define dso_local i32 @test_atomic_load_min_i32(i32 %offset) nounwind { define dso_local i64 @test_atomic_load_min_i64(i64 %offset) nounwind { ; CHECK-LABEL: test_atomic_load_min_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, x0 ; CHECK-NEXT: adrp x9, var64 ; CHECK-NEXT: add x9, x9, :lo12:var64 ; CHECK-NEXT: .LBB27_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxr x0, [x9] -; CHECK-NEXT: cmp x0, x8 -; CHECK-NEXT: csel x10, x0, x8, le +; CHECK-NEXT: ldaxr x8, [x9] +; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: csel x10, x8, x0, le ; CHECK-NEXT: stlxr w11, x10, [x9] ; CHECK-NEXT: cbnz w11, .LBB27_1 ; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: dmb ish ; CHECK-NEXT: ret %old = atomicrmw min ptr @var64, i64 %offset seq_cst diff --git a/llvm/test/CodeGen/AArch64/atomic-ops.ll b/llvm/test/CodeGen/AArch64/atomic-ops.ll index 679065529090f0..4d98c14202b1c2 100644 --- a/llvm/test/CodeGen/AArch64/atomic-ops.ll +++ b/llvm/test/CodeGen/AArch64/atomic-ops.ll @@ -612,16 +612,15 @@ define dso_local i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind { define dso_local i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind { ; INLINE_ATOMICS-LABEL: test_atomic_load_xchg_i32: ; INLINE_ATOMICS: // %bb.0: -; INLINE_ATOMICS-NEXT: mov w8, w0 ; INLINE_ATOMICS-NEXT: adrp x9, var32 ; INLINE_ATOMICS-NEXT: add x9, x9, :lo12:var32 ; INLINE_ATOMICS-NEXT: .LBB22_1: // %atomicrmw.start ; INLINE_ATOMICS-NEXT: // =>This Inner Loop Header: Depth=1 -; INLINE_ATOMICS-NEXT: ldxr w0, [x9] -; INLINE_ATOMICS-NEXT: stlxr w10, w8, [x9] +; 
INLINE_ATOMICS-NEXT: ldxr w8, [x9] +; INLINE_ATOMICS-NEXT: stlxr w10, w0, [x9] ; INLINE_ATOMICS-NEXT: cbnz w10, .LBB22_1 ; INLINE_ATOMICS-NEXT: // %bb.2: // %atomicrmw.end -; INLINE_ATOMICS-NEXT: // kill: def $w0 killed $w0 killed $x0 +; INLINE_ATOMICS-NEXT: mov w0, w8 ; INLINE_ATOMICS-NEXT: ret ; ; OUTLINE_ATOMICS-LABEL: test_atomic_load_xchg_i32: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll index 5f293e5c7ea34f..872ea11419cc7a 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll @@ -55,15 +55,15 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-LABEL: atomicrmw_uinc_wrap_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, x0 ; CHECK-NEXT: .LBB3_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxr x0, [x8] -; CHECK-NEXT: cmp x0, x1 -; CHECK-NEXT: csinc x9, xzr, x0, hs -; CHECK-NEXT: stlxr w10, x9, [x8] +; CHECK-NEXT: ldaxr x8, [x0] +; CHECK-NEXT: cmp x8, x1 +; CHECK-NEXT: csinc x9, xzr, x8, hs +; CHECK-NEXT: stlxr w10, x9, [x0] ; CHECK-NEXT: cbnz w10, .LBB3_1 ; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i64 %val seq_cst ret i64 %result @@ -129,17 +129,17 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-LABEL: atomicrmw_udec_wrap_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, x0 ; CHECK-NEXT: .LBB7_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxr x0, [x8] -; CHECK-NEXT: cmp x0, x1 -; CHECK-NEXT: sub x9, x0, #1 -; CHECK-NEXT: ccmp x0, #0, #4, ls +; CHECK-NEXT: ldaxr x8, [x0] +; CHECK-NEXT: cmp x8, x1 +; CHECK-NEXT: sub x9, x8, #1 +; CHECK-NEXT: ccmp x8, #0, #4, ls ; CHECK-NEXT: csel x9, x1, x9, eq -; CHECK-NEXT: stlxr w10, x9, [x8] +; CHECK-NEXT: stlxr w10, x9, [x0] ; CHECK-NEXT: cbnz w10, .LBB7_1 ; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i64 %val seq_cst ret i64 %result diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll b/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll index 98033a8e449ffb..bd8757a5167a1f 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs -mtriple=aarch64-- -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s -check-prefix=NOLSE ; RUN: llc -verify-machineinstrs -mtriple=aarch64-- -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s -check-prefix=LSE @@ -32,14 +32,14 @@ define half @test_rmw_xchg_f16(ptr %dst, half %new) { define float @test_rmw_xchg_f32(ptr %dst, float %new) { ; NOLSE-LABEL: test_rmw_xchg_f32: ; NOLSE: // %bb.0: -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w8, s0 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 -; NOLSE-NEXT: ldaxr w8, [x0] -; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: ldaxr w9, [x0] +; NOLSE-NEXT: stlxr w10, w8, [x0] ; NOLSE-NEXT: cbnz w10, .LBB1_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: fmov s0, w8 +; 
NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_rmw_xchg_f32: @@ -97,20 +97,20 @@ define fp128 @test_rmw_xchg_f128(ptr %dst, fp128 %new) { ; LSE-NEXT: sub sp, sp, #32 ; LSE-NEXT: .cfi_def_cfa_offset 32 ; LSE-NEXT: str q0, [sp, #16] -; LSE-NEXT: ldp x2, x3, [sp, #16] -; LSE-NEXT: ldp x4, x5, [x0] +; LSE-NEXT: ldp x4, x5, [sp, #16] +; LSE-NEXT: ldp x2, x3, [x0] ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: mov x7, x5 -; LSE-NEXT: mov x6, x4 -; LSE-NEXT: mov x5, x7 -; LSE-NEXT: mov x4, x6 -; LSE-NEXT: caspal x4, x5, x2, x3, [x0] -; LSE-NEXT: cmp x5, x7 -; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: mov x3, x7 +; LSE-NEXT: mov x2, x6 +; LSE-NEXT: caspal x2, x3, x4, x5, [x0] +; LSE-NEXT: cmp x3, x7 +; LSE-NEXT: ccmp x2, x6, #0, eq ; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: stp x4, x5, [sp] +; LSE-NEXT: stp x2, x3, [sp] ; LSE-NEXT: ldr q0, [sp], #32 ; LSE-NEXT: ret %res = atomicrmw xchg ptr %dst, fp128 %new seq_cst diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll index b66b149bd643fa..6d469c28598808 100644 --- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll +++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll @@ -13,26 +13,26 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i64 @bfis_in_loop_zero() { ; CHECK-LABEL: bfis_in_loop_zero: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x8, :got:global -; CHECK-NEXT: mov x0, xzr -; CHECK-NEXT: mov w9, wzr -; CHECK-NEXT: ldr x8, [x8, :got_lo12:global] -; CHECK-NEXT: ldr x8, [x8] -; CHECK-NEXT: .LBB0_1: // %midblock -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrh w10, [x8, #72] -; CHECK-NEXT: ldr x13, [x8, #8] -; CHECK-NEXT: ubfx x11, x10, #8, #24 -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: cset w12, ne -; CHECK-NEXT: ldr x8, [x13, #16] -; CHECK-NEXT: csel w9, w9, w11, eq -; CHECK-NEXT: and x11, x0, #0xffffffff00000000 -; CHECK-NEXT: orr x10, x10, x9, lsl #8 -; CHECK-NEXT: orr x11, x11, x12, lsl #16 -; CHECK-NEXT: orr x0, x11, x10 -; CHECK-NEXT: cbnz x13, .LBB0_1 +; CHECK-NEXT: adrp x8, :got:global +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: ldr x8, [x8, :got_lo12:global] +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: .LBB0_1: // %midblock +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrh w10, [x9, #72] +; CHECK-NEXT: ldr x13, [x9, #8] +; CHECK-NEXT: ubfx x11, x10, #8, #24 +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: cset w12, ne +; CHECK-NEXT: ldr x9, [x13, #16] +; CHECK-NEXT: csel w8, w8, w11, eq +; CHECK-NEXT: and x11, x0, #0xffffffff00000000 +; CHECK-NEXT: orr x10, x10, x8, lsl #8 +; CHECK-NEXT: orr x11, x11, x12, lsl #16 +; CHECK-NEXT: orr x0, x11, x10 +; CHECK-NEXT: cbnz x13, .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: @@ -81,26 +81,26 @@ exit: define i64 @bfis_in_loop_undef() { ; CHECK-LABEL: bfis_in_loop_undef: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x9, :got:global -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: // implicit-def: $x0 -; CHECK-NEXT: ldr x9, [x9, :got_lo12:global] -; CHECK-NEXT: ldr x9, [x9] -; CHECK-NEXT: .LBB1_1: // %midblock -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrh w10, [x9, #72] -; CHECK-NEXT: ldr x13, [x9, #8] -; CHECK-NEXT: ubfx x11, x10, #8, #24 -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: and x10, x10, #0xff -; 
CHECK-NEXT: cset w12, ne -; CHECK-NEXT: ldr x9, [x13, #16] -; CHECK-NEXT: csel w8, w8, w11, eq -; CHECK-NEXT: and x11, x0, #0xffffffff00000000 -; CHECK-NEXT: orr x10, x10, x8, lsl #8 -; CHECK-NEXT: orr x11, x11, x12, lsl #16 -; CHECK-NEXT: orr x0, x11, x10 -; CHECK-NEXT: cbnz x13, .LBB1_1 +; CHECK-NEXT: adrp x9, :got:global +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: // implicit-def: $x0 +; CHECK-NEXT: ldr x9, [x9, :got_lo12:global] +; CHECK-NEXT: ldr x9, [x9] +; CHECK-NEXT: .LBB1_1: // %midblock +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrh w10, [x9, #72] +; CHECK-NEXT: ldr x13, [x9, #8] +; CHECK-NEXT: ubfx x11, x10, #8, #24 +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: cset w12, ne +; CHECK-NEXT: ldr x9, [x13, #16] +; CHECK-NEXT: csel w8, w8, w11, eq +; CHECK-NEXT: and x11, x0, #0xffffffff00000000 +; CHECK-NEXT: orr x10, x10, x8, lsl #8 +; CHECK-NEXT: orr x11, x11, x12, lsl #16 +; CHECK-NEXT: orr x0, x11, x10 +; CHECK-NEXT: cbnz x13, .LBB1_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll index 01fd2b1113b000..f00265a80e0328 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll @@ -26,25 +26,25 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: zip2 p3.d, p1.d, p1.d +; CHECK-NEXT: zip2 p2.d, p1.d, p1.d ; CHECK-NEXT: add x13, x0, x8 ; CHECK-NEXT: add x14, x1, x8 -; CHECK-NEXT: zip1 p2.d, p1.d, p1.d +; CHECK-NEXT: zip1 p3.d, p1.d, p1.d ; CHECK-NEXT: mov z6.d, z1.d ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: whilelo p1.d, x12, x9 ; CHECK-NEXT: add x8, x8, x11 ; CHECK-NEXT: add x12, x12, x10 -; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] -; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] +; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p3/z, [x13] +; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p3/z, [x14] ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 -; CHECK-NEXT: mov z0.d, p3/m, z7.d -; CHECK-NEXT: mov z1.d, p2/m, z6.d +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p3/m, z6.d ; CHECK-NEXT: b.mi .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -237,19 +237,19 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: add x9, x9, x11 ; CHECK-NEXT: add x8, x8, x12 ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 -; CHECK-NEXT: zip2 p3.d, p1.d, p1.d -; CHECK-NEXT: zip1 p2.d, p1.d, p1.d +; CHECK-NEXT: zip2 p2.d, p1.d, p1.d +; CHECK-NEXT: zip1 p3.d, p1.d, p1.d ; CHECK-NEXT: whilelo p1.d, x9, x10 -; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] -; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] +; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p3/z, [x13] +; 
CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p3/z, [x14] ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 -; CHECK-NEXT: mov z0.d, p3/m, z7.d -; CHECK-NEXT: mov z1.d, p2/m, z6.d +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p3/m, z6.d ; CHECK-NEXT: b.mi .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index e8d9ec7dc85de7..f07922379af537 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -201,55 +201,55 @@ entry: define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) { ; CHECK-LABEL: abp90c12: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-NEXT: ldr s16, [sp, #40] +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: ldr s16, [sp, #32] ; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: add x9, sp, #48 ; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: ldr s3, [sp, #32] +; CHECK-NEXT: ldr s3, [sp, #40] ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-NEXT: ldr s18, [sp, #8] -; CHECK-NEXT: ld1 { v16.s }[1], [x10] +; CHECK-NEXT: ld1 { v16.s }[1], [x9] ; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: add x10, sp, #72 -; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ldr s17, [sp, #104] +; CHECK-NEXT: ld1 { v3.s }[1], [x10] +; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: ldr s17, [sp, #96] ; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 ; CHECK-NEXT: ldr s2, [sp, #136] ; CHECK-NEXT: ldr s20, [sp, #192] ; CHECK-NEXT: mov v1.s[2], v5.s[0] -; CHECK-NEXT: ld1 { v16.s }[2], [x10] -; CHECK-NEXT: ldr s5, [sp, #96] -; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: ld1 { v16.s }[2], [x9] ; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: ld1 { v3.s }[2], [x10] +; CHECK-NEXT: ldr s5, [sp, #104] ; CHECK-NEXT: add x10, sp, #112 ; CHECK-NEXT: ld1 { v18.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #88 ; CHECK-NEXT: mov v0.s[2], v4.s[0] -; CHECK-NEXT: ld1 { v5.s }[1], [x10] +; CHECK-NEXT: ld1 { v17.s }[1], [x10] ; CHECK-NEXT: add x10, sp, #80 -; CHECK-NEXT: ld1 { v16.s }[3], [x9] +; CHECK-NEXT: ldr s4, [sp, #128] ; CHECK-NEXT: mov v1.s[3], v7.s[0] +; CHECK-NEXT: ld1 { v3.s }[3], [x9] ; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ldr s4, [sp, #128] -; CHECK-NEXT: ld1 { v3.s }[3], [x10] -; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ld1 { v16.s }[3], [x10] +; CHECK-NEXT: ld1 { v5.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: ldr s7, [sp] ; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: mov v0.s[3], v6.s[0] ; CHECK-NEXT: add x10, sp, #16 ; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: fmul v6.4s, v16.4s, v1.4s -; CHECK-NEXT: fmul v19.4s, v17.4s, v18.4s -; CHECK-NEXT: fmul v18.4s, v5.4s, v18.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s +; CHECK-NEXT: fmul v6.4s, v3.4s, v1.4s +; CHECK-NEXT: fmul v19.4s, v5.4s, v18.4s +; CHECK-NEXT: fmul v18.4s, v17.4s, v18.4s +; CHECK-NEXT: fmul 
v1.4s, v16.4s, v1.4s ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: ld1 { v4.s }[2], [x9] ; CHECK-NEXT: add x9, sp, #152 @@ -259,21 +259,21 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK-NEXT: ld1 { v20.s }[1], [x10] ; CHECK-NEXT: fneg v6.4s, v6.4s ; CHECK-NEXT: fneg v19.4s, v19.4s -; CHECK-NEXT: fmla v18.4s, v7.4s, v17.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v16.4s +; CHECK-NEXT: fmla v18.4s, v7.4s, v5.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v3.4s ; CHECK-NEXT: ld1 { v4.s }[3], [x9] ; CHECK-NEXT: add x9, sp, #168 ; CHECK-NEXT: ld1 { v2.s }[2], [x9] -; CHECK-NEXT: ldr s16, [sp, #200] +; CHECK-NEXT: ldr s3, [sp, #200] ; CHECK-NEXT: add x9, sp, #216 ; CHECK-NEXT: add x10, sp, #184 -; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v19.4s, v7.4s, v5.4s -; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: fmla v6.4s, v0.4s, v16.4s +; CHECK-NEXT: fmla v19.4s, v7.4s, v17.4s +; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: fsub v0.4s, v4.4s, v1.4s ; CHECK-NEXT: fsub v1.4s, v20.4s, v18.4s ; CHECK-NEXT: ld1 { v2.s }[3], [x10] -; CHECK-NEXT: fadd v3.4s, v16.4s, v19.4s +; CHECK-NEXT: fadd v3.4s, v3.4s, v19.4s ; CHECK-NEXT: fadd v2.4s, v2.4s, v6.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 ; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index 99f573795489a0..1f1651ecf67abe 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -645,27 +645,27 @@ define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_load: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: add x11, x1, #12 -; CHECK-NEXT: str s1, [x4] +; CHECK-NEXT: str s0, [x4] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldp s1, s5, [x2] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ldp s0, s5, [x2] -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: umov w9, v2.h[0] -; CHECK-NEXT: umov w10, v2.h[1] -; CHECK-NEXT: mov v0.b[8], w9 -; CHECK-NEXT: umov w9, v2.h[2] -; CHECK-NEXT: mov v0.b[9], w10 -; CHECK-NEXT: umov w10, v2.h[3] -; CHECK-NEXT: ldr s2, [x1] -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: mov v0.b[10], w9 +; CHECK-NEXT: umov w9, v1.h[0] +; CHECK-NEXT: umov w10, v1.h[1] +; CHECK-NEXT: mov v2.b[8], w9 +; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov v2.b[9], w10 +; CHECK-NEXT: umov w10, v1.h[3] +; CHECK-NEXT: ldr s1, [x1] +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: mov v2.b[10], w9 ; CHECK-NEXT: add x9, x1, #4 -; CHECK-NEXT: uzp1 v1.8b, v1.8b, v2.8b -; CHECK-NEXT: mov v0.b[11], w10 +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v2.b[11], w10 ; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 +; CHECK-NEXT: ld1 { v2.s }[3], [x3], #4 ; CHECK-NEXT: ldr s4, [x0, #12] ; CHECK-NEXT: ldp s3, s16, [x0, #4] ; CHECK-NEXT: ldp s6, s7, [x2, #8] @@ -676,19 +676,19 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: add x8, x1, #8 ; CHECK-NEXT: ld1 { v16.s }[1], [x8] -; CHECK-NEXT: uaddl v2.8h, v3.8b, v4.8b +; CHECK-NEXT: uaddl v1.8h, v3.8b, v4.8b ; CHECK-NEXT: ushll v3.8h, v6.8b, #0 ; CHECK-NEXT: uaddl v4.8h, v5.8b, v7.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v16.8b -; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b -; CHECK-NEXT: ushll 
v0.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 +; CHECK-NEXT: uaddl v5.8h, v0.8b, v16.8b +; CHECK-NEXT: uaddw2 v2.8h, v3.8h, v2.16b +; CHECK-NEXT: ushll v0.4s, v1.4h, #3 +; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 ; CHECK-NEXT: ushll v6.4s, v4.4h, #3 ; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3 -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v5.8h -; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v5.4h +; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v5.8h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h +; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p store <4 x i8> %lp1, ptr %z @@ -757,39 +757,39 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_shuffle: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s2, s7, [x0, #8] +; CHECK-NEXT: ldp s2, s3, [x0, #8] ; CHECK-NEXT: add x8, x3, #8 -; CHECK-NEXT: ldr s18, [x1, #12] +; CHECK-NEXT: ldr s6, [x1, #12] ; CHECK-NEXT: ldp s0, s1, [x2] -; CHECK-NEXT: ldp s3, s16, [x0] +; CHECK-NEXT: ldp s4, s5, [x0] ; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: mov v4.16b, v7.16b -; CHECK-NEXT: ldp s6, s17, [x2, #8] -; CHECK-NEXT: ldr s5, [x3, #12] -; CHECK-NEXT: mov v7.s[1], v18.s[0] +; CHECK-NEXT: mov v18.16b, v3.16b +; CHECK-NEXT: ldp s7, s16, [x2, #8] +; CHECK-NEXT: ldr s17, [x3, #12] +; CHECK-NEXT: mov v3.s[1], v6.s[0] ; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: mov v4.s[1], v18.s[0] -; CHECK-NEXT: ld1 { v3.s }[1], [x1], #4 +; CHECK-NEXT: mov v18.s[1], v6.s[0] +; CHECK-NEXT: ld1 { v4.s }[1], [x1], #4 ; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v7.s }[1], [x8] ; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v16.s }[1], [x1] -; CHECK-NEXT: mov v4.s[2], v17.s[0] -; CHECK-NEXT: mov v17.s[1], v5.s[0] -; CHECK-NEXT: uaddl v2.8h, v3.8b, v2.8b -; CHECK-NEXT: uaddl v6.8h, v0.8b, v6.8b -; CHECK-NEXT: uaddl v7.8h, v16.8b, v7.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v17.8b -; CHECK-NEXT: mov v4.s[3], v5.s[0] -; CHECK-NEXT: ushll v0.4s, v7.4h, #3 -; CHECK-NEXT: ushll v16.4s, v1.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 -; CHECK-NEXT: ushll2 v1.4s, v7.8h, #3 +; CHECK-NEXT: ld1 { v5.s }[1], [x1] +; CHECK-NEXT: mov v18.s[2], v16.s[0] +; CHECK-NEXT: mov v16.s[1], v17.s[0] +; CHECK-NEXT: uaddl v2.8h, v4.8b, v2.8b +; CHECK-NEXT: uaddl v4.8h, v0.8b, v7.8b +; CHECK-NEXT: uaddl v3.8h, v5.8b, v3.8b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v16.8b +; CHECK-NEXT: mov v18.s[3], v17.s[0] +; CHECK-NEXT: ushll v0.4s, v3.4h, #3 +; CHECK-NEXT: ushll v5.4s, v1.4h, #3 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v1.4s, v3.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: str q4, [x4] +; CHECK-NEXT: str q18, [x4] ; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v6.8h -; CHECK-NEXT: uaddw v2.4s, v16.4s, v6.4h +; CHECK-NEXT: uaddw2 v3.4s, v6.4s, v4.8h +; CHECK-NEXT: uaddw v2.4s, v5.4s, v4.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -858,37 +858,37 @@ define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_ext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s1, s2, [x2] +; CHECK-NEXT: ldp s0, s1, [x2] ; CHECK-NEXT: add x8, x3, #8 -; CHECK-NEXT: ldp s3, s5, [x0] +; CHECK-NEXT: ldp s2, s3, [x0] ; 
CHECK-NEXT: add x9, x1, #8 ; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ldp s6, s0, [x2, #8] -; CHECK-NEXT: ldp s7, s4, [x0, #8] +; CHECK-NEXT: ldp s4, s5, [x2, #8] +; CHECK-NEXT: ldp s6, s7, [x0, #8] ; CHECK-NEXT: add x11, x1, #12 -; CHECK-NEXT: ld1 { v1.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v3.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v0.s }[1], [x10] -; CHECK-NEXT: ld1 { v4.s }[1], [x11] -; CHECK-NEXT: ld1 { v7.s }[1], [x9] -; CHECK-NEXT: ld1 { v6.s }[1], [x8] -; CHECK-NEXT: ld1 { v2.s }[1], [x3] -; CHECK-NEXT: ld1 { v5.s }[1], [x1] -; CHECK-NEXT: ushll v16.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v5.s }[1], [x10] +; CHECK-NEXT: ld1 { v7.s }[1], [x11] +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: ld1 { v4.s }[1], [x8] +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: ld1 { v3.s }[1], [x1] +; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b +; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b +; CHECK-NEXT: ushll v5.8h, v5.8b, #0 ; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: uaddl v6.8h, v1.8b, v6.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v0.8b -; CHECK-NEXT: uaddl v5.8h, v5.8b, v4.8b -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: ushll v7.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: stp q4, q16, [x4] -; CHECK-NEXT: ushll v1.4s, v5.4h, #3 -; CHECK-NEXT: ushll2 v5.4s, v5.8h, #3 -; CHECK-NEXT: uaddw v0.4s, v1.4s, v3.4h -; CHECK-NEXT: uaddw2 v1.4s, v5.4s, v3.8h -; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v6.8h -; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h +; CHECK-NEXT: ushll v6.4s, v1.4h, #3 +; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 +; CHECK-NEXT: ushll v0.4s, v3.4h, #3 +; CHECK-NEXT: ushll2 v1.4s, v3.8h, #3 +; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v4.8h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h +; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h +; CHECK-NEXT: ushll v4.8h, v7.8b, #0 +; CHECK-NEXT: stp q4, q5, [x4] ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 diff --git a/llvm/test/CodeGen/AArch64/faddp-half.ll b/llvm/test/CodeGen/AArch64/faddp-half.ll index 6068a4742eea99..6a450881dc978b 100644 --- a/llvm/test/CodeGen/AArch64/faddp-half.ll +++ b/llvm/test/CodeGen/AArch64/faddp-half.ll @@ -223,15 +223,15 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECKNOFP16: // %bb.0: // %entry ; CHECKNOFP16-NEXT: rev32 v5.8h, v0.8h ; CHECKNOFP16-NEXT: rev32 v4.8h, v1.8h -; CHECKNOFP16-NEXT: mov h3, v0.h[1] +; CHECKNOFP16-NEXT: mov h2, v0.h[1] ; CHECKNOFP16-NEXT: mov h6, v1.h[1] ; CHECKNOFP16-NEXT: fcvt s16, h0 ; CHECKNOFP16-NEXT: mov h17, v0.h[2] ; CHECKNOFP16-NEXT: fcvt s20, h1 ; CHECKNOFP16-NEXT: mov h21, v1.h[2] -; CHECKNOFP16-NEXT: mov h2, v5.h[1] +; CHECKNOFP16-NEXT: mov h3, v5.h[1] ; CHECKNOFP16-NEXT: mov h7, v4.h[1] -; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fcvt s2, h2 ; CHECKNOFP16-NEXT: fcvt s18, h5 ; CHECKNOFP16-NEXT: mov h19, v5.h[2] ; CHECKNOFP16-NEXT: fcvt s6, h6 @@ -241,7 +241,7 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECKNOFP16-NEXT: mov h24, v5.h[3] ; CHECKNOFP16-NEXT: fcvt s21, h21 ; CHECKNOFP16-NEXT: mov h25, v4.h[6] -; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 ; CHECKNOFP16-NEXT: fcvt s7, h7 ; CHECKNOFP16-NEXT: fadd s16, s18, s16 ; CHECKNOFP16-NEXT: fcvt s18, h19 @@ -249,7 +249,7 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECKNOFP16-NEXT: fadd s20, s22, s20 ; CHECKNOFP16-NEXT: fcvt s22, h23 ; CHECKNOFP16-NEXT: mov h23, v4.h[3] -; 
CHECKNOFP16-NEXT: fadd s3, s2, s3 +; CHECKNOFP16-NEXT: fadd s3, s3, s2 ; CHECKNOFP16-NEXT: fadd s6, s7, s6 ; CHECKNOFP16-NEXT: mov h7, v1.h[3] ; CHECKNOFP16-NEXT: fcvt h2, s16 diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll index c02e02d9257e9e..cedb3f9fc176d6 100644 --- a/llvm/test/CodeGen/AArch64/faddsub.ll +++ b/llvm/test/CodeGen/AArch64/faddsub.ll @@ -394,113 +394,113 @@ entry: define <16 x half> @fadd_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-SD-NOFP16-LABEL: fadd_v16f16: ; CHECK-SD-NOFP16: // %bb.0: // %entry -; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[1] -; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[1] -; CHECK-SD-NOFP16-NEXT: fcvt s4, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h0 +; CHECK-SD-NOFP16-NEXT: mov v4.16b, v0.16b +; CHECK-SD-NOFP16-NEXT: mov h0, v2.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s6, h2 ; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[1] ; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[1] ; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[2] -; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2] ; CHECK-SD-NOFP16-NEXT: fcvt s20, h3 ; CHECK-SD-NOFP16-NEXT: fcvt s21, h1 ; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[2] +; CHECK-SD-NOFP16-NEXT: mov h5, v4.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s7, h4 +; CHECK-SD-NOFP16-NEXT: mov h19, v4.h[2] +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 ; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[2] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 -; CHECK-SD-NOFP16-NEXT: mov h24, v0.h[6] -; CHECK-SD-NOFP16-NEXT: fadd s4, s5, s4 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h16 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h17 -; CHECK-SD-NOFP16-NEXT: fcvt s17, h18 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h19 -; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[3] +; CHECK-SD-NOFP16-NEXT: mov h24, v4.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: mov h25, v1.h[6] ; CHECK-SD-NOFP16-NEXT: fadd s20, s21, s20 ; CHECK-SD-NOFP16-NEXT: fcvt s21, h22 ; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt s5, h5 ; CHECK-SD-NOFP16-NEXT: fadd s6, s7, s6 -; CHECK-SD-NOFP16-NEXT: mov h7, v2.h[3] -; CHECK-SD-NOFP16-NEXT: mov h25, v1.h[6] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s4 -; CHECK-SD-NOFP16-NEXT: fadd s5, s16, s5 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h23 +; CHECK-SD-NOFP16-NEXT: fcvt s7, h16 +; CHECK-SD-NOFP16-NEXT: fcvt s16, h17 +; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt s19, h19 +; CHECK-SD-NOFP16-NEXT: fadd s5, s5, s0 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s6 +; CHECK-SD-NOFP16-NEXT: fadd s6, s16, s7 +; CHECK-SD-NOFP16-NEXT: fcvt s7, h23 ; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[3] -; CHECK-SD-NOFP16-NEXT: fadd s17, s18, s17 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h19 -; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 -; CHECK-SD-NOFP16-NEXT: fcvt h19, s5 +; CHECK-SD-NOFP16-NEXT: fadd s18, s19, s18 +; CHECK-SD-NOFP16-NEXT: fcvt s17, h17 +; CHECK-SD-NOFP16-NEXT: fcvt s19, h24 +; CHECK-SD-NOFP16-NEXT: mov h24, v4.h[6] +; CHECK-SD-NOFP16-NEXT: fcvt h16, s5 ; CHECK-SD-NOFP16-NEXT: fcvt h5, s20 -; CHECK-SD-NOFP16-NEXT: fadd s16, s16, s21 +; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 +; CHECK-SD-NOFP16-NEXT: fadd s7, s7, s21 ; CHECK-SD-NOFP16-NEXT: fcvt s20, h23 -; CHECK-SD-NOFP16-NEXT: fcvt h17, s17 +; CHECK-SD-NOFP16-NEXT: fcvt h18, s18 ; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[4] +; CHECK-SD-NOFP16-NEXT: fadd s17, s19, s17 +; CHECK-SD-NOFP16-NEXT: mov h19, v3.h[4] ; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[4] -; CHECK-SD-NOFP16-NEXT: mov v4.h[1], v6.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h22 -; CHECK-SD-NOFP16-NEXT: mov h22, v0.h[4] -; CHECK-SD-NOFP16-NEXT: 
fadd s7, s18, s7 -; CHECK-SD-NOFP16-NEXT: mov h18, v3.h[4] -; CHECK-SD-NOFP16-NEXT: mov v5.h[1], v19.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h16, s16 -; CHECK-SD-NOFP16-NEXT: fadd s6, s20, s6 -; CHECK-SD-NOFP16-NEXT: mov v4.h[2], v17.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt s17, h21 -; CHECK-SD-NOFP16-NEXT: fcvt s19, h22 -; CHECK-SD-NOFP16-NEXT: fcvt h7, s7 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v16.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt s16, h22 +; CHECK-SD-NOFP16-NEXT: mov h22, v4.h[4] +; CHECK-SD-NOFP16-NEXT: mov v5.h[1], v6.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h6, s7 +; CHECK-SD-NOFP16-NEXT: fcvt h17, s17 +; CHECK-SD-NOFP16-NEXT: fcvt s19, h19 +; CHECK-SD-NOFP16-NEXT: fadd s7, s20, s16 +; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v18.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt s16, h21 +; CHECK-SD-NOFP16-NEXT: fcvt s18, h22 ; CHECK-SD-NOFP16-NEXT: fcvt s20, h23 -; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[5] -; CHECK-SD-NOFP16-NEXT: mov h22, v0.h[5] -; CHECK-SD-NOFP16-NEXT: mov v5.h[2], v16.h[0] -; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[5] +; CHECK-SD-NOFP16-NEXT: mov v5.h[2], v6.h[0] ; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[5] -; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 -; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] +; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[5] +; CHECK-SD-NOFP16-NEXT: mov h22, v4.h[5] ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7] -; CHECK-SD-NOFP16-NEXT: fadd s17, s19, s17 -; CHECK-SD-NOFP16-NEXT: mov h19, v2.h[6] -; CHECK-SD-NOFP16-NEXT: mov v4.h[3], v7.h[0] -; CHECK-SD-NOFP16-NEXT: fadd s18, s20, s18 +; CHECK-SD-NOFP16-NEXT: mov h4, v4.h[7] +; CHECK-SD-NOFP16-NEXT: fcvt h6, s7 +; CHECK-SD-NOFP16-NEXT: mov h7, v3.h[5] +; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v17.h[0] +; CHECK-SD-NOFP16-NEXT: fadd s16, s18, s16 +; CHECK-SD-NOFP16-NEXT: fadd s19, s20, s19 ; CHECK-SD-NOFP16-NEXT: mov h20, v3.h[6] -; CHECK-SD-NOFP16-NEXT: fcvt s7, h21 +; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[6] +; CHECK-SD-NOFP16-NEXT: fcvt s17, h21 ; CHECK-SD-NOFP16-NEXT: fcvt s21, h22 +; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7] ; CHECK-SD-NOFP16-NEXT: fcvt s22, h24 ; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7] ; CHECK-SD-NOFP16-NEXT: mov v5.h[3], v6.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h16 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h23 -; CHECK-SD-NOFP16-NEXT: fcvt h17, s17 -; CHECK-SD-NOFP16-NEXT: fcvt s19, h19 -; CHECK-SD-NOFP16-NEXT: fcvt s23, h25 -; CHECK-SD-NOFP16-NEXT: fcvt h18, s18 +; CHECK-SD-NOFP16-NEXT: fcvt s6, h7 +; CHECK-SD-NOFP16-NEXT: fcvt s7, h23 +; CHECK-SD-NOFP16-NEXT: fcvt h16, s16 +; CHECK-SD-NOFP16-NEXT: fcvt h19, s19 ; CHECK-SD-NOFP16-NEXT: fcvt s20, h20 -; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7] -; CHECK-SD-NOFP16-NEXT: fadd s7, s21, s7 -; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 -; CHECK-SD-NOFP16-NEXT: fadd s6, s16, s6 -; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 -; CHECK-SD-NOFP16-NEXT: mov v4.h[4], v17.h[0] -; CHECK-SD-NOFP16-NEXT: fadd s16, s22, s19 -; CHECK-SD-NOFP16-NEXT: mov v5.h[4], v18.h[0] -; CHECK-SD-NOFP16-NEXT: fadd s17, s23, s20 +; CHECK-SD-NOFP16-NEXT: fcvt s23, h25 +; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: fadd s17, s21, s17 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 -; CHECK-SD-NOFP16-NEXT: fcvt h7, s7 -; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2 -; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 -; CHECK-SD-NOFP16-NEXT: fcvt h2, s16 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fadd s6, s7, s6 +; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 +; CHECK-SD-NOFP16-NEXT: mov v0.h[4], v16.h[0] +; CHECK-SD-NOFP16-NEXT: mov v5.h[4], v19.h[0] +; 
CHECK-SD-NOFP16-NEXT: fadd s16, s23, s20 +; CHECK-SD-NOFP16-NEXT: fadd s7, s22, s18 +; CHECK-SD-NOFP16-NEXT: fcvt h17, s17 ; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s3 -; CHECK-SD-NOFP16-NEXT: mov v4.h[5], v7.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 -; CHECK-SD-NOFP16-NEXT: mov v5.h[5], v6.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h6, s17 +; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 +; CHECK-SD-NOFP16-NEXT: fadd s2, s4, s2 +; CHECK-SD-NOFP16-NEXT: mov v0.h[5], v17.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h4, s7 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 -; CHECK-SD-NOFP16-NEXT: mov v4.h[6], v2.h[0] +; CHECK-SD-NOFP16-NEXT: mov v5.h[5], v6.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h6, s16 +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: mov v0.h[6], v4.h[0] ; CHECK-SD-NOFP16-NEXT: mov v5.h[6], v6.h[0] -; CHECK-SD-NOFP16-NEXT: mov v4.h[7], v0.h[0] +; CHECK-SD-NOFP16-NEXT: mov v0.h[7], v2.h[0] ; CHECK-SD-NOFP16-NEXT: mov v5.h[7], v1.h[0] -; CHECK-SD-NOFP16-NEXT: mov v0.16b, v4.16b ; CHECK-SD-NOFP16-NEXT: mov v1.16b, v5.16b ; CHECK-SD-NOFP16-NEXT: ret ; @@ -930,113 +930,113 @@ entry: define <16 x half> @fsub_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-SD-NOFP16-LABEL: fsub_v16f16: ; CHECK-SD-NOFP16: // %bb.0: // %entry -; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[1] -; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[1] -; CHECK-SD-NOFP16-NEXT: fcvt s4, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h0 +; CHECK-SD-NOFP16-NEXT: mov v4.16b, v0.16b +; CHECK-SD-NOFP16-NEXT: mov h0, v2.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s6, h2 ; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[1] ; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[1] ; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[2] -; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2] ; CHECK-SD-NOFP16-NEXT: fcvt s20, h3 ; CHECK-SD-NOFP16-NEXT: fcvt s21, h1 ; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[2] +; CHECK-SD-NOFP16-NEXT: mov h5, v4.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s7, h4 +; CHECK-SD-NOFP16-NEXT: mov h19, v4.h[2] +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 ; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[2] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 -; CHECK-SD-NOFP16-NEXT: mov h24, v0.h[6] -; CHECK-SD-NOFP16-NEXT: fsub s4, s5, s4 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h16 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h17 -; CHECK-SD-NOFP16-NEXT: fcvt s17, h18 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h19 -; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[3] +; CHECK-SD-NOFP16-NEXT: mov h24, v4.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: mov h25, v1.h[6] ; CHECK-SD-NOFP16-NEXT: fsub s20, s21, s20 ; CHECK-SD-NOFP16-NEXT: fcvt s21, h22 ; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt s5, h5 ; CHECK-SD-NOFP16-NEXT: fsub s6, s7, s6 -; CHECK-SD-NOFP16-NEXT: mov h7, v2.h[3] -; CHECK-SD-NOFP16-NEXT: mov h25, v1.h[6] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s4 -; CHECK-SD-NOFP16-NEXT: fsub s5, s16, s5 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h23 +; CHECK-SD-NOFP16-NEXT: fcvt s7, h16 +; CHECK-SD-NOFP16-NEXT: fcvt s16, h17 +; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt s19, h19 +; CHECK-SD-NOFP16-NEXT: fsub s5, s5, s0 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s6 +; CHECK-SD-NOFP16-NEXT: fsub s6, s16, s7 +; CHECK-SD-NOFP16-NEXT: fcvt s7, h23 ; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[3] -; CHECK-SD-NOFP16-NEXT: fsub s17, s18, s17 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h19 -; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 -; CHECK-SD-NOFP16-NEXT: fcvt h19, s5 +; CHECK-SD-NOFP16-NEXT: fsub s18, s19, s18 +; CHECK-SD-NOFP16-NEXT: fcvt s17, h17 +; CHECK-SD-NOFP16-NEXT: fcvt s19, h24 +; CHECK-SD-NOFP16-NEXT: mov h24, 
v4.h[6] +; CHECK-SD-NOFP16-NEXT: fcvt h16, s5 ; CHECK-SD-NOFP16-NEXT: fcvt h5, s20 -; CHECK-SD-NOFP16-NEXT: fsub s16, s16, s21 +; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 +; CHECK-SD-NOFP16-NEXT: fsub s7, s7, s21 ; CHECK-SD-NOFP16-NEXT: fcvt s20, h23 -; CHECK-SD-NOFP16-NEXT: fcvt h17, s17 +; CHECK-SD-NOFP16-NEXT: fcvt h18, s18 ; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[4] +; CHECK-SD-NOFP16-NEXT: fsub s17, s19, s17 +; CHECK-SD-NOFP16-NEXT: mov h19, v3.h[4] ; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[4] -; CHECK-SD-NOFP16-NEXT: mov v4.h[1], v6.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h22 -; CHECK-SD-NOFP16-NEXT: mov h22, v0.h[4] -; CHECK-SD-NOFP16-NEXT: fsub s7, s18, s7 -; CHECK-SD-NOFP16-NEXT: mov h18, v3.h[4] -; CHECK-SD-NOFP16-NEXT: mov v5.h[1], v19.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h16, s16 -; CHECK-SD-NOFP16-NEXT: fsub s6, s20, s6 -; CHECK-SD-NOFP16-NEXT: mov v4.h[2], v17.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt s17, h21 -; CHECK-SD-NOFP16-NEXT: fcvt s19, h22 -; CHECK-SD-NOFP16-NEXT: fcvt h7, s7 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v16.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt s16, h22 +; CHECK-SD-NOFP16-NEXT: mov h22, v4.h[4] +; CHECK-SD-NOFP16-NEXT: mov v5.h[1], v6.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h6, s7 +; CHECK-SD-NOFP16-NEXT: fcvt h17, s17 +; CHECK-SD-NOFP16-NEXT: fcvt s19, h19 +; CHECK-SD-NOFP16-NEXT: fsub s7, s20, s16 +; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v18.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt s16, h21 +; CHECK-SD-NOFP16-NEXT: fcvt s18, h22 ; CHECK-SD-NOFP16-NEXT: fcvt s20, h23 -; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[5] -; CHECK-SD-NOFP16-NEXT: mov h22, v0.h[5] -; CHECK-SD-NOFP16-NEXT: mov v5.h[2], v16.h[0] -; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[5] +; CHECK-SD-NOFP16-NEXT: mov v5.h[2], v6.h[0] ; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[5] -; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 -; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] +; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[5] +; CHECK-SD-NOFP16-NEXT: mov h22, v4.h[5] ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7] -; CHECK-SD-NOFP16-NEXT: fsub s17, s19, s17 -; CHECK-SD-NOFP16-NEXT: mov h19, v2.h[6] -; CHECK-SD-NOFP16-NEXT: mov v4.h[3], v7.h[0] -; CHECK-SD-NOFP16-NEXT: fsub s18, s20, s18 +; CHECK-SD-NOFP16-NEXT: mov h4, v4.h[7] +; CHECK-SD-NOFP16-NEXT: fcvt h6, s7 +; CHECK-SD-NOFP16-NEXT: mov h7, v3.h[5] +; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v17.h[0] +; CHECK-SD-NOFP16-NEXT: fsub s16, s18, s16 +; CHECK-SD-NOFP16-NEXT: fsub s19, s20, s19 ; CHECK-SD-NOFP16-NEXT: mov h20, v3.h[6] -; CHECK-SD-NOFP16-NEXT: fcvt s7, h21 +; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[6] +; CHECK-SD-NOFP16-NEXT: fcvt s17, h21 ; CHECK-SD-NOFP16-NEXT: fcvt s21, h22 +; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7] ; CHECK-SD-NOFP16-NEXT: fcvt s22, h24 ; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7] ; CHECK-SD-NOFP16-NEXT: mov v5.h[3], v6.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h16 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h23 -; CHECK-SD-NOFP16-NEXT: fcvt h17, s17 -; CHECK-SD-NOFP16-NEXT: fcvt s19, h19 -; CHECK-SD-NOFP16-NEXT: fcvt s23, h25 -; CHECK-SD-NOFP16-NEXT: fcvt h18, s18 +; CHECK-SD-NOFP16-NEXT: fcvt s6, h7 +; CHECK-SD-NOFP16-NEXT: fcvt s7, h23 +; CHECK-SD-NOFP16-NEXT: fcvt h16, s16 +; CHECK-SD-NOFP16-NEXT: fcvt h19, s19 ; CHECK-SD-NOFP16-NEXT: fcvt s20, h20 -; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7] -; CHECK-SD-NOFP16-NEXT: fsub s7, s21, s7 -; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 -; CHECK-SD-NOFP16-NEXT: fsub s6, s16, s6 -; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 -; CHECK-SD-NOFP16-NEXT: mov v4.h[4], v17.h[0] -; CHECK-SD-NOFP16-NEXT: fsub s16, s22, s19 -; CHECK-SD-NOFP16-NEXT: 
mov v5.h[4], v18.h[0] -; CHECK-SD-NOFP16-NEXT: fsub s17, s23, s20 +; CHECK-SD-NOFP16-NEXT: fcvt s23, h25 +; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: fsub s17, s21, s17 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 -; CHECK-SD-NOFP16-NEXT: fcvt h7, s7 -; CHECK-SD-NOFP16-NEXT: fsub s0, s0, s2 -; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 -; CHECK-SD-NOFP16-NEXT: fcvt h2, s16 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fsub s6, s7, s6 +; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 +; CHECK-SD-NOFP16-NEXT: mov v0.h[4], v16.h[0] +; CHECK-SD-NOFP16-NEXT: mov v5.h[4], v19.h[0] +; CHECK-SD-NOFP16-NEXT: fsub s16, s23, s20 +; CHECK-SD-NOFP16-NEXT: fsub s7, s22, s18 +; CHECK-SD-NOFP16-NEXT: fcvt h17, s17 ; CHECK-SD-NOFP16-NEXT: fsub s1, s1, s3 -; CHECK-SD-NOFP16-NEXT: mov v4.h[5], v7.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 -; CHECK-SD-NOFP16-NEXT: mov v5.h[5], v6.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h6, s17 +; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 +; CHECK-SD-NOFP16-NEXT: fsub s2, s4, s2 +; CHECK-SD-NOFP16-NEXT: mov v0.h[5], v17.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h4, s7 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 -; CHECK-SD-NOFP16-NEXT: mov v4.h[6], v2.h[0] +; CHECK-SD-NOFP16-NEXT: mov v5.h[5], v6.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h6, s16 +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: mov v0.h[6], v4.h[0] ; CHECK-SD-NOFP16-NEXT: mov v5.h[6], v6.h[0] -; CHECK-SD-NOFP16-NEXT: mov v4.h[7], v0.h[0] +; CHECK-SD-NOFP16-NEXT: mov v0.h[7], v2.h[0] ; CHECK-SD-NOFP16-NEXT: mov v5.h[7], v1.h[0] -; CHECK-SD-NOFP16-NEXT: mov v0.16b, v4.16b ; CHECK-SD-NOFP16-NEXT: mov v1.16b, v5.16b ; CHECK-SD-NOFP16-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll index b5b9055fbc02f8..bfe8d173435c41 100644 --- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll +++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll @@ -498,7 +498,7 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) { ; CHECK-NO16: // %bb.0: ; CHECK-NO16-NEXT: mov h2, v0.h[4] ; CHECK-NO16-NEXT: mov h3, v0.h[5] -; CHECK-NO16-NEXT: mov w8, #32767 // =0x7fff +; CHECK-NO16-NEXT: mov w9, #32767 // =0x7fff ; CHECK-NO16-NEXT: mov h4, v0.h[6] ; CHECK-NO16-NEXT: fmov s1, #4.00000000 ; CHECK-NO16-NEXT: mov w11, #-32768 // =0xffff8000 @@ -512,82 +512,82 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) { ; CHECK-NO16-NEXT: fcvt s4, h4 ; CHECK-NO16-NEXT: fcvt s5, h5 ; CHECK-NO16-NEXT: fcvt s6, h6 +; CHECK-NO16-NEXT: fcvt s0, h0 ; CHECK-NO16-NEXT: fmul s2, s2, s1 ; CHECK-NO16-NEXT: fmul s3, s3, s1 ; CHECK-NO16-NEXT: fmul s4, s4, s1 ; CHECK-NO16-NEXT: fmul s5, s5, s1 ; CHECK-NO16-NEXT: fmul s6, s6, s1 +; CHECK-NO16-NEXT: fmul s0, s0, s1 ; CHECK-NO16-NEXT: fcvt h2, s2 ; CHECK-NO16-NEXT: fcvt h3, s3 ; CHECK-NO16-NEXT: fcvt h4, s4 ; CHECK-NO16-NEXT: fcvt h5, s5 ; CHECK-NO16-NEXT: fcvt h6, s6 +; CHECK-NO16-NEXT: fcvt h0, s0 ; CHECK-NO16-NEXT: mov v2.h[1], v3.h[0] ; CHECK-NO16-NEXT: fcvt s3, h7 ; CHECK-NO16-NEXT: fmul s7, s16, s1 ; CHECK-NO16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-NO16-NEXT: fcvt s4, h0 ; CHECK-NO16-NEXT: fmul s3, s3, s1 -; CHECK-NO16-NEXT: fcvt h0, s7 +; CHECK-NO16-NEXT: fcvt h4, s7 ; CHECK-NO16-NEXT: mov v2.h[3], v5.h[0] -; CHECK-NO16-NEXT: fmul s1, s4, s1 -; CHECK-NO16-NEXT: fcvt h3, s3 -; CHECK-NO16-NEXT: mov v0.h[1], v6.h[0] +; CHECK-NO16-NEXT: fcvt h1, s3 +; CHECK-NO16-NEXT: mov v4.h[1], v6.h[0] ; CHECK-NO16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-NO16-NEXT: fcvt h1, s1 -; CHECK-NO16-NEXT: mov v0.h[2], v3.h[0] -; CHECK-NO16-NEXT: mov s4, v2.s[1] +; CHECK-NO16-NEXT: mov 
v4.h[2], v1.h[0] +; CHECK-NO16-NEXT: mov s3, v2.s[1] +; CHECK-NO16-NEXT: mov v4.h[3], v0.h[0] +; CHECK-NO16-NEXT: mov s0, v2.s[2] ; CHECK-NO16-NEXT: fcvtzs w10, s2 -; CHECK-NO16-NEXT: mov v0.h[3], v1.h[0] -; CHECK-NO16-NEXT: mov s1, v2.s[2] ; CHECK-NO16-NEXT: mov s2, v2.s[3] -; CHECK-NO16-NEXT: fcvtzs w9, s4 -; CHECK-NO16-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NO16-NEXT: fcvtzs w12, s1 +; CHECK-NO16-NEXT: fcvtzs w8, s3 +; CHECK-NO16-NEXT: fcvtl v1.4s, v4.4h +; CHECK-NO16-NEXT: fcvtzs w12, s0 ; CHECK-NO16-NEXT: fcvtzs w13, s2 -; CHECK-NO16-NEXT: cmp w9, w8 -; CHECK-NO16-NEXT: csel w9, w9, w8, lt -; CHECK-NO16-NEXT: mov s1, v0.s[1] -; CHECK-NO16-NEXT: fcvtzs w15, s0 -; CHECK-NO16-NEXT: cmn w9, #8, lsl #12 // =32768 -; CHECK-NO16-NEXT: csel w9, w9, w11, gt -; CHECK-NO16-NEXT: cmp w10, w8 -; CHECK-NO16-NEXT: csel w10, w10, w8, lt +; CHECK-NO16-NEXT: cmp w8, w9 +; CHECK-NO16-NEXT: mov s0, v1.s[1] +; CHECK-NO16-NEXT: fcvtzs w15, s1 +; CHECK-NO16-NEXT: csel w8, w8, w9, lt +; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768 +; CHECK-NO16-NEXT: csel w8, w8, w11, gt +; CHECK-NO16-NEXT: cmp w10, w9 +; CHECK-NO16-NEXT: csel w10, w10, w9, lt +; CHECK-NO16-NEXT: fcvtzs w14, s0 +; CHECK-NO16-NEXT: mov s0, v1.s[2] ; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NO16-NEXT: fcvtzs w14, s1 -; CHECK-NO16-NEXT: mov s1, v0.s[2] ; CHECK-NO16-NEXT: csel w10, w10, w11, gt -; CHECK-NO16-NEXT: cmp w12, w8 -; CHECK-NO16-NEXT: mov s0, v0.s[3] -; CHECK-NO16-NEXT: csel w12, w12, w8, lt +; CHECK-NO16-NEXT: cmp w12, w9 +; CHECK-NO16-NEXT: csel w12, w12, w9, lt ; CHECK-NO16-NEXT: cmn w12, #8, lsl #12 // =32768 +; CHECK-NO16-NEXT: fcvtzs w16, s0 +; CHECK-NO16-NEXT: mov s0, v1.s[3] ; CHECK-NO16-NEXT: csel w12, w12, w11, gt -; CHECK-NO16-NEXT: cmp w13, w8 -; CHECK-NO16-NEXT: fcvtzs w16, s1 -; CHECK-NO16-NEXT: csel w13, w13, w8, lt +; CHECK-NO16-NEXT: cmp w13, w9 ; CHECK-NO16-NEXT: fmov s1, w10 +; CHECK-NO16-NEXT: csel w13, w13, w9, lt ; CHECK-NO16-NEXT: cmn w13, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w13, w13, w11, gt -; CHECK-NO16-NEXT: cmp w14, w8 -; CHECK-NO16-NEXT: csel w14, w14, w8, lt -; CHECK-NO16-NEXT: mov v1.s[1], w9 -; CHECK-NO16-NEXT: fcvtzs w9, s0 +; CHECK-NO16-NEXT: cmp w14, w9 +; CHECK-NO16-NEXT: mov v1.s[1], w8 +; CHECK-NO16-NEXT: csel w14, w14, w9, lt +; CHECK-NO16-NEXT: fcvtzs w8, s0 ; CHECK-NO16-NEXT: cmn w14, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w14, w14, w11, gt -; CHECK-NO16-NEXT: cmp w15, w8 -; CHECK-NO16-NEXT: csel w15, w15, w8, lt -; CHECK-NO16-NEXT: cmn w15, #8, lsl #12 // =32768 +; CHECK-NO16-NEXT: cmp w15, w9 +; CHECK-NO16-NEXT: csel w15, w15, w9, lt ; CHECK-NO16-NEXT: mov v1.s[2], w12 +; CHECK-NO16-NEXT: cmn w15, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w10, w15, w11, gt -; CHECK-NO16-NEXT: cmp w16, w8 +; CHECK-NO16-NEXT: cmp w16, w9 ; CHECK-NO16-NEXT: fmov s2, w10 -; CHECK-NO16-NEXT: csel w10, w16, w8, lt +; CHECK-NO16-NEXT: csel w10, w16, w9, lt ; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NO16-NEXT: csel w10, w10, w11, gt -; CHECK-NO16-NEXT: cmp w9, w8 ; CHECK-NO16-NEXT: mov v1.s[3], w13 +; CHECK-NO16-NEXT: csel w10, w10, w11, gt +; CHECK-NO16-NEXT: cmp w8, w9 ; CHECK-NO16-NEXT: mov v2.s[1], w14 -; CHECK-NO16-NEXT: csel w8, w9, w8, lt +; CHECK-NO16-NEXT: csel w8, w8, w9, lt ; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w8, w8, w11, gt ; CHECK-NO16-NEXT: mov v2.s[2], w10 diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll index b7a645bfb546fd..7d1a921358a8f4 100644 --- 
a/llvm/test/CodeGen/AArch64/fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fdiv.ll @@ -397,52 +397,52 @@ define <16 x half> @fdiv_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[1] ; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1] ; CHECK-SD-NOFP16-NEXT: fcvt s6, h0 -; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[2] -; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[3] -; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[4] -; CHECK-SD-NOFP16-NEXT: mov h18, v0.h[5] +; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5] ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6] -; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] ; CHECK-SD-NOFP16-NEXT: fcvt s20, h1 ; CHECK-SD-NOFP16-NEXT: mov h21, v1.h[2] ; CHECK-SD-NOFP16-NEXT: mov h22, v1.h[3] +; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[4] +; CHECK-SD-NOFP16-NEXT: mov h24, v1.h[5] +; CHECK-SD-NOFP16-NEXT: mov h25, v1.h[6] ; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 ; CHECK-SD-NOFP16-NEXT: fcvt s5, h5 -; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[4] -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 ; CHECK-SD-NOFP16-NEXT: fcvt s16, h16 -; CHECK-SD-NOFP16-NEXT: fcvt s17, h17 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 ; CHECK-SD-NOFP16-NEXT: fcvt s19, h19 -; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 ; CHECK-SD-NOFP16-NEXT: fcvt s21, h21 ; CHECK-SD-NOFP16-NEXT: fcvt s22, h22 -; CHECK-SD-NOFP16-NEXT: mov h24, v1.h[5] -; CHECK-SD-NOFP16-NEXT: fdiv s4, s5, s4 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h2 ; CHECK-SD-NOFP16-NEXT: fcvt s23, h23 -; CHECK-SD-NOFP16-NEXT: mov h25, v1.h[6] ; CHECK-SD-NOFP16-NEXT: fcvt s24, h24 ; CHECK-SD-NOFP16-NEXT: fcvt s25, h25 -; CHECK-SD-NOFP16-NEXT: fdiv s5, s6, s5 -; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[2] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s4 +; CHECK-SD-NOFP16-NEXT: fdiv s5, s5, s4 +; CHECK-SD-NOFP16-NEXT: fcvt s4, h2 +; CHECK-SD-NOFP16-NEXT: fdiv s17, s6, s4 +; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[2] +; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[2] +; CHECK-SD-NOFP16-NEXT: fcvt h5, s5 +; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 ; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 -; CHECK-SD-NOFP16-NEXT: fdiv s7, s7, s6 -; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[3] +; CHECK-SD-NOFP16-NEXT: fdiv s18, s6, s4 +; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[3] +; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 +; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 +; CHECK-SD-NOFP16-NEXT: fdiv s7, s6, s4 +; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[4] +; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[4] +; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] +; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 +; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fdiv s4, s6, s4 +; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[5] ; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 ; CHECK-SD-NOFP16-NEXT: fdiv s6, s16, s6 -; CHECK-SD-NOFP16-NEXT: mov h16, v2.h[4] -; CHECK-SD-NOFP16-NEXT: fcvt s16, h16 -; CHECK-SD-NOFP16-NEXT: fdiv s16, s17, s16 -; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[5] -; CHECK-SD-NOFP16-NEXT: fcvt s17, h17 -; CHECK-SD-NOFP16-NEXT: fdiv s17, s18, s17 -; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[6] +; CHECK-SD-NOFP16-NEXT: mov h16, v2.h[6] ; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7] -; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: fcvt s16, h16 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 -; CHECK-SD-NOFP16-NEXT: fdiv s18, s19, s18 +; CHECK-SD-NOFP16-NEXT: fdiv s16, s19, s16 ; CHECK-SD-NOFP16-NEXT: fdiv s19, s0, s2 ; CHECK-SD-NOFP16-NEXT: mov h0, v3.h[1] ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1] @@ -468,38 +468,38 @@ define <16 x half> @fdiv_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-SD-NOFP16-NEXT: mov h0, v3.h[6] ; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7] ; CHECK-SD-NOFP16-NEXT: fcvt s26, 
h0 -; CHECK-SD-NOFP16-NEXT: fcvt h0, s5 -; CHECK-SD-NOFP16-NEXT: fcvt h5, s2 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s17 +; CHECK-SD-NOFP16-NEXT: fcvt h17, s2 ; CHECK-SD-NOFP16-NEXT: fcvt h2, s20 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 -; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v4.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s7 -; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v5.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h5, s21 +; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v5.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h5, s18 +; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v17.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h17, s21 ; CHECK-SD-NOFP16-NEXT: fdiv s20, s25, s26 -; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v4.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s6 -; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v5.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h5, s22 -; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s23 -; CHECK-SD-NOFP16-NEXT: mov v2.h[3], v5.h[0] +; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v5.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h5, s7 +; CHECK-SD-NOFP16-NEXT: fcvt h7, s22 +; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v17.h[0] +; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v5.h[0] +; CHECK-SD-NOFP16-NEXT: mov v2.h[3], v7.h[0] ; CHECK-SD-NOFP16-NEXT: fdiv s1, s1, s3 -; CHECK-SD-NOFP16-NEXT: fcvt h3, s16 +; CHECK-SD-NOFP16-NEXT: fcvt h3, s4 +; CHECK-SD-NOFP16-NEXT: fcvt h4, s23 ; CHECK-SD-NOFP16-NEXT: mov v2.h[4], v4.h[0] ; CHECK-SD-NOFP16-NEXT: fcvt h4, s24 ; CHECK-SD-NOFP16-NEXT: mov v0.h[4], v3.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h3, s17 +; CHECK-SD-NOFP16-NEXT: fcvt h3, s6 ; CHECK-SD-NOFP16-NEXT: mov v2.h[5], v4.h[0] ; CHECK-SD-NOFP16-NEXT: fcvt h4, s20 ; CHECK-SD-NOFP16-NEXT: mov v0.h[5], v3.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h3, s18 +; CHECK-SD-NOFP16-NEXT: fcvt h3, s16 ; CHECK-SD-NOFP16-NEXT: mov v2.h[6], v4.h[0] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 ; CHECK-SD-NOFP16-NEXT: mov v0.h[6], v3.h[0] ; CHECK-SD-NOFP16-NEXT: fcvt h3, s19 -; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 -; CHECK-SD-NOFP16-NEXT: mov v0.h[7], v3.h[0] ; CHECK-SD-NOFP16-NEXT: mov v2.h[7], v1.h[0] +; CHECK-SD-NOFP16-NEXT: mov v0.h[7], v3.h[0] ; CHECK-SD-NOFP16-NEXT: mov v1.16b, v2.16b ; CHECK-SD-NOFP16-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll index c2d6a1f5de1918..873c25a074130e 100644 --- a/llvm/test/CodeGen/AArch64/fexplog.ll +++ b/llvm/test/CodeGen/AArch64/fexplog.ll @@ -203,7 +203,7 @@ define <4 x double> @exp_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: .cfi_offset w30, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 ; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov d8, v0.d[1] ; CHECK-GI-NEXT: mov d9, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -211,21 +211,21 @@ define <4 x double> @exp_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl exp -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: bl exp -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl exp -; CHECK-GI-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp, #16] // 32-byte 
Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -241,14 +241,14 @@ define <2 x float> @exp_v2f32(<2 x float> %a) { ; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-SD-NEXT: mov s0, v0.s[1] ; CHECK-SD-NEXT: bl expf -; CHECK-SD-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: bl expf -; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] @@ -510,16 +510,16 @@ define <8 x float> @exp_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: mov s12, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s8 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s9 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: bl expf @@ -532,7 +532,7 @@ define <8 x float> @exp_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s13 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload @@ -541,11 +541,11 @@ define <8 x float> @exp_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded 
Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -790,8 +790,7 @@ define <4 x half> @exp_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -1111,7 +1110,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov v2.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h1, v1.h[2] ; CHECK-GI-NEXT: mov h15, v0.h[1] @@ -1122,57 +1121,57 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h12, v0.h[6] ; CHECK-GI-NEXT: mov h13, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h1, [sp, #16] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[3] -; CHECK-GI-NEXT: str h1, [sp, #32] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[4] -; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #128] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[5] -; CHECK-GI-NEXT: str h1, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] -; CHECK-GI-NEXT: str h1, [sp, #96] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #192] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[7] -; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #224] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte 
Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h14 @@ -1180,77 +1179,79 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #192] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #224] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload ; 
CHECK-GI-NEXT: ldp d11, d10, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v3.16b +; CHECK-GI-NEXT: ldr q0, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #320 ; CHECK-GI-NEXT: ret entry: @@ -1459,7 +1460,7 @@ define <4 x double> @exp2_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: .cfi_offset w30, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 ; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov d8, v0.d[1] ; CHECK-GI-NEXT: mov d9, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -1467,21 +1468,21 @@ define <4 x double> @exp2_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl exp2 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: bl exp2 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl exp2 -; CHECK-GI-NEXT: ldp q1, q2, [sp, #16] // 32-byte 
Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -1497,14 +1498,14 @@ define <2 x float> @exp2_v2f32(<2 x float> %a) { ; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-SD-NEXT: mov s0, v0.s[1] ; CHECK-SD-NEXT: bl exp2f -; CHECK-SD-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: bl exp2f -; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] @@ -1766,16 +1767,16 @@ define <8 x float> @exp2_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: mov s12, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s8 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s9 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: bl exp2f @@ -1788,7 +1789,7 @@ define <8 x float> @exp2_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s13 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload @@ -1797,11 +1798,11 @@ define <8 x float> @exp2_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; 
CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -2046,8 +2047,7 @@ define <4 x half> @exp2_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -2367,7 +2367,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov v2.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h1, v1.h[2] ; CHECK-GI-NEXT: mov h15, v0.h[1] @@ -2378,57 +2378,57 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h12, v0.h[6] ; CHECK-GI-NEXT: mov h13, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h1, [sp, #16] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[3] -; CHECK-GI-NEXT: str h1, [sp, #32] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[4] -; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #128] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[5] -; CHECK-GI-NEXT: str h1, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] -; CHECK-GI-NEXT: str h1, [sp, #96] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #192] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[7] -; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #224] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; 
CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h14 @@ -2436,77 +2436,79 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #192] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #224] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #288] // 16-byte 
Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v3.16b +; CHECK-GI-NEXT: ldr q0, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #320 ; CHECK-GI-NEXT: ret entry: @@ -2715,7 +2717,7 @@ define <4 x double> @log_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: .cfi_offset w30, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 ; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov d8, v0.d[1] ; CHECK-GI-NEXT: mov d9, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -2723,21 +2725,21 @@ define <4 x double> @log_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl log -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: bl log -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded 
Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl log -; CHECK-GI-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -2753,14 +2755,14 @@ define <2 x float> @log_v2f32(<2 x float> %a) { ; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-SD-NEXT: mov s0, v0.s[1] ; CHECK-SD-NEXT: bl logf -; CHECK-SD-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: bl logf -; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] @@ -3022,16 +3024,16 @@ define <8 x float> @log_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: mov s12, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s8 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s9 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: bl logf @@ -3044,7 +3046,7 @@ define <8 x float> @log_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s13 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload @@ -3053,11 +3055,11 @@ define <8 x float> @log_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; 
CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -3302,8 +3304,7 @@ define <4 x half> @log_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -3623,7 +3624,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov v2.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h1, v1.h[2] ; CHECK-GI-NEXT: mov h15, v0.h[1] @@ -3634,57 +3635,57 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h12, v0.h[6] ; CHECK-GI-NEXT: mov h13, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h1, [sp, #16] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[3] -; CHECK-GI-NEXT: str h1, [sp, #32] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[4] -; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #128] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[5] -; CHECK-GI-NEXT: str h1, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] -; CHECK-GI-NEXT: str h1, [sp, #96] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #192] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[7] -; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #224] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 
16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h14 @@ -3692,77 +3693,79 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #192] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #224] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte 
Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v3.16b +; CHECK-GI-NEXT: ldr q0, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #320 ; CHECK-GI-NEXT: ret entry: @@ -3971,7 +3974,7 @@ define <4 x double> @log2_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: .cfi_offset w30, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 ; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov d8, v0.d[1] ; CHECK-GI-NEXT: mov d9, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -3979,21 +3982,21 @@ define <4 x double> @log2_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl log2 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: bl log2 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte 
Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl log2 -; CHECK-GI-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -4009,14 +4012,14 @@ define <2 x float> @log2_v2f32(<2 x float> %a) { ; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-SD-NEXT: mov s0, v0.s[1] ; CHECK-SD-NEXT: bl log2f -; CHECK-SD-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: bl log2f -; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] @@ -4278,16 +4281,16 @@ define <8 x float> @log2_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: mov s12, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s8 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s9 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: bl log2f @@ -4300,7 +4303,7 @@ define <8 x float> @log2_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s13 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload @@ -4309,11 +4312,11 @@ define <8 x float> @log2_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte 
Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -4558,8 +4561,7 @@ define <4 x half> @log2_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -4879,7 +4881,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov v2.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h1, v1.h[2] ; CHECK-GI-NEXT: mov h15, v0.h[1] @@ -4890,57 +4892,57 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h12, v0.h[6] ; CHECK-GI-NEXT: mov h13, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h1, [sp, #16] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[3] -; CHECK-GI-NEXT: str h1, [sp, #32] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[4] -; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #128] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[5] -; CHECK-GI-NEXT: str h1, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] -; CHECK-GI-NEXT: str h1, [sp, #96] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #192] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[7] -; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #224] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; 
CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h14 @@ -4948,77 +4950,79 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #192] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #224] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q1, q2, [sp] // 
32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v3.16b +; CHECK-GI-NEXT: ldr q0, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #320 ; CHECK-GI-NEXT: ret entry: @@ -5227,7 +5231,7 @@ define <4 x double> @log10_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: .cfi_offset w30, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 ; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov d8, v0.d[1] ; CHECK-GI-NEXT: mov d9, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -5235,21 +5239,21 @@ define <4 x double> @log10_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl log10 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; 
CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: bl log10 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl log10 -; CHECK-GI-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -5265,14 +5269,14 @@ define <2 x float> @log10_v2f32(<2 x float> %a) { ; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-SD-NEXT: mov s0, v0.s[1] ; CHECK-SD-NEXT: bl log10f -; CHECK-SD-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: bl log10f -; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] @@ -5534,16 +5538,16 @@ define <8 x float> @log10_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: mov s12, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s8 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s9 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: bl log10f @@ -5556,7 +5560,7 @@ define <8 x float> @log10_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s13 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload @@ -5565,11 +5569,11 @@ define <8 x float> @log10_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: 
ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -5814,8 +5818,7 @@ define <4 x half> @log10_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -6135,7 +6138,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov v2.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h1, v1.h[2] ; CHECK-GI-NEXT: mov h15, v0.h[1] @@ -6146,57 +6149,57 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h12, v0.h[6] ; CHECK-GI-NEXT: mov h13, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h1, [sp, #16] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[3] -; CHECK-GI-NEXT: str h1, [sp, #32] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[4] -; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #128] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[5] -; CHECK-GI-NEXT: str h1, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] -; CHECK-GI-NEXT: str h1, [sp, #96] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #192] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[7] -; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #224] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #96] // 
16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h14 @@ -6204,77 +6207,79 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #192] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #224] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp] // 16-byte Folded 
Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v3.16b +; CHECK-GI-NEXT: ldr q0, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #320 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll index 217e4e40a77948..8f29d5a1b93421 100644 --- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll +++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll @@ -991,113 +991,113 @@ entry: define <16 x half> @min_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-NOFP16-SD-LABEL: min_v16f16: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: mov h6, v2.h[1] -; CHECK-NOFP16-SD-NEXT: mov h7, v0.h[1] -; CHECK-NOFP16-SD-NEXT: fcvt s4, h2 -; CHECK-NOFP16-SD-NEXT: fcvt s5, h0 +; CHECK-NOFP16-SD-NEXT: mov v4.16b, v0.16b +; CHECK-NOFP16-SD-NEXT: mov h0, v2.h[1] +; CHECK-NOFP16-SD-NEXT: fcvt s6, h2 ; 
CHECK-NOFP16-SD-NEXT: mov h16, v3.h[1] ; CHECK-NOFP16-SD-NEXT: mov h17, v1.h[1] ; CHECK-NOFP16-SD-NEXT: mov h18, v2.h[2] -; CHECK-NOFP16-SD-NEXT: mov h19, v0.h[2] ; CHECK-NOFP16-SD-NEXT: fcvt s20, h3 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h1 ; CHECK-NOFP16-SD-NEXT: mov h22, v3.h[2] +; CHECK-NOFP16-SD-NEXT: mov h5, v4.h[1] +; CHECK-NOFP16-SD-NEXT: fcvt s7, h4 +; CHECK-NOFP16-SD-NEXT: mov h19, v4.h[2] +; CHECK-NOFP16-SD-NEXT: fcvt s0, h0 ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[2] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h6 -; CHECK-NOFP16-SD-NEXT: fcvt s7, h7 -; CHECK-NOFP16-SD-NEXT: mov h24, v0.h[6] -; CHECK-NOFP16-SD-NEXT: fmin s4, s5, s4 -; CHECK-NOFP16-SD-NEXT: fcvt s5, h16 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h17 -; CHECK-NOFP16-SD-NEXT: fcvt s17, h18 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h19 -; CHECK-NOFP16-SD-NEXT: mov h19, v0.h[3] +; CHECK-NOFP16-SD-NEXT: mov h24, v4.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: mov h25, v1.h[6] ; CHECK-NOFP16-SD-NEXT: fmin s20, s21, s20 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h22 ; CHECK-NOFP16-SD-NEXT: mov h22, v3.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s5, h5 ; CHECK-NOFP16-SD-NEXT: fmin s6, s7, s6 -; CHECK-NOFP16-SD-NEXT: mov h7, v2.h[3] -; CHECK-NOFP16-SD-NEXT: mov h25, v1.h[6] -; CHECK-NOFP16-SD-NEXT: fcvt h4, s4 -; CHECK-NOFP16-SD-NEXT: fmin s5, s16, s5 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h23 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h16 +; CHECK-NOFP16-SD-NEXT: fcvt s16, h17 +; CHECK-NOFP16-SD-NEXT: mov h17, v2.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 +; CHECK-NOFP16-SD-NEXT: fmin s5, s5, s0 +; CHECK-NOFP16-SD-NEXT: fcvt h0, s6 +; CHECK-NOFP16-SD-NEXT: fmin s6, s16, s7 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h23 ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[3] -; CHECK-NOFP16-SD-NEXT: fmin s17, s18, s17 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h19 -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: fcvt s7, h7 -; CHECK-NOFP16-SD-NEXT: fcvt h19, s5 +; CHECK-NOFP16-SD-NEXT: fmin s18, s19, s18 +; CHECK-NOFP16-SD-NEXT: fcvt s17, h17 +; CHECK-NOFP16-SD-NEXT: fcvt s19, h24 +; CHECK-NOFP16-SD-NEXT: mov h24, v4.h[6] +; CHECK-NOFP16-SD-NEXT: fcvt h16, s5 ; CHECK-NOFP16-SD-NEXT: fcvt h5, s20 -; CHECK-NOFP16-SD-NEXT: fmin s16, s16, s21 +; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 +; CHECK-NOFP16-SD-NEXT: fmin s7, s7, s21 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h23 -; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 +; CHECK-NOFP16-SD-NEXT: fcvt h18, s18 ; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[4] +; CHECK-NOFP16-SD-NEXT: fmin s17, s19, s17 +; CHECK-NOFP16-SD-NEXT: mov h19, v3.h[4] ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[4] -; CHECK-NOFP16-SD-NEXT: mov v4.h[1], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h22 -; CHECK-NOFP16-SD-NEXT: mov h22, v0.h[4] -; CHECK-NOFP16-SD-NEXT: fmin s7, s18, s7 -; CHECK-NOFP16-SD-NEXT: mov h18, v3.h[4] -; CHECK-NOFP16-SD-NEXT: mov v5.h[1], v19.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h16, s16 -; CHECK-NOFP16-SD-NEXT: fmin s6, s20, s6 -; CHECK-NOFP16-SD-NEXT: mov v4.h[2], v17.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s17, h21 -; CHECK-NOFP16-SD-NEXT: fcvt s19, h22 -; CHECK-NOFP16-SD-NEXT: fcvt h7, s7 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: mov v0.h[1], v16.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt s16, h22 +; CHECK-NOFP16-SD-NEXT: mov h22, v4.h[4] +; CHECK-NOFP16-SD-NEXT: mov v5.h[1], v6.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s7 +; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 +; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 +; CHECK-NOFP16-SD-NEXT: fmin s7, s20, s16 +; CHECK-NOFP16-SD-NEXT: mov v0.h[2], v18.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt s16, h21 +; CHECK-NOFP16-SD-NEXT: fcvt s18, h22 ; 
CHECK-NOFP16-SD-NEXT: fcvt s20, h23 -; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[5] -; CHECK-NOFP16-SD-NEXT: mov h22, v0.h[5] -; CHECK-NOFP16-SD-NEXT: mov v5.h[2], v16.h[0] -; CHECK-NOFP16-SD-NEXT: mov h16, v3.h[5] +; CHECK-NOFP16-SD-NEXT: mov v5.h[2], v6.h[0] ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[5] -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: mov h0, v0.h[7] +; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[5] +; CHECK-NOFP16-SD-NEXT: mov h22, v4.h[5] ; CHECK-NOFP16-SD-NEXT: mov h1, v1.h[7] -; CHECK-NOFP16-SD-NEXT: fmin s17, s19, s17 -; CHECK-NOFP16-SD-NEXT: mov h19, v2.h[6] -; CHECK-NOFP16-SD-NEXT: mov v4.h[3], v7.h[0] -; CHECK-NOFP16-SD-NEXT: fmin s18, s20, s18 +; CHECK-NOFP16-SD-NEXT: mov h4, v4.h[7] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s7 +; CHECK-NOFP16-SD-NEXT: mov h7, v3.h[5] +; CHECK-NOFP16-SD-NEXT: mov v0.h[3], v17.h[0] +; CHECK-NOFP16-SD-NEXT: fmin s16, s18, s16 +; CHECK-NOFP16-SD-NEXT: fmin s19, s20, s19 ; CHECK-NOFP16-SD-NEXT: mov h20, v3.h[6] -; CHECK-NOFP16-SD-NEXT: fcvt s7, h21 +; CHECK-NOFP16-SD-NEXT: mov h18, v2.h[6] +; CHECK-NOFP16-SD-NEXT: fcvt s17, h21 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h22 +; CHECK-NOFP16-SD-NEXT: mov h3, v3.h[7] ; CHECK-NOFP16-SD-NEXT: fcvt s22, h24 ; CHECK-NOFP16-SD-NEXT: mov h2, v2.h[7] ; CHECK-NOFP16-SD-NEXT: mov v5.h[3], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h16 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h23 -; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 -; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 -; CHECK-NOFP16-SD-NEXT: fcvt s23, h25 -; CHECK-NOFP16-SD-NEXT: fcvt h18, s18 +; CHECK-NOFP16-SD-NEXT: fcvt s6, h7 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h23 +; CHECK-NOFP16-SD-NEXT: fcvt h16, s16 +; CHECK-NOFP16-SD-NEXT: fcvt h19, s19 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h20 -; CHECK-NOFP16-SD-NEXT: mov h3, v3.h[7] -; CHECK-NOFP16-SD-NEXT: fmin s7, s21, s7 -; CHECK-NOFP16-SD-NEXT: fcvt s2, h2 -; CHECK-NOFP16-SD-NEXT: fcvt s0, h0 -; CHECK-NOFP16-SD-NEXT: fmin s6, s16, s6 -; CHECK-NOFP16-SD-NEXT: fcvt s1, h1 -; CHECK-NOFP16-SD-NEXT: mov v4.h[4], v17.h[0] -; CHECK-NOFP16-SD-NEXT: fmin s16, s22, s19 -; CHECK-NOFP16-SD-NEXT: mov v5.h[4], v18.h[0] -; CHECK-NOFP16-SD-NEXT: fmin s17, s23, s20 +; CHECK-NOFP16-SD-NEXT: fcvt s23, h25 +; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: fmin s17, s21, s17 ; CHECK-NOFP16-SD-NEXT: fcvt s3, h3 -; CHECK-NOFP16-SD-NEXT: fcvt h7, s7 -; CHECK-NOFP16-SD-NEXT: fmin s0, s0, s2 -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: fcvt h2, s16 +; CHECK-NOFP16-SD-NEXT: fcvt s1, h1 +; CHECK-NOFP16-SD-NEXT: fcvt s2, h2 +; CHECK-NOFP16-SD-NEXT: fmin s6, s7, s6 +; CHECK-NOFP16-SD-NEXT: fcvt s4, h4 +; CHECK-NOFP16-SD-NEXT: mov v0.h[4], v16.h[0] +; CHECK-NOFP16-SD-NEXT: mov v5.h[4], v19.h[0] +; CHECK-NOFP16-SD-NEXT: fmin s16, s23, s20 +; CHECK-NOFP16-SD-NEXT: fmin s7, s22, s18 +; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 ; CHECK-NOFP16-SD-NEXT: fmin s1, s1, s3 -; CHECK-NOFP16-SD-NEXT: mov v4.h[5], v7.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h0, s0 -; CHECK-NOFP16-SD-NEXT: mov v5.h[5], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h6, s17 +; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 +; CHECK-NOFP16-SD-NEXT: fmin s2, s4, s2 +; CHECK-NOFP16-SD-NEXT: mov v0.h[5], v17.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h4, s7 ; CHECK-NOFP16-SD-NEXT: fcvt h1, s1 -; CHECK-NOFP16-SD-NEXT: mov v4.h[6], v2.h[0] +; CHECK-NOFP16-SD-NEXT: mov v5.h[5], v6.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s16 +; CHECK-NOFP16-SD-NEXT: fcvt h2, s2 +; CHECK-NOFP16-SD-NEXT: mov v0.h[6], v4.h[0] ; CHECK-NOFP16-SD-NEXT: mov v5.h[6], v6.h[0] -; CHECK-NOFP16-SD-NEXT: mov v4.h[7], v0.h[0] +; CHECK-NOFP16-SD-NEXT: mov 
v0.h[7], v2.h[0] ; CHECK-NOFP16-SD-NEXT: mov v5.h[7], v1.h[0] -; CHECK-NOFP16-SD-NEXT: mov v0.16b, v4.16b ; CHECK-NOFP16-SD-NEXT: mov v1.16b, v5.16b ; CHECK-NOFP16-SD-NEXT: ret ; @@ -1140,113 +1140,113 @@ entry: define <16 x half> @max_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-NOFP16-SD-LABEL: max_v16f16: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: mov h6, v2.h[1] -; CHECK-NOFP16-SD-NEXT: mov h7, v0.h[1] -; CHECK-NOFP16-SD-NEXT: fcvt s4, h2 -; CHECK-NOFP16-SD-NEXT: fcvt s5, h0 +; CHECK-NOFP16-SD-NEXT: mov v4.16b, v0.16b +; CHECK-NOFP16-SD-NEXT: mov h0, v2.h[1] +; CHECK-NOFP16-SD-NEXT: fcvt s6, h2 ; CHECK-NOFP16-SD-NEXT: mov h16, v3.h[1] ; CHECK-NOFP16-SD-NEXT: mov h17, v1.h[1] ; CHECK-NOFP16-SD-NEXT: mov h18, v2.h[2] -; CHECK-NOFP16-SD-NEXT: mov h19, v0.h[2] ; CHECK-NOFP16-SD-NEXT: fcvt s20, h3 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h1 ; CHECK-NOFP16-SD-NEXT: mov h22, v3.h[2] +; CHECK-NOFP16-SD-NEXT: mov h5, v4.h[1] +; CHECK-NOFP16-SD-NEXT: fcvt s7, h4 +; CHECK-NOFP16-SD-NEXT: mov h19, v4.h[2] +; CHECK-NOFP16-SD-NEXT: fcvt s0, h0 ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[2] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h6 -; CHECK-NOFP16-SD-NEXT: fcvt s7, h7 -; CHECK-NOFP16-SD-NEXT: mov h24, v0.h[6] -; CHECK-NOFP16-SD-NEXT: fmax s4, s5, s4 -; CHECK-NOFP16-SD-NEXT: fcvt s5, h16 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h17 -; CHECK-NOFP16-SD-NEXT: fcvt s17, h18 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h19 -; CHECK-NOFP16-SD-NEXT: mov h19, v0.h[3] +; CHECK-NOFP16-SD-NEXT: mov h24, v4.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: mov h25, v1.h[6] ; CHECK-NOFP16-SD-NEXT: fmax s20, s21, s20 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h22 ; CHECK-NOFP16-SD-NEXT: mov h22, v3.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s5, h5 ; CHECK-NOFP16-SD-NEXT: fmax s6, s7, s6 -; CHECK-NOFP16-SD-NEXT: mov h7, v2.h[3] -; CHECK-NOFP16-SD-NEXT: mov h25, v1.h[6] -; CHECK-NOFP16-SD-NEXT: fcvt h4, s4 -; CHECK-NOFP16-SD-NEXT: fmax s5, s16, s5 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h23 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h16 +; CHECK-NOFP16-SD-NEXT: fcvt s16, h17 +; CHECK-NOFP16-SD-NEXT: mov h17, v2.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 +; CHECK-NOFP16-SD-NEXT: fmax s5, s5, s0 +; CHECK-NOFP16-SD-NEXT: fcvt h0, s6 +; CHECK-NOFP16-SD-NEXT: fmax s6, s16, s7 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h23 ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[3] -; CHECK-NOFP16-SD-NEXT: fmax s17, s18, s17 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h19 -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: fcvt s7, h7 -; CHECK-NOFP16-SD-NEXT: fcvt h19, s5 +; CHECK-NOFP16-SD-NEXT: fmax s18, s19, s18 +; CHECK-NOFP16-SD-NEXT: fcvt s17, h17 +; CHECK-NOFP16-SD-NEXT: fcvt s19, h24 +; CHECK-NOFP16-SD-NEXT: mov h24, v4.h[6] +; CHECK-NOFP16-SD-NEXT: fcvt h16, s5 ; CHECK-NOFP16-SD-NEXT: fcvt h5, s20 -; CHECK-NOFP16-SD-NEXT: fmax s16, s16, s21 +; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 +; CHECK-NOFP16-SD-NEXT: fmax s7, s7, s21 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h23 -; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 +; CHECK-NOFP16-SD-NEXT: fcvt h18, s18 ; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[4] +; CHECK-NOFP16-SD-NEXT: fmax s17, s19, s17 +; CHECK-NOFP16-SD-NEXT: mov h19, v3.h[4] ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[4] -; CHECK-NOFP16-SD-NEXT: mov v4.h[1], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h22 -; CHECK-NOFP16-SD-NEXT: mov h22, v0.h[4] -; CHECK-NOFP16-SD-NEXT: fmax s7, s18, s7 -; CHECK-NOFP16-SD-NEXT: mov h18, v3.h[4] -; CHECK-NOFP16-SD-NEXT: mov v5.h[1], v19.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h16, s16 -; CHECK-NOFP16-SD-NEXT: fmax s6, s20, s6 -; CHECK-NOFP16-SD-NEXT: mov 
v4.h[2], v17.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s17, h21 -; CHECK-NOFP16-SD-NEXT: fcvt s19, h22 -; CHECK-NOFP16-SD-NEXT: fcvt h7, s7 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: mov v0.h[1], v16.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt s16, h22 +; CHECK-NOFP16-SD-NEXT: mov h22, v4.h[4] +; CHECK-NOFP16-SD-NEXT: mov v5.h[1], v6.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s7 +; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 +; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 +; CHECK-NOFP16-SD-NEXT: fmax s7, s20, s16 +; CHECK-NOFP16-SD-NEXT: mov v0.h[2], v18.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt s16, h21 +; CHECK-NOFP16-SD-NEXT: fcvt s18, h22 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h23 -; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[5] -; CHECK-NOFP16-SD-NEXT: mov h22, v0.h[5] -; CHECK-NOFP16-SD-NEXT: mov v5.h[2], v16.h[0] -; CHECK-NOFP16-SD-NEXT: mov h16, v3.h[5] +; CHECK-NOFP16-SD-NEXT: mov v5.h[2], v6.h[0] ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[5] -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: mov h0, v0.h[7] +; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[5] +; CHECK-NOFP16-SD-NEXT: mov h22, v4.h[5] ; CHECK-NOFP16-SD-NEXT: mov h1, v1.h[7] -; CHECK-NOFP16-SD-NEXT: fmax s17, s19, s17 -; CHECK-NOFP16-SD-NEXT: mov h19, v2.h[6] -; CHECK-NOFP16-SD-NEXT: mov v4.h[3], v7.h[0] -; CHECK-NOFP16-SD-NEXT: fmax s18, s20, s18 +; CHECK-NOFP16-SD-NEXT: mov h4, v4.h[7] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s7 +; CHECK-NOFP16-SD-NEXT: mov h7, v3.h[5] +; CHECK-NOFP16-SD-NEXT: mov v0.h[3], v17.h[0] +; CHECK-NOFP16-SD-NEXT: fmax s16, s18, s16 +; CHECK-NOFP16-SD-NEXT: fmax s19, s20, s19 ; CHECK-NOFP16-SD-NEXT: mov h20, v3.h[6] -; CHECK-NOFP16-SD-NEXT: fcvt s7, h21 +; CHECK-NOFP16-SD-NEXT: mov h18, v2.h[6] +; CHECK-NOFP16-SD-NEXT: fcvt s17, h21 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h22 +; CHECK-NOFP16-SD-NEXT: mov h3, v3.h[7] ; CHECK-NOFP16-SD-NEXT: fcvt s22, h24 ; CHECK-NOFP16-SD-NEXT: mov h2, v2.h[7] ; CHECK-NOFP16-SD-NEXT: mov v5.h[3], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h16 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h23 -; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 -; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 -; CHECK-NOFP16-SD-NEXT: fcvt s23, h25 -; CHECK-NOFP16-SD-NEXT: fcvt h18, s18 +; CHECK-NOFP16-SD-NEXT: fcvt s6, h7 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h23 +; CHECK-NOFP16-SD-NEXT: fcvt h16, s16 +; CHECK-NOFP16-SD-NEXT: fcvt h19, s19 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h20 -; CHECK-NOFP16-SD-NEXT: mov h3, v3.h[7] -; CHECK-NOFP16-SD-NEXT: fmax s7, s21, s7 -; CHECK-NOFP16-SD-NEXT: fcvt s2, h2 -; CHECK-NOFP16-SD-NEXT: fcvt s0, h0 -; CHECK-NOFP16-SD-NEXT: fmax s6, s16, s6 -; CHECK-NOFP16-SD-NEXT: fcvt s1, h1 -; CHECK-NOFP16-SD-NEXT: mov v4.h[4], v17.h[0] -; CHECK-NOFP16-SD-NEXT: fmax s16, s22, s19 -; CHECK-NOFP16-SD-NEXT: mov v5.h[4], v18.h[0] -; CHECK-NOFP16-SD-NEXT: fmax s17, s23, s20 +; CHECK-NOFP16-SD-NEXT: fcvt s23, h25 +; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: fmax s17, s21, s17 ; CHECK-NOFP16-SD-NEXT: fcvt s3, h3 -; CHECK-NOFP16-SD-NEXT: fcvt h7, s7 -; CHECK-NOFP16-SD-NEXT: fmax s0, s0, s2 -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: fcvt h2, s16 +; CHECK-NOFP16-SD-NEXT: fcvt s1, h1 +; CHECK-NOFP16-SD-NEXT: fcvt s2, h2 +; CHECK-NOFP16-SD-NEXT: fmax s6, s7, s6 +; CHECK-NOFP16-SD-NEXT: fcvt s4, h4 +; CHECK-NOFP16-SD-NEXT: mov v0.h[4], v16.h[0] +; CHECK-NOFP16-SD-NEXT: mov v5.h[4], v19.h[0] +; CHECK-NOFP16-SD-NEXT: fmax s16, s23, s20 +; CHECK-NOFP16-SD-NEXT: fmax s7, s22, s18 +; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 ; CHECK-NOFP16-SD-NEXT: fmax s1, s1, s3 -; CHECK-NOFP16-SD-NEXT: mov v4.h[5], v7.h[0] -; 
CHECK-NOFP16-SD-NEXT: fcvt h0, s0 -; CHECK-NOFP16-SD-NEXT: mov v5.h[5], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h6, s17 +; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 +; CHECK-NOFP16-SD-NEXT: fmax s2, s4, s2 +; CHECK-NOFP16-SD-NEXT: mov v0.h[5], v17.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h4, s7 ; CHECK-NOFP16-SD-NEXT: fcvt h1, s1 -; CHECK-NOFP16-SD-NEXT: mov v4.h[6], v2.h[0] +; CHECK-NOFP16-SD-NEXT: mov v5.h[5], v6.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s16 +; CHECK-NOFP16-SD-NEXT: fcvt h2, s2 +; CHECK-NOFP16-SD-NEXT: mov v0.h[6], v4.h[0] ; CHECK-NOFP16-SD-NEXT: mov v5.h[6], v6.h[0] -; CHECK-NOFP16-SD-NEXT: mov v4.h[7], v0.h[0] +; CHECK-NOFP16-SD-NEXT: mov v0.h[7], v2.h[0] ; CHECK-NOFP16-SD-NEXT: mov v5.h[7], v1.h[0] -; CHECK-NOFP16-SD-NEXT: mov v0.16b, v4.16b ; CHECK-NOFP16-SD-NEXT: mov v1.16b, v5.16b ; CHECK-NOFP16-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll index 1b92c462af144e..a31a8effdfecf1 100644 --- a/llvm/test/CodeGen/AArch64/fminmax.ll +++ b/llvm/test/CodeGen/AArch64/fminmax.ll @@ -991,113 +991,113 @@ entry: define <16 x half> @min_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-NOFP16-SD-LABEL: min_v16f16: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: mov h6, v2.h[1] -; CHECK-NOFP16-SD-NEXT: mov h7, v0.h[1] -; CHECK-NOFP16-SD-NEXT: fcvt s4, h2 -; CHECK-NOFP16-SD-NEXT: fcvt s5, h0 +; CHECK-NOFP16-SD-NEXT: mov v4.16b, v0.16b +; CHECK-NOFP16-SD-NEXT: mov h0, v2.h[1] +; CHECK-NOFP16-SD-NEXT: fcvt s6, h2 ; CHECK-NOFP16-SD-NEXT: mov h16, v3.h[1] ; CHECK-NOFP16-SD-NEXT: mov h17, v1.h[1] ; CHECK-NOFP16-SD-NEXT: mov h18, v2.h[2] -; CHECK-NOFP16-SD-NEXT: mov h19, v0.h[2] ; CHECK-NOFP16-SD-NEXT: fcvt s20, h3 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h1 ; CHECK-NOFP16-SD-NEXT: mov h22, v3.h[2] +; CHECK-NOFP16-SD-NEXT: mov h5, v4.h[1] +; CHECK-NOFP16-SD-NEXT: fcvt s7, h4 +; CHECK-NOFP16-SD-NEXT: mov h19, v4.h[2] +; CHECK-NOFP16-SD-NEXT: fcvt s0, h0 ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[2] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h6 -; CHECK-NOFP16-SD-NEXT: fcvt s7, h7 -; CHECK-NOFP16-SD-NEXT: mov h24, v0.h[6] -; CHECK-NOFP16-SD-NEXT: fminnm s4, s5, s4 -; CHECK-NOFP16-SD-NEXT: fcvt s5, h16 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h17 -; CHECK-NOFP16-SD-NEXT: fcvt s17, h18 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h19 -; CHECK-NOFP16-SD-NEXT: mov h19, v0.h[3] +; CHECK-NOFP16-SD-NEXT: mov h24, v4.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: mov h25, v1.h[6] ; CHECK-NOFP16-SD-NEXT: fminnm s20, s21, s20 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h22 ; CHECK-NOFP16-SD-NEXT: mov h22, v3.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s5, h5 ; CHECK-NOFP16-SD-NEXT: fminnm s6, s7, s6 -; CHECK-NOFP16-SD-NEXT: mov h7, v2.h[3] -; CHECK-NOFP16-SD-NEXT: mov h25, v1.h[6] -; CHECK-NOFP16-SD-NEXT: fcvt h4, s4 -; CHECK-NOFP16-SD-NEXT: fminnm s5, s16, s5 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h23 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h16 +; CHECK-NOFP16-SD-NEXT: fcvt s16, h17 +; CHECK-NOFP16-SD-NEXT: mov h17, v2.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 +; CHECK-NOFP16-SD-NEXT: fminnm s5, s5, s0 +; CHECK-NOFP16-SD-NEXT: fcvt h0, s6 +; CHECK-NOFP16-SD-NEXT: fminnm s6, s16, s7 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h23 ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[3] -; CHECK-NOFP16-SD-NEXT: fminnm s17, s18, s17 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h19 -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: fcvt s7, h7 -; CHECK-NOFP16-SD-NEXT: fcvt h19, s5 +; CHECK-NOFP16-SD-NEXT: fminnm s18, s19, s18 +; CHECK-NOFP16-SD-NEXT: fcvt s17, h17 +; CHECK-NOFP16-SD-NEXT: fcvt s19, h24 +; 
CHECK-NOFP16-SD-NEXT: mov h24, v4.h[6] +; CHECK-NOFP16-SD-NEXT: fcvt h16, s5 ; CHECK-NOFP16-SD-NEXT: fcvt h5, s20 -; CHECK-NOFP16-SD-NEXT: fminnm s16, s16, s21 +; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 +; CHECK-NOFP16-SD-NEXT: fminnm s7, s7, s21 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h23 -; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 +; CHECK-NOFP16-SD-NEXT: fcvt h18, s18 ; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[4] +; CHECK-NOFP16-SD-NEXT: fminnm s17, s19, s17 +; CHECK-NOFP16-SD-NEXT: mov h19, v3.h[4] ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[4] -; CHECK-NOFP16-SD-NEXT: mov v4.h[1], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h22 -; CHECK-NOFP16-SD-NEXT: mov h22, v0.h[4] -; CHECK-NOFP16-SD-NEXT: fminnm s7, s18, s7 -; CHECK-NOFP16-SD-NEXT: mov h18, v3.h[4] -; CHECK-NOFP16-SD-NEXT: mov v5.h[1], v19.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h16, s16 -; CHECK-NOFP16-SD-NEXT: fminnm s6, s20, s6 -; CHECK-NOFP16-SD-NEXT: mov v4.h[2], v17.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s17, h21 -; CHECK-NOFP16-SD-NEXT: fcvt s19, h22 -; CHECK-NOFP16-SD-NEXT: fcvt h7, s7 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: mov v0.h[1], v16.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt s16, h22 +; CHECK-NOFP16-SD-NEXT: mov h22, v4.h[4] +; CHECK-NOFP16-SD-NEXT: mov v5.h[1], v6.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s7 +; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 +; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 +; CHECK-NOFP16-SD-NEXT: fminnm s7, s20, s16 +; CHECK-NOFP16-SD-NEXT: mov v0.h[2], v18.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt s16, h21 +; CHECK-NOFP16-SD-NEXT: fcvt s18, h22 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h23 -; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[5] -; CHECK-NOFP16-SD-NEXT: mov h22, v0.h[5] -; CHECK-NOFP16-SD-NEXT: mov v5.h[2], v16.h[0] -; CHECK-NOFP16-SD-NEXT: mov h16, v3.h[5] +; CHECK-NOFP16-SD-NEXT: mov v5.h[2], v6.h[0] ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[5] -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: mov h0, v0.h[7] +; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[5] +; CHECK-NOFP16-SD-NEXT: mov h22, v4.h[5] ; CHECK-NOFP16-SD-NEXT: mov h1, v1.h[7] -; CHECK-NOFP16-SD-NEXT: fminnm s17, s19, s17 -; CHECK-NOFP16-SD-NEXT: mov h19, v2.h[6] -; CHECK-NOFP16-SD-NEXT: mov v4.h[3], v7.h[0] -; CHECK-NOFP16-SD-NEXT: fminnm s18, s20, s18 +; CHECK-NOFP16-SD-NEXT: mov h4, v4.h[7] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s7 +; CHECK-NOFP16-SD-NEXT: mov h7, v3.h[5] +; CHECK-NOFP16-SD-NEXT: mov v0.h[3], v17.h[0] +; CHECK-NOFP16-SD-NEXT: fminnm s16, s18, s16 +; CHECK-NOFP16-SD-NEXT: fminnm s19, s20, s19 ; CHECK-NOFP16-SD-NEXT: mov h20, v3.h[6] -; CHECK-NOFP16-SD-NEXT: fcvt s7, h21 +; CHECK-NOFP16-SD-NEXT: mov h18, v2.h[6] +; CHECK-NOFP16-SD-NEXT: fcvt s17, h21 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h22 +; CHECK-NOFP16-SD-NEXT: mov h3, v3.h[7] ; CHECK-NOFP16-SD-NEXT: fcvt s22, h24 ; CHECK-NOFP16-SD-NEXT: mov h2, v2.h[7] ; CHECK-NOFP16-SD-NEXT: mov v5.h[3], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h16 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h23 -; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 -; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 -; CHECK-NOFP16-SD-NEXT: fcvt s23, h25 -; CHECK-NOFP16-SD-NEXT: fcvt h18, s18 +; CHECK-NOFP16-SD-NEXT: fcvt s6, h7 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h23 +; CHECK-NOFP16-SD-NEXT: fcvt h16, s16 +; CHECK-NOFP16-SD-NEXT: fcvt h19, s19 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h20 -; CHECK-NOFP16-SD-NEXT: mov h3, v3.h[7] -; CHECK-NOFP16-SD-NEXT: fminnm s7, s21, s7 -; CHECK-NOFP16-SD-NEXT: fcvt s2, h2 -; CHECK-NOFP16-SD-NEXT: fcvt s0, h0 -; CHECK-NOFP16-SD-NEXT: fminnm s6, s16, s6 -; CHECK-NOFP16-SD-NEXT: fcvt s1, h1 -; CHECK-NOFP16-SD-NEXT: mov v4.h[4], v17.h[0] -; 
CHECK-NOFP16-SD-NEXT: fminnm s16, s22, s19 -; CHECK-NOFP16-SD-NEXT: mov v5.h[4], v18.h[0] -; CHECK-NOFP16-SD-NEXT: fminnm s17, s23, s20 +; CHECK-NOFP16-SD-NEXT: fcvt s23, h25 +; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: fminnm s17, s21, s17 ; CHECK-NOFP16-SD-NEXT: fcvt s3, h3 -; CHECK-NOFP16-SD-NEXT: fcvt h7, s7 -; CHECK-NOFP16-SD-NEXT: fminnm s0, s0, s2 -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: fcvt h2, s16 +; CHECK-NOFP16-SD-NEXT: fcvt s1, h1 +; CHECK-NOFP16-SD-NEXT: fcvt s2, h2 +; CHECK-NOFP16-SD-NEXT: fminnm s6, s7, s6 +; CHECK-NOFP16-SD-NEXT: fcvt s4, h4 +; CHECK-NOFP16-SD-NEXT: mov v0.h[4], v16.h[0] +; CHECK-NOFP16-SD-NEXT: mov v5.h[4], v19.h[0] +; CHECK-NOFP16-SD-NEXT: fminnm s16, s23, s20 +; CHECK-NOFP16-SD-NEXT: fminnm s7, s22, s18 +; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 ; CHECK-NOFP16-SD-NEXT: fminnm s1, s1, s3 -; CHECK-NOFP16-SD-NEXT: mov v4.h[5], v7.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h0, s0 -; CHECK-NOFP16-SD-NEXT: mov v5.h[5], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h6, s17 +; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 +; CHECK-NOFP16-SD-NEXT: fminnm s2, s4, s2 +; CHECK-NOFP16-SD-NEXT: mov v0.h[5], v17.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h4, s7 ; CHECK-NOFP16-SD-NEXT: fcvt h1, s1 -; CHECK-NOFP16-SD-NEXT: mov v4.h[6], v2.h[0] +; CHECK-NOFP16-SD-NEXT: mov v5.h[5], v6.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s16 +; CHECK-NOFP16-SD-NEXT: fcvt h2, s2 +; CHECK-NOFP16-SD-NEXT: mov v0.h[6], v4.h[0] ; CHECK-NOFP16-SD-NEXT: mov v5.h[6], v6.h[0] -; CHECK-NOFP16-SD-NEXT: mov v4.h[7], v0.h[0] +; CHECK-NOFP16-SD-NEXT: mov v0.h[7], v2.h[0] ; CHECK-NOFP16-SD-NEXT: mov v5.h[7], v1.h[0] -; CHECK-NOFP16-SD-NEXT: mov v0.16b, v4.16b ; CHECK-NOFP16-SD-NEXT: mov v1.16b, v5.16b ; CHECK-NOFP16-SD-NEXT: ret ; @@ -1140,113 +1140,113 @@ entry: define <16 x half> @max_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-NOFP16-SD-LABEL: max_v16f16: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: mov h6, v2.h[1] -; CHECK-NOFP16-SD-NEXT: mov h7, v0.h[1] -; CHECK-NOFP16-SD-NEXT: fcvt s4, h2 -; CHECK-NOFP16-SD-NEXT: fcvt s5, h0 +; CHECK-NOFP16-SD-NEXT: mov v4.16b, v0.16b +; CHECK-NOFP16-SD-NEXT: mov h0, v2.h[1] +; CHECK-NOFP16-SD-NEXT: fcvt s6, h2 ; CHECK-NOFP16-SD-NEXT: mov h16, v3.h[1] ; CHECK-NOFP16-SD-NEXT: mov h17, v1.h[1] ; CHECK-NOFP16-SD-NEXT: mov h18, v2.h[2] -; CHECK-NOFP16-SD-NEXT: mov h19, v0.h[2] ; CHECK-NOFP16-SD-NEXT: fcvt s20, h3 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h1 ; CHECK-NOFP16-SD-NEXT: mov h22, v3.h[2] +; CHECK-NOFP16-SD-NEXT: mov h5, v4.h[1] +; CHECK-NOFP16-SD-NEXT: fcvt s7, h4 +; CHECK-NOFP16-SD-NEXT: mov h19, v4.h[2] +; CHECK-NOFP16-SD-NEXT: fcvt s0, h0 ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[2] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h6 -; CHECK-NOFP16-SD-NEXT: fcvt s7, h7 -; CHECK-NOFP16-SD-NEXT: mov h24, v0.h[6] -; CHECK-NOFP16-SD-NEXT: fmaxnm s4, s5, s4 -; CHECK-NOFP16-SD-NEXT: fcvt s5, h16 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h17 -; CHECK-NOFP16-SD-NEXT: fcvt s17, h18 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h19 -; CHECK-NOFP16-SD-NEXT: mov h19, v0.h[3] +; CHECK-NOFP16-SD-NEXT: mov h24, v4.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: mov h25, v1.h[6] ; CHECK-NOFP16-SD-NEXT: fmaxnm s20, s21, s20 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h22 ; CHECK-NOFP16-SD-NEXT: mov h22, v3.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s5, h5 ; CHECK-NOFP16-SD-NEXT: fmaxnm s6, s7, s6 -; CHECK-NOFP16-SD-NEXT: mov h7, v2.h[3] -; CHECK-NOFP16-SD-NEXT: mov h25, v1.h[6] -; CHECK-NOFP16-SD-NEXT: fcvt h4, s4 -; CHECK-NOFP16-SD-NEXT: fmaxnm s5, s16, s5 -; CHECK-NOFP16-SD-NEXT: 
fcvt s16, h23 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h16 +; CHECK-NOFP16-SD-NEXT: fcvt s16, h17 +; CHECK-NOFP16-SD-NEXT: mov h17, v2.h[3] +; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 +; CHECK-NOFP16-SD-NEXT: fmaxnm s5, s5, s0 +; CHECK-NOFP16-SD-NEXT: fcvt h0, s6 +; CHECK-NOFP16-SD-NEXT: fmaxnm s6, s16, s7 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h23 ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[3] -; CHECK-NOFP16-SD-NEXT: fmaxnm s17, s18, s17 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h19 -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: fcvt s7, h7 -; CHECK-NOFP16-SD-NEXT: fcvt h19, s5 +; CHECK-NOFP16-SD-NEXT: fmaxnm s18, s19, s18 +; CHECK-NOFP16-SD-NEXT: fcvt s17, h17 +; CHECK-NOFP16-SD-NEXT: fcvt s19, h24 +; CHECK-NOFP16-SD-NEXT: mov h24, v4.h[6] +; CHECK-NOFP16-SD-NEXT: fcvt h16, s5 ; CHECK-NOFP16-SD-NEXT: fcvt h5, s20 -; CHECK-NOFP16-SD-NEXT: fmaxnm s16, s16, s21 +; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 +; CHECK-NOFP16-SD-NEXT: fmaxnm s7, s7, s21 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h23 -; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 +; CHECK-NOFP16-SD-NEXT: fcvt h18, s18 ; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[4] +; CHECK-NOFP16-SD-NEXT: fmaxnm s17, s19, s17 +; CHECK-NOFP16-SD-NEXT: mov h19, v3.h[4] ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[4] -; CHECK-NOFP16-SD-NEXT: mov v4.h[1], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s6, h22 -; CHECK-NOFP16-SD-NEXT: mov h22, v0.h[4] -; CHECK-NOFP16-SD-NEXT: fmaxnm s7, s18, s7 -; CHECK-NOFP16-SD-NEXT: mov h18, v3.h[4] -; CHECK-NOFP16-SD-NEXT: mov v5.h[1], v19.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h16, s16 -; CHECK-NOFP16-SD-NEXT: fmaxnm s6, s20, s6 -; CHECK-NOFP16-SD-NEXT: mov v4.h[2], v17.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt s17, h21 -; CHECK-NOFP16-SD-NEXT: fcvt s19, h22 -; CHECK-NOFP16-SD-NEXT: fcvt h7, s7 -; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: mov v0.h[1], v16.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt s16, h22 +; CHECK-NOFP16-SD-NEXT: mov h22, v4.h[4] +; CHECK-NOFP16-SD-NEXT: mov v5.h[1], v6.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s7 +; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 +; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 +; CHECK-NOFP16-SD-NEXT: fmaxnm s7, s20, s16 +; CHECK-NOFP16-SD-NEXT: mov v0.h[2], v18.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt s16, h21 +; CHECK-NOFP16-SD-NEXT: fcvt s18, h22 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h23 -; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[5] -; CHECK-NOFP16-SD-NEXT: mov h22, v0.h[5] -; CHECK-NOFP16-SD-NEXT: mov v5.h[2], v16.h[0] -; CHECK-NOFP16-SD-NEXT: mov h16, v3.h[5] +; CHECK-NOFP16-SD-NEXT: mov v5.h[2], v6.h[0] ; CHECK-NOFP16-SD-NEXT: mov h23, v1.h[5] -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: mov h0, v0.h[7] +; CHECK-NOFP16-SD-NEXT: mov h21, v2.h[5] +; CHECK-NOFP16-SD-NEXT: mov h22, v4.h[5] ; CHECK-NOFP16-SD-NEXT: mov h1, v1.h[7] -; CHECK-NOFP16-SD-NEXT: fmaxnm s17, s19, s17 -; CHECK-NOFP16-SD-NEXT: mov h19, v2.h[6] -; CHECK-NOFP16-SD-NEXT: mov v4.h[3], v7.h[0] -; CHECK-NOFP16-SD-NEXT: fmaxnm s18, s20, s18 +; CHECK-NOFP16-SD-NEXT: mov h4, v4.h[7] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s7 +; CHECK-NOFP16-SD-NEXT: mov h7, v3.h[5] +; CHECK-NOFP16-SD-NEXT: mov v0.h[3], v17.h[0] +; CHECK-NOFP16-SD-NEXT: fmaxnm s16, s18, s16 +; CHECK-NOFP16-SD-NEXT: fmaxnm s19, s20, s19 ; CHECK-NOFP16-SD-NEXT: mov h20, v3.h[6] -; CHECK-NOFP16-SD-NEXT: fcvt s7, h21 +; CHECK-NOFP16-SD-NEXT: mov h18, v2.h[6] +; CHECK-NOFP16-SD-NEXT: fcvt s17, h21 ; CHECK-NOFP16-SD-NEXT: fcvt s21, h22 +; CHECK-NOFP16-SD-NEXT: mov h3, v3.h[7] ; CHECK-NOFP16-SD-NEXT: fcvt s22, h24 ; CHECK-NOFP16-SD-NEXT: mov h2, v2.h[7] ; CHECK-NOFP16-SD-NEXT: mov v5.h[3], v6.h[0] -; CHECK-NOFP16-SD-NEXT: 
fcvt s6, h16 -; CHECK-NOFP16-SD-NEXT: fcvt s16, h23 -; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 -; CHECK-NOFP16-SD-NEXT: fcvt s19, h19 -; CHECK-NOFP16-SD-NEXT: fcvt s23, h25 -; CHECK-NOFP16-SD-NEXT: fcvt h18, s18 +; CHECK-NOFP16-SD-NEXT: fcvt s6, h7 +; CHECK-NOFP16-SD-NEXT: fcvt s7, h23 +; CHECK-NOFP16-SD-NEXT: fcvt h16, s16 +; CHECK-NOFP16-SD-NEXT: fcvt h19, s19 ; CHECK-NOFP16-SD-NEXT: fcvt s20, h20 -; CHECK-NOFP16-SD-NEXT: mov h3, v3.h[7] -; CHECK-NOFP16-SD-NEXT: fmaxnm s7, s21, s7 -; CHECK-NOFP16-SD-NEXT: fcvt s2, h2 -; CHECK-NOFP16-SD-NEXT: fcvt s0, h0 -; CHECK-NOFP16-SD-NEXT: fmaxnm s6, s16, s6 -; CHECK-NOFP16-SD-NEXT: fcvt s1, h1 -; CHECK-NOFP16-SD-NEXT: mov v4.h[4], v17.h[0] -; CHECK-NOFP16-SD-NEXT: fmaxnm s16, s22, s19 -; CHECK-NOFP16-SD-NEXT: mov v5.h[4], v18.h[0] -; CHECK-NOFP16-SD-NEXT: fmaxnm s17, s23, s20 +; CHECK-NOFP16-SD-NEXT: fcvt s23, h25 +; CHECK-NOFP16-SD-NEXT: fcvt s18, h18 +; CHECK-NOFP16-SD-NEXT: fmaxnm s17, s21, s17 ; CHECK-NOFP16-SD-NEXT: fcvt s3, h3 -; CHECK-NOFP16-SD-NEXT: fcvt h7, s7 -; CHECK-NOFP16-SD-NEXT: fmaxnm s0, s0, s2 -; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 -; CHECK-NOFP16-SD-NEXT: fcvt h2, s16 +; CHECK-NOFP16-SD-NEXT: fcvt s1, h1 +; CHECK-NOFP16-SD-NEXT: fcvt s2, h2 +; CHECK-NOFP16-SD-NEXT: fmaxnm s6, s7, s6 +; CHECK-NOFP16-SD-NEXT: fcvt s4, h4 +; CHECK-NOFP16-SD-NEXT: mov v0.h[4], v16.h[0] +; CHECK-NOFP16-SD-NEXT: mov v5.h[4], v19.h[0] +; CHECK-NOFP16-SD-NEXT: fmaxnm s16, s23, s20 +; CHECK-NOFP16-SD-NEXT: fmaxnm s7, s22, s18 +; CHECK-NOFP16-SD-NEXT: fcvt h17, s17 ; CHECK-NOFP16-SD-NEXT: fmaxnm s1, s1, s3 -; CHECK-NOFP16-SD-NEXT: mov v4.h[5], v7.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h0, s0 -; CHECK-NOFP16-SD-NEXT: mov v5.h[5], v6.h[0] -; CHECK-NOFP16-SD-NEXT: fcvt h6, s17 +; CHECK-NOFP16-SD-NEXT: fcvt h6, s6 +; CHECK-NOFP16-SD-NEXT: fmaxnm s2, s4, s2 +; CHECK-NOFP16-SD-NEXT: mov v0.h[5], v17.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h4, s7 ; CHECK-NOFP16-SD-NEXT: fcvt h1, s1 -; CHECK-NOFP16-SD-NEXT: mov v4.h[6], v2.h[0] +; CHECK-NOFP16-SD-NEXT: mov v5.h[5], v6.h[0] +; CHECK-NOFP16-SD-NEXT: fcvt h6, s16 +; CHECK-NOFP16-SD-NEXT: fcvt h2, s2 +; CHECK-NOFP16-SD-NEXT: mov v0.h[6], v4.h[0] ; CHECK-NOFP16-SD-NEXT: mov v5.h[6], v6.h[0] -; CHECK-NOFP16-SD-NEXT: mov v4.h[7], v0.h[0] +; CHECK-NOFP16-SD-NEXT: mov v0.h[7], v2.h[0] ; CHECK-NOFP16-SD-NEXT: mov v5.h[7], v1.h[0] -; CHECK-NOFP16-SD-NEXT: mov v0.16b, v4.16b ; CHECK-NOFP16-SD-NEXT: mov v1.16b, v5.16b ; CHECK-NOFP16-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll index 79c99c48ce3dc1..86a1afabe9d76d 100644 --- a/llvm/test/CodeGen/AArch64/fpow.ll +++ b/llvm/test/CodeGen/AArch64/fpow.ll @@ -62,16 +62,16 @@ define <2 x double> @pow_v2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 ; CHECK-SD-NEXT: .cfi_offset w30, -16 -; CHECK-SD-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q0, q1, [sp, #16] // 32-byte Folded Spill ; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: mov d1, v1.d[1] ; CHECK-SD-NEXT: bl pow -; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: bl pow -; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 
16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] @@ -193,17 +193,17 @@ define <4 x double> @pow_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-SD-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 96 ; CHECK-SD-NEXT: .cfi_offset w30, -16 -; CHECK-SD-NEXT: stp q0, q2, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q0, q2, [sp, #16] // 32-byte Folded Spill ; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: stp q1, q3, [sp, #48] // 32-byte Folded Spill ; CHECK-SD-NEXT: mov d1, v2.d[1] ; CHECK-SD-NEXT: bl pow -; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: bl pow -; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill @@ -236,23 +236,21 @@ define <4 x double> @pow_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-GI-NEXT: .cfi_offset b10, -40 ; CHECK-GI-NEXT: .cfi_offset b11, -48 ; CHECK-GI-NEXT: mov v4.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp q3, q1, [sp, #16] // 32-byte Folded Spill ; CHECK-GI-NEXT: mov v1.16b, v2.16b -; CHECK-GI-NEXT: str q3, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov d8, v0.d[1] ; CHECK-GI-NEXT: mov d10, v2.d[1] -; CHECK-GI-NEXT: mov d11, v3.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-GI-NEXT: mov d11, v3.d[1] ; CHECK-GI-NEXT: mov d9, v4.d[1] ; CHECK-GI-NEXT: bl pow ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: fmov d1, d10 ; CHECK-GI-NEXT: bl pow -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldp q1, q0, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-GI-NEXT: bl pow @@ -260,16 +258,15 @@ define <4 x double> @pow_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: fmov d1, d11 ; CHECK-GI-NEXT: bl pow -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp, #32] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; 
CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #112 ; CHECK-GI-NEXT: ret entry: @@ -286,16 +283,16 @@ define <2 x float> @pow_v2f32(<2 x float> %a, <2 x float> %b) { ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q0, q1, [sp, #16] // 32-byte Folded Spill ; CHECK-SD-NEXT: mov s0, v0.s[1] ; CHECK-SD-NEXT: mov s1, v1.s[1] ; CHECK-SD-NEXT: bl powf -; CHECK-SD-NEXT: str d0, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-SD-NEXT: bl powf -; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] @@ -595,22 +592,23 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov v4.16b, v1.16b ; CHECK-GI-NEXT: mov v1.16b, v2.16b +; CHECK-GI-NEXT: str q3, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov s8, v0.s[1] ; CHECK-GI-NEXT: mov s9, v0.s[2] ; CHECK-GI-NEXT: mov s10, v0.s[3] -; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: mov s12, v3.s[1] -; CHECK-GI-NEXT: mov s11, v3.s[2] -; CHECK-GI-NEXT: mov s2, v4.s[1] -; CHECK-GI-NEXT: stp q3, q4, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: mov s2, v4.s[2] ; CHECK-GI-NEXT: mov s5, v4.s[3] +; CHECK-GI-NEXT: str q4, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov s11, v4.s[1] ; CHECK-GI-NEXT: mov s14, v1.s[1] ; CHECK-GI-NEXT: mov s15, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-GI-NEXT: str s2, [sp, #48] // 4-byte Folded Spill -; CHECK-GI-NEXT: mov s2, v4.s[2] ; CHECK-GI-NEXT: str s2, [sp, #112] // 4-byte Folded Spill +; CHECK-GI-NEXT: mov s2, v3.s[2] +; CHECK-GI-NEXT: str s2, [sp, #92] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v3.s[3] ; CHECK-GI-NEXT: stp s2, s5, [sp, #200] // 8-byte Folded Spill ; CHECK-GI-NEXT: bl powf @@ -618,46 +616,48 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: fmov s0, s8 ; CHECK-GI-NEXT: fmov s1, s14 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: str d0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s9 ; CHECK-GI-NEXT: fmov s1, s15 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: fmov s1, s13 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: str d0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-GI-NEXT: bl powf +; CHECK-GI-NEXT: str d0, [sp, #32] // 
16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s11 ; CHECK-GI-NEXT: fmov s1, s12 -; CHECK-GI-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr s0, [sp, #48] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fmov s1, s11 -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr s0, [sp, #112] // 4-byte Folded Reload +; CHECK-GI-NEXT: ldr s1, [sp, #92] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: str d0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldp s1, s0, [sp, #200] // 8-byte Folded Reload ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldr q3, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #176] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #192] // 8-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d15, d14, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d15, d14, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -775,14 +775,14 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h9, v0.h[1] -; CHECK-GI-NEXT: mov h10, v0.h[2] -; CHECK-GI-NEXT: mov h11, v0.h[3] -; CHECK-GI-NEXT: mov h12, v0.h[4] +; CHECK-GI-NEXT: mov h8, v0.h[1] +; CHECK-GI-NEXT: mov h9, v0.h[2] +; CHECK-GI-NEXT: mov h10, v0.h[3] +; CHECK-GI-NEXT: mov h11, v0.h[4] ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h15, v1.h[2] -; CHECK-GI-NEXT: mov h8, v1.h[3] -; CHECK-GI-NEXT: mov h13, v1.h[4] +; CHECK-GI-NEXT: mov h13, v1.h[3] +; CHECK-GI-NEXT: mov h12, v1.h[4] ; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 @@ -793,27 +793,27 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h9 +; CHECK-GI-NEXT: fcvt s2, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill 
; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h8 +; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf @@ -966,8 +966,7 @@ define <4 x half> @pow_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-GI-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #112 ; CHECK-GI-NEXT: ret entry: @@ -1080,17 +1079,17 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h11, v0.h[1] -; CHECK-GI-NEXT: mov h12, v0.h[2] -; CHECK-GI-NEXT: mov h13, v0.h[3] -; CHECK-GI-NEXT: mov h14, v0.h[4] +; CHECK-GI-NEXT: mov h10, v0.h[1] +; CHECK-GI-NEXT: mov h11, v0.h[2] +; CHECK-GI-NEXT: mov h12, v0.h[3] +; CHECK-GI-NEXT: mov h13, v0.h[4] ; CHECK-GI-NEXT: mov h8, v1.h[1] ; CHECK-GI-NEXT: mov h9, v1.h[2] -; CHECK-GI-NEXT: mov h10, v1.h[3] -; CHECK-GI-NEXT: mov h15, v1.h[4] +; CHECK-GI-NEXT: mov h15, v1.h[3] +; CHECK-GI-NEXT: mov h14, v1.h[4] ; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] -; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #96] // 2-byte Folded Spill @@ -1102,27 +1101,27 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #190] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h10 +; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h14 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h15 +; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf @@ -1134,10 +1133,10 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: 
fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #188] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -1150,7 +1149,7 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload @@ -1165,7 +1164,7 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] @@ -1351,13 +1350,13 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; ; CHECK-GI-LABEL: pow_v16f16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sub sp, sp, #448 -; CHECK-GI-NEXT: stp d15, d14, [sp, #368] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d13, d12, [sp, #384] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d11, d10, [sp, #400] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #416] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x29, x30, [sp, #432] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 448 +; CHECK-GI-NEXT: sub sp, sp, #464 +; CHECK-GI-NEXT: stp d15, d14, [sp, #384] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d13, d12, [sp, #400] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #416] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #432] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x29, x30, [sp, #448] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 464 ; CHECK-GI-NEXT: .cfi_offset w30, -8 ; CHECK-GI-NEXT: .cfi_offset w29, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 @@ -1368,209 +1367,210 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b13, -64 ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 -; CHECK-GI-NEXT: mov v4.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v0.h[4] -; CHECK-GI-NEXT: mov h12, v0.h[1] -; CHECK-GI-NEXT: mov h13, v0.h[2] +; CHECK-GI-NEXT: mov h4, v0.h[4] +; CHECK-GI-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov h10, v0.h[1] +; CHECK-GI-NEXT: mov h11, v0.h[2] +; CHECK-GI-NEXT: mov h12, v0.h[3] ; CHECK-GI-NEXT: str q3, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: mov h14, v0.h[3] -; CHECK-GI-NEXT: mov h15, v2.h[1] -; CHECK-GI-NEXT: mov h8, v2.h[2] -; CHECK-GI-NEXT: mov h9, v2.h[3] -; CHECK-GI-NEXT: mov h10, v2.h[4] -; CHECK-GI-NEXT: mov h11, v2.h[5] -; CHECK-GI-NEXT: str h1, [sp, #272] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v0.h[5] -; CHECK-GI-NEXT: str h1, [sp, #240] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v0.h[6] -; CHECK-GI-NEXT: str h1, [sp, #176] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v0.h[7] +; CHECK-GI-NEXT: mov h14, v2.h[1] +; CHECK-GI-NEXT: mov h15, v2.h[2] +; CHECK-GI-NEXT: mov h8, v2.h[3] +; 
CHECK-GI-NEXT: mov h9, v2.h[4] +; CHECK-GI-NEXT: mov h13, v2.h[5] +; CHECK-GI-NEXT: str h4, [sp, #176] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h4, v0.h[5] +; CHECK-GI-NEXT: str h4, [sp, #128] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h4, v0.h[6] +; CHECK-GI-NEXT: str h4, [sp, #80] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h4, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h1, [sp, #144] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v4.h[1] -; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h4, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov v4.16b, v1.16b +; CHECK-GI-NEXT: mov h1, v1.h[1] +; CHECK-GI-NEXT: str h1, [sp, #112] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[2] -; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[3] -; CHECK-GI-NEXT: str h1, [sp, #128] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #208] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[4] -; CHECK-GI-NEXT: str h1, [sp, #192] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #272] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[5] -; CHECK-GI-NEXT: str h1, [sp, #256] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #304] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[6] -; CHECK-GI-NEXT: str h1, [sp, #336] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v4.h[7] ; CHECK-GI-NEXT: str h1, [sp, #352] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v4.h[7] +; CHECK-GI-NEXT: str h1, [sp, #368] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] ; CHECK-GI-NEXT: str h1, [sp, #12] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[7] ; CHECK-GI-NEXT: str h1, [sp, #14] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[1] -; CHECK-GI-NEXT: str h1, [sp, #44] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #62] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[2] -; CHECK-GI-NEXT: str h1, [sp, #46] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #110] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[3] -; CHECK-GI-NEXT: str h1, [sp, #78] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #158] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[4] -; CHECK-GI-NEXT: str h1, [sp, #110] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #206] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[5] -; CHECK-GI-NEXT: str h1, [sp, #174] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #254] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[6] -; CHECK-GI-NEXT: str h1, [sp, #238] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v3.h[7] ; CHECK-GI-NEXT: str h1, [sp, #302] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v3.h[7] +; CHECK-GI-NEXT: str h1, [sp, #350] // 2-byte Folded Spill ; CHECK-GI-NEXT: fcvt s1, h2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h15 -; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h15 +; CHECK-GI-NEXT: str q0, [sp, #256] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; 
CHECK-GI-NEXT: fcvt s2, h14 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h9 -; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h8 +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #176] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h10 -; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h9 +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h11 -; CHECK-GI-NEXT: str q0, [sp, #240] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #176] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #12] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #144] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr q1, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #112] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #44] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #62] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #46] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #110] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded 
Reload +; CHECK-GI-NEXT: ldr h1, [sp, #208] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #78] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #158] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #192] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #110] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #206] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #256] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #304] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #256] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #174] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #254] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #336] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #352] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #238] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #352] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #302] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #352] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #368] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #352] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #302] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #368] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #350] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr q3, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp x29, x30, [sp, #432] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d9, d8, [sp, #416] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #320] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x29, x30, [sp, #448] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #400] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d13, d12, [sp, #384] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload +; 
CHECK-GI-NEXT: ldp d9, d8, [sp, #432] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #416] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d13, d12, [sp, #400] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d15, d14, [sp, #384] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d15, d14, [sp, #368] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #304] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #352] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #352] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v3.16b -; CHECK-GI-NEXT: add sp, sp, #448 +; CHECK-GI-NEXT: ldr q0, [sp, #368] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: add sp, sp, #464 ; CHECK-GI-NEXT: ret entry: %c = call <16 x half> @llvm.pow.v16f16(<16 x half> %a, <16 x half> %b) diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 92fd3183393ea7..90d8019b9b7f96 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -2377,114 +2377,114 @@ define <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-CVT-NEXT: mov x8, #562949953421311 // =0x1ffffffffffff -; CHECK-CVT-NEXT: mov x9, #-562949953421312 // =0xfffe000000000000 +; CHECK-CVT-NEXT: mov x11, #-562949953421312 // =0xfffe000000000000 ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: fcvt s3, h1 ; CHECK-CVT-NEXT: mov h4, v1.h[2] ; CHECK-CVT-NEXT: mov h1, v1.h[3] ; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvtzs x10, s3 +; CHECK-CVT-NEXT: fcvtzs x9, s3 ; CHECK-CVT-NEXT: fcvt s3, h4 ; CHECK-CVT-NEXT: fcvt s1, h1 -; CHECK-CVT-NEXT: fcvtzs x11, s2 -; CHECK-CVT-NEXT: cmp x10, x8 +; CHECK-CVT-NEXT: fcvtzs x10, 
s2 +; CHECK-CVT-NEXT: cmp x9, x8 ; CHECK-CVT-NEXT: fcvtzs x12, s3 -; CHECK-CVT-NEXT: csel x10, x10, x8, lt +; CHECK-CVT-NEXT: csel x9, x9, x8, lt ; CHECK-CVT-NEXT: mov h2, v0.h[1] ; CHECK-CVT-NEXT: fcvt s3, h0 -; CHECK-CVT-NEXT: cmp x10, x9 -; CHECK-CVT-NEXT: csel x4, x10, x9, gt -; CHECK-CVT-NEXT: cmp x11, x8 -; CHECK-CVT-NEXT: csel x10, x11, x8, lt -; CHECK-CVT-NEXT: fcvtzs x11, s1 +; CHECK-CVT-NEXT: cmp x9, x11 +; CHECK-CVT-NEXT: csel x4, x9, x11, gt +; CHECK-CVT-NEXT: cmp x10, x8 +; CHECK-CVT-NEXT: csel x9, x10, x8, lt +; CHECK-CVT-NEXT: fcvtzs x10, s1 ; CHECK-CVT-NEXT: mov h1, v0.h[2] -; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: cmp x9, x11 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: mov h0, v0.h[3] -; CHECK-CVT-NEXT: csel x5, x10, x9, gt +; CHECK-CVT-NEXT: csel x5, x9, x11, gt ; CHECK-CVT-NEXT: cmp x12, x8 -; CHECK-CVT-NEXT: csel x10, x12, x8, lt +; CHECK-CVT-NEXT: csel x9, x12, x8, lt ; CHECK-CVT-NEXT: fcvtzs x12, s3 -; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: cmp x9, x11 ; CHECK-CVT-NEXT: fcvt s1, h1 -; CHECK-CVT-NEXT: csel x6, x10, x9, gt -; CHECK-CVT-NEXT: cmp x11, x8 +; CHECK-CVT-NEXT: csel x6, x9, x11, gt +; CHECK-CVT-NEXT: cmp x10, x8 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: csel x10, x11, x8, lt -; CHECK-CVT-NEXT: fcvtzs x11, s2 -; CHECK-CVT-NEXT: cmp x10, x9 -; CHECK-CVT-NEXT: csel x7, x10, x9, gt +; CHECK-CVT-NEXT: csel x9, x10, x8, lt +; CHECK-CVT-NEXT: fcvtzs x10, s2 +; CHECK-CVT-NEXT: cmp x9, x11 +; CHECK-CVT-NEXT: csel x7, x9, x11, gt ; CHECK-CVT-NEXT: cmp x12, x8 -; CHECK-CVT-NEXT: csel x10, x12, x8, lt +; CHECK-CVT-NEXT: csel x9, x12, x8, lt ; CHECK-CVT-NEXT: fcvtzs x12, s1 -; CHECK-CVT-NEXT: cmp x10, x9 -; CHECK-CVT-NEXT: csel x0, x10, x9, gt -; CHECK-CVT-NEXT: cmp x11, x8 -; CHECK-CVT-NEXT: csel x10, x11, x8, lt -; CHECK-CVT-NEXT: fcvtzs x11, s0 -; CHECK-CVT-NEXT: cmp x10, x9 -; CHECK-CVT-NEXT: csel x1, x10, x9, gt +; CHECK-CVT-NEXT: cmp x9, x11 +; CHECK-CVT-NEXT: csel x0, x9, x11, gt +; CHECK-CVT-NEXT: cmp x10, x8 +; CHECK-CVT-NEXT: csel x9, x10, x8, lt +; CHECK-CVT-NEXT: fcvtzs x10, s0 +; CHECK-CVT-NEXT: cmp x9, x11 +; CHECK-CVT-NEXT: csel x1, x9, x11, gt ; CHECK-CVT-NEXT: cmp x12, x8 -; CHECK-CVT-NEXT: csel x10, x12, x8, lt -; CHECK-CVT-NEXT: cmp x10, x9 -; CHECK-CVT-NEXT: csel x2, x10, x9, gt -; CHECK-CVT-NEXT: cmp x11, x8 -; CHECK-CVT-NEXT: csel x8, x11, x8, lt -; CHECK-CVT-NEXT: cmp x8, x9 -; CHECK-CVT-NEXT: csel x3, x8, x9, gt +; CHECK-CVT-NEXT: csel x9, x12, x8, lt +; CHECK-CVT-NEXT: cmp x9, x11 +; CHECK-CVT-NEXT: csel x2, x9, x11, gt +; CHECK-CVT-NEXT: cmp x10, x8 +; CHECK-CVT-NEXT: csel x8, x10, x8, lt +; CHECK-CVT-NEXT: cmp x8, x11 +; CHECK-CVT-NEXT: csel x3, x8, x11, gt ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_signed_v8f16_v8i50: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-FP16-NEXT: mov x8, #562949953421311 // =0x1ffffffffffff -; CHECK-FP16-NEXT: mov x9, #-562949953421312 // =0xfffe000000000000 +; CHECK-FP16-NEXT: mov x11, #-562949953421312 // =0xfffe000000000000 ; CHECK-FP16-NEXT: mov h2, v1.h[1] -; CHECK-FP16-NEXT: fcvtzs x10, h1 +; CHECK-FP16-NEXT: fcvtzs x9, h1 ; CHECK-FP16-NEXT: mov h3, v1.h[2] ; CHECK-FP16-NEXT: mov h1, v1.h[3] -; CHECK-FP16-NEXT: fcvtzs x11, h2 -; CHECK-FP16-NEXT: cmp x10, x8 +; CHECK-FP16-NEXT: fcvtzs x10, h2 +; CHECK-FP16-NEXT: cmp x9, x8 ; CHECK-FP16-NEXT: fcvtzs x12, h3 -; CHECK-FP16-NEXT: csel x10, x10, x8, lt +; CHECK-FP16-NEXT: csel x9, x9, x8, lt ; CHECK-FP16-NEXT: mov h2, v0.h[2] -; CHECK-FP16-NEXT: cmp x10, x9 -; CHECK-FP16-NEXT: csel x4, x10, x9, gt 
-; CHECK-FP16-NEXT: cmp x11, x8 -; CHECK-FP16-NEXT: csel x10, x11, x8, lt -; CHECK-FP16-NEXT: fcvtzs x11, h1 +; CHECK-FP16-NEXT: cmp x9, x11 +; CHECK-FP16-NEXT: csel x4, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x8 +; CHECK-FP16-NEXT: csel x9, x10, x8, lt +; CHECK-FP16-NEXT: fcvtzs x10, h1 ; CHECK-FP16-NEXT: mov h1, v0.h[1] -; CHECK-FP16-NEXT: cmp x10, x9 -; CHECK-FP16-NEXT: csel x5, x10, x9, gt +; CHECK-FP16-NEXT: cmp x9, x11 +; CHECK-FP16-NEXT: csel x5, x9, x11, gt ; CHECK-FP16-NEXT: cmp x12, x8 -; CHECK-FP16-NEXT: csel x10, x12, x8, lt +; CHECK-FP16-NEXT: csel x9, x12, x8, lt ; CHECK-FP16-NEXT: fcvtzs x12, h0 ; CHECK-FP16-NEXT: mov h0, v0.h[3] -; CHECK-FP16-NEXT: cmp x10, x9 -; CHECK-FP16-NEXT: csel x6, x10, x9, gt -; CHECK-FP16-NEXT: cmp x11, x8 -; CHECK-FP16-NEXT: csel x10, x11, x8, lt -; CHECK-FP16-NEXT: fcvtzs x11, h1 -; CHECK-FP16-NEXT: cmp x10, x9 -; CHECK-FP16-NEXT: csel x7, x10, x9, gt +; CHECK-FP16-NEXT: cmp x9, x11 +; CHECK-FP16-NEXT: csel x6, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x8 +; CHECK-FP16-NEXT: csel x9, x10, x8, lt +; CHECK-FP16-NEXT: fcvtzs x10, h1 +; CHECK-FP16-NEXT: cmp x9, x11 +; CHECK-FP16-NEXT: csel x7, x9, x11, gt ; CHECK-FP16-NEXT: cmp x12, x8 -; CHECK-FP16-NEXT: csel x10, x12, x8, lt +; CHECK-FP16-NEXT: csel x9, x12, x8, lt ; CHECK-FP16-NEXT: fcvtzs x12, h2 -; CHECK-FP16-NEXT: cmp x10, x9 -; CHECK-FP16-NEXT: csel x0, x10, x9, gt -; CHECK-FP16-NEXT: cmp x11, x8 -; CHECK-FP16-NEXT: csel x10, x11, x8, lt -; CHECK-FP16-NEXT: fcvtzs x11, h0 -; CHECK-FP16-NEXT: cmp x10, x9 -; CHECK-FP16-NEXT: csel x1, x10, x9, gt +; CHECK-FP16-NEXT: cmp x9, x11 +; CHECK-FP16-NEXT: csel x0, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x8 +; CHECK-FP16-NEXT: csel x9, x10, x8, lt +; CHECK-FP16-NEXT: fcvtzs x10, h0 +; CHECK-FP16-NEXT: cmp x9, x11 +; CHECK-FP16-NEXT: csel x1, x9, x11, gt ; CHECK-FP16-NEXT: cmp x12, x8 -; CHECK-FP16-NEXT: csel x10, x12, x8, lt -; CHECK-FP16-NEXT: cmp x10, x9 -; CHECK-FP16-NEXT: csel x2, x10, x9, gt -; CHECK-FP16-NEXT: cmp x11, x8 -; CHECK-FP16-NEXT: csel x8, x11, x8, lt -; CHECK-FP16-NEXT: cmp x8, x9 -; CHECK-FP16-NEXT: csel x3, x8, x9, gt +; CHECK-FP16-NEXT: csel x9, x12, x8, lt +; CHECK-FP16-NEXT: cmp x9, x11 +; CHECK-FP16-NEXT: csel x2, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x8 +; CHECK-FP16-NEXT: csel x8, x10, x8, lt +; CHECK-FP16-NEXT: cmp x8, x11 +; CHECK-FP16-NEXT: csel x3, x8, x11, gt ; CHECK-FP16-NEXT: ret %x = call <8 x i50> @llvm.fptosi.sat.v8f16.v8i50(<8 x half> %f) ret <8 x i50> %x @@ -2596,15 +2596,15 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: mov w8, #1895825407 // =0x70ffffff ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: fmov s9, w8 -; CHECK-NEXT: mov x22, #-34359738368 // =0xfffffff800000000 -; CHECK-NEXT: mov x23, #34359738367 // =0x7ffffffff +; CHECK-NEXT: mov x23, #-34359738368 // =0xfffffff800000000 +; CHECK-NEXT: mov x22, #34359738367 // =0x7ffffffff ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x8, x22, x1, lt +; CHECK-NEXT: csel x8, x23, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csel x8, xzr, x8, vs @@ -2616,9 +2616,9 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x22, x1, lt +; 
CHECK-NEXT: csel x9, x23, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2630,11 +2630,11 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] -; CHECK-NEXT: csel x8, x22, x1, lt +; CHECK-NEXT: csel x8, x23, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csel x26, xzr, x8, vs @@ -2645,14 +2645,14 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[1] -; CHECK-NEXT: csel x8, x22, x1, lt +; CHECK-NEXT: csel x8, x23, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x27, xzr, x8, vs +; CHECK-NEXT: csel x28, xzr, x8, vs ; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 @@ -2660,38 +2660,38 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, x22, x1, lt +; CHECK-NEXT: csel x8, x23, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csel x20, xzr, x8, vs -; CHECK-NEXT: csel x21, xzr, x9, vs +; CHECK-NEXT: csel x27, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x22, x1, lt +; CHECK-NEXT: csel x9, x23, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x28, xzr, x8, vs +; CHECK-NEXT: csel x21, xzr, x8, vs ; CHECK-NEXT: csel x24, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] -; CHECK-NEXT: csel x8, x22, x1, lt +; CHECK-NEXT: csel x8, x23, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csel x25, xzr, x8, vs @@ -2699,14 +2699,14 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr x9, [sp] // 8-byte Folded Reload -; CHECK-NEXT: extr x8, x24, x28, #28 +; CHECK-NEXT: extr x8, x24, x21, #28 ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: bfi x25, x21, #36, #28 +; CHECK-NEXT: bfi x25, x27, #36, #28 ; CHECK-NEXT: lsr x11, x20, #28 ; CHECK-NEXT: stur x9, [x19, #75] -; CHECK-NEXT: extr x9, x20, x21, #28 +; CHECK-NEXT: extr 
x9, x20, x27, #28 ; CHECK-NEXT: stur x8, [x19, #41] -; CHECK-NEXT: csel x8, x22, x1, lt +; CHECK-NEXT: csel x8, x23, x1, lt ; CHECK-NEXT: str x9, [x19, #16] ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 @@ -2715,7 +2715,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: stur x10, [x19, #50] ; CHECK-NEXT: lsr x10, x24, #28 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: strb w10, [x19, #49] ; CHECK-NEXT: ldp x14, x12, [sp, #8] // 16-byte Folded Reload @@ -2723,9 +2723,9 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x8, xzr, x8, vs ; CHECK-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: bfi x8, x28, #36, #28 +; CHECK-NEXT: bfi x8, x21, #36, #28 ; CHECK-NEXT: extr x10, x14, x12, #28 -; CHECK-NEXT: bfi x27, x12, #36, #28 +; CHECK-NEXT: bfi x28, x12, #36, #28 ; CHECK-NEXT: ldr x12, [sp, #72] // 8-byte Folded Reload ; CHECK-NEXT: bfi x26, x13, #36, #28 ; CHECK-NEXT: stur x9, [x19, #25] @@ -2734,7 +2734,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: stur x8, [x19, #33] ; CHECK-NEXT: lsr x8, x12, #28 ; CHECK-NEXT: stur x10, [x19, #91] -; CHECK-NEXT: stur x27, [x19, #83] +; CHECK-NEXT: stur x28, [x19, #83] ; CHECK-NEXT: stur x11, [x19, #66] ; CHECK-NEXT: stur x26, [x19, #58] ; CHECK-NEXT: strb w9, [x19, #99] @@ -3030,75 +3030,75 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ; CHECK-CVT-NEXT: mov s3, v2.s[1] ; CHECK-CVT-NEXT: fcvtzs w10, s2 -; CHECK-CVT-NEXT: fcvtzs w16, s1 ; CHECK-CVT-NEXT: fcvtzs w9, s3 ; CHECK-CVT-NEXT: mov s3, v2.s[2] ; CHECK-CVT-NEXT: mov s2, v2.s[3] ; CHECK-CVT-NEXT: cmp w9, #127 -; CHECK-CVT-NEXT: fcvtzs w12, s3 +; CHECK-CVT-NEXT: fcvtzs w14, s3 ; CHECK-CVT-NEXT: mov s3, v1.s[1] ; CHECK-CVT-NEXT: csel w11, w9, w8, lt ; CHECK-CVT-NEXT: mov w9, #-128 // =0xffffff80 -; CHECK-CVT-NEXT: fcvtzs w14, s2 +; CHECK-CVT-NEXT: fcvtzs w15, s2 ; CHECK-CVT-NEXT: cmn w11, #128 ; CHECK-CVT-NEXT: mov s2, v1.s[2] -; CHECK-CVT-NEXT: mov s1, v1.s[3] -; CHECK-CVT-NEXT: csel w11, w11, w9, gt +; CHECK-CVT-NEXT: csel w12, w11, w9, gt ; CHECK-CVT-NEXT: cmp w10, #127 ; CHECK-CVT-NEXT: csel w10, w10, w8, lt -; CHECK-CVT-NEXT: fcvtzs w15, s3 +; CHECK-CVT-NEXT: cmn w10, #128 +; CHECK-CVT-NEXT: csel w13, w10, w9, gt +; CHECK-CVT-NEXT: cmp w14, #127 +; CHECK-CVT-NEXT: fcvtzs w16, s2 +; CHECK-CVT-NEXT: csel w10, w14, w8, lt +; CHECK-CVT-NEXT: fcvtzs w14, s3 ; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h ; CHECK-CVT-NEXT: cmn w10, #128 ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: csel w13, w10, w9, gt -; CHECK-CVT-NEXT: cmp w12, #127 -; CHECK-CVT-NEXT: fcvtzs w17, s1 -; CHECK-CVT-NEXT: csel w10, w12, w8, lt +; CHECK-CVT-NEXT: csel w11, w10, w9, gt +; CHECK-CVT-NEXT: cmp w15, #127 +; CHECK-CVT-NEXT: csel w10, w15, w8, lt +; CHECK-CVT-NEXT: fcvtzs w15, s1 +; CHECK-CVT-NEXT: mov s1, v1.s[3] ; CHECK-CVT-NEXT: cmn w10, #128 -; CHECK-CVT-NEXT: mov s1, v3.s[2] +; CHECK-CVT-NEXT: mov s2, v3.s[1] ; CHECK-CVT-NEXT: fcvtzs w0, s3 ; CHECK-CVT-NEXT: csel w10, w10, w9, gt ; CHECK-CVT-NEXT: cmp w14, #127 ; CHECK-CVT-NEXT: fcvtzs w4, s0 -; CHECK-CVT-NEXT: csel w12, w14, w8, lt -; CHECK-CVT-NEXT: cmn w12, #128 -; CHECK-CVT-NEXT: csel w12, w12, w9, gt -; CHECK-CVT-NEXT: cmp w15, #127 -; CHECK-CVT-NEXT: fcvtzs w1, s1 -; CHECK-CVT-NEXT: csel w14, w15, w8, lt -; CHECK-CVT-NEXT: fcvtzs w15, s2 -; CHECK-CVT-NEXT: 
mov s2, v3.s[1] +; CHECK-CVT-NEXT: csel w14, w14, w8, lt ; CHECK-CVT-NEXT: cmn w14, #128 -; CHECK-CVT-NEXT: mov s1, v0.s[1] +; CHECK-CVT-NEXT: fcvtzs w17, s1 +; CHECK-CVT-NEXT: mov s1, v3.s[2] ; CHECK-CVT-NEXT: csel w14, w14, w9, gt -; CHECK-CVT-NEXT: cmp w16, #127 -; CHECK-CVT-NEXT: csel w16, w16, w8, lt -; CHECK-CVT-NEXT: cmn w16, #128 -; CHECK-CVT-NEXT: fcvtzs w18, s2 -; CHECK-CVT-NEXT: mov s2, v3.s[3] -; CHECK-CVT-NEXT: csel w16, w16, w9, gt ; CHECK-CVT-NEXT: cmp w15, #127 -; CHECK-CVT-NEXT: fcvtzs w3, s1 +; CHECK-CVT-NEXT: fcvtzs w18, s2 ; CHECK-CVT-NEXT: csel w15, w15, w8, lt -; CHECK-CVT-NEXT: mov s1, v0.s[2] -; CHECK-CVT-NEXT: mov s0, v0.s[3] +; CHECK-CVT-NEXT: mov s2, v3.s[3] ; CHECK-CVT-NEXT: cmn w15, #128 ; CHECK-CVT-NEXT: csel w15, w15, w9, gt -; CHECK-CVT-NEXT: cmp w17, #127 +; CHECK-CVT-NEXT: cmp w16, #127 +; CHECK-CVT-NEXT: fcvtzs w1, s1 +; CHECK-CVT-NEXT: csel w16, w16, w8, lt +; CHECK-CVT-NEXT: mov s1, v0.s[1] +; CHECK-CVT-NEXT: cmn w16, #128 ; CHECK-CVT-NEXT: fcvtzs w2, s2 -; CHECK-CVT-NEXT: csel w17, w17, w8, lt ; CHECK-CVT-NEXT: fmov s2, w13 +; CHECK-CVT-NEXT: csel w16, w16, w9, gt +; CHECK-CVT-NEXT: cmp w17, #127 +; CHECK-CVT-NEXT: csel w17, w17, w8, lt ; CHECK-CVT-NEXT: cmn w17, #128 +; CHECK-CVT-NEXT: fcvtzs w3, s1 +; CHECK-CVT-NEXT: mov s1, v0.s[2] ; CHECK-CVT-NEXT: csel w17, w17, w9, gt ; CHECK-CVT-NEXT: cmp w18, #127 +; CHECK-CVT-NEXT: mov s0, v0.s[3] ; CHECK-CVT-NEXT: csel w18, w18, w8, lt -; CHECK-CVT-NEXT: mov v2.s[1], w11 +; CHECK-CVT-NEXT: mov v2.s[1], w12 ; CHECK-CVT-NEXT: cmn w18, #128 ; CHECK-CVT-NEXT: csel w18, w18, w9, gt ; CHECK-CVT-NEXT: cmp w0, #127 ; CHECK-CVT-NEXT: csel w0, w0, w8, lt ; CHECK-CVT-NEXT: cmn w0, #128 -; CHECK-CVT-NEXT: mov v2.s[2], w10 +; CHECK-CVT-NEXT: mov v2.s[2], w11 ; CHECK-CVT-NEXT: csel w0, w0, w9, gt ; CHECK-CVT-NEXT: cmp w1, #127 ; CHECK-CVT-NEXT: csel w1, w1, w8, lt @@ -3106,7 +3106,7 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: cmn w1, #128 ; CHECK-CVT-NEXT: csel w1, w1, w9, gt ; CHECK-CVT-NEXT: cmp w2, #127 -; CHECK-CVT-NEXT: mov v2.s[3], w12 +; CHECK-CVT-NEXT: mov v2.s[3], w10 ; CHECK-CVT-NEXT: csel w2, w2, w8, lt ; CHECK-CVT-NEXT: mov v3.s[1], w18 ; CHECK-CVT-NEXT: cmn w2, #128 @@ -3119,22 +3119,22 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: cmp w4, #127 ; CHECK-CVT-NEXT: csel w3, w4, w8, lt ; CHECK-CVT-NEXT: fcvtzs w4, s1 -; CHECK-CVT-NEXT: fmov s1, w16 +; CHECK-CVT-NEXT: fmov s1, w15 ; CHECK-CVT-NEXT: cmn w3, #128 -; CHECK-CVT-NEXT: csel w11, w3, w9, gt +; CHECK-CVT-NEXT: csel w12, w3, w9, gt ; CHECK-CVT-NEXT: mov v3.s[3], w2 -; CHECK-CVT-NEXT: fmov s4, w11 +; CHECK-CVT-NEXT: fmov s4, w12 ; CHECK-CVT-NEXT: mov v1.s[1], w14 -; CHECK-CVT-NEXT: fcvtzs w11, s0 +; CHECK-CVT-NEXT: fcvtzs w12, s0 ; CHECK-CVT-NEXT: cmp w4, #127 ; CHECK-CVT-NEXT: mov v4.s[1], w13 ; CHECK-CVT-NEXT: csel w13, w4, w8, lt ; CHECK-CVT-NEXT: cmn w13, #128 -; CHECK-CVT-NEXT: mov v1.s[2], w15 -; CHECK-CVT-NEXT: csel w10, w13, w9, gt -; CHECK-CVT-NEXT: cmp w11, #127 -; CHECK-CVT-NEXT: csel w8, w11, w8, lt -; CHECK-CVT-NEXT: mov v4.s[2], w10 +; CHECK-CVT-NEXT: mov v1.s[2], w16 +; CHECK-CVT-NEXT: csel w11, w13, w9, gt +; CHECK-CVT-NEXT: cmp w12, #127 +; CHECK-CVT-NEXT: csel w8, w12, w8, lt +; CHECK-CVT-NEXT: mov v4.s[2], w11 ; CHECK-CVT-NEXT: cmn w8, #128 ; CHECK-CVT-NEXT: csel w8, w8, w9, gt ; CHECK-CVT-NEXT: mov v1.s[3], w17 @@ -3163,82 +3163,82 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ; CHECK-CVT-NEXT: mov s3, v2.s[1] 
; CHECK-CVT-NEXT: fcvtzs w10, s2 -; CHECK-CVT-NEXT: fcvtzs w16, s0 ; CHECK-CVT-NEXT: fcvtzs w9, s3 ; CHECK-CVT-NEXT: mov s3, v2.s[2] ; CHECK-CVT-NEXT: mov s2, v2.s[3] ; CHECK-CVT-NEXT: cmp w9, w8 -; CHECK-CVT-NEXT: fcvtzs w12, s3 +; CHECK-CVT-NEXT: fcvtzs w14, s3 ; CHECK-CVT-NEXT: mov s3, v0.s[1] ; CHECK-CVT-NEXT: csel w11, w9, w8, lt ; CHECK-CVT-NEXT: mov w9, #-32768 // =0xffff8000 -; CHECK-CVT-NEXT: fcvtzs w14, s2 +; CHECK-CVT-NEXT: fcvtzs w15, s2 ; CHECK-CVT-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: mov s2, v0.s[2] -; CHECK-CVT-NEXT: mov s0, v0.s[3] -; CHECK-CVT-NEXT: csel w11, w11, w9, gt +; CHECK-CVT-NEXT: csel w12, w11, w9, gt ; CHECK-CVT-NEXT: cmp w10, w8 ; CHECK-CVT-NEXT: csel w10, w10, w8, lt -; CHECK-CVT-NEXT: fcvtzs w15, s3 +; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768 +; CHECK-CVT-NEXT: csel w13, w10, w9, gt +; CHECK-CVT-NEXT: cmp w14, w8 +; CHECK-CVT-NEXT: fcvtzs w16, s2 +; CHECK-CVT-NEXT: csel w10, w14, w8, lt +; CHECK-CVT-NEXT: fcvtzs w14, s3 ; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h ; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: csel w13, w10, w9, gt -; CHECK-CVT-NEXT: cmp w12, w8 -; CHECK-CVT-NEXT: fcvtzs w17, s0 -; CHECK-CVT-NEXT: csel w10, w12, w8, lt +; CHECK-CVT-NEXT: csel w11, w10, w9, gt +; CHECK-CVT-NEXT: cmp w15, w8 +; CHECK-CVT-NEXT: csel w10, w15, w8, lt +; CHECK-CVT-NEXT: fcvtzs w15, s0 +; CHECK-CVT-NEXT: mov s0, v0.s[3] ; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: mov s0, v3.s[2] +; CHECK-CVT-NEXT: mov s2, v3.s[1] ; CHECK-CVT-NEXT: fcvtzs w0, s3 ; CHECK-CVT-NEXT: csel w10, w10, w9, gt ; CHECK-CVT-NEXT: cmp w14, w8 ; CHECK-CVT-NEXT: fcvtzs w4, s1 -; CHECK-CVT-NEXT: csel w12, w14, w8, lt -; CHECK-CVT-NEXT: cmn w12, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: csel w12, w12, w9, gt -; CHECK-CVT-NEXT: cmp w15, w8 -; CHECK-CVT-NEXT: fcvtzs w1, s0 -; CHECK-CVT-NEXT: csel w14, w15, w8, lt -; CHECK-CVT-NEXT: fcvtzs w15, s2 -; CHECK-CVT-NEXT: mov s2, v3.s[1] +; CHECK-CVT-NEXT: csel w14, w14, w8, lt ; CHECK-CVT-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: mov s0, v1.s[1] +; CHECK-CVT-NEXT: fcvtzs w17, s0 +; CHECK-CVT-NEXT: mov s0, v3.s[2] ; CHECK-CVT-NEXT: csel w14, w14, w9, gt -; CHECK-CVT-NEXT: cmp w16, w8 -; CHECK-CVT-NEXT: csel w16, w16, w8, lt -; CHECK-CVT-NEXT: cmn w16, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fcvtzs w18, s2 -; CHECK-CVT-NEXT: mov s2, v3.s[3] -; CHECK-CVT-NEXT: csel w16, w16, w9, gt ; CHECK-CVT-NEXT: cmp w15, w8 -; CHECK-CVT-NEXT: fcvtzs w3, s0 +; CHECK-CVT-NEXT: fcvtzs w18, s2 ; CHECK-CVT-NEXT: csel w15, w15, w8, lt -; CHECK-CVT-NEXT: mov s0, v1.s[2] +; CHECK-CVT-NEXT: mov s2, v3.s[3] ; CHECK-CVT-NEXT: cmn w15, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w15, w15, w9, gt -; CHECK-CVT-NEXT: cmp w17, w8 +; CHECK-CVT-NEXT: cmp w16, w8 +; CHECK-CVT-NEXT: fcvtzs w1, s0 +; CHECK-CVT-NEXT: csel w16, w16, w8, lt +; CHECK-CVT-NEXT: mov s0, v1.s[1] +; CHECK-CVT-NEXT: cmn w16, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: fcvtzs w2, s2 -; CHECK-CVT-NEXT: csel w17, w17, w8, lt ; CHECK-CVT-NEXT: fmov s2, w13 +; CHECK-CVT-NEXT: csel w16, w16, w9, gt +; CHECK-CVT-NEXT: cmp w17, w8 +; CHECK-CVT-NEXT: csel w17, w17, w8, lt ; CHECK-CVT-NEXT: cmn w17, #8, lsl #12 // =32768 +; CHECK-CVT-NEXT: fcvtzs w3, s0 +; CHECK-CVT-NEXT: mov s0, v1.s[2] ; CHECK-CVT-NEXT: csel w17, w17, w9, gt ; CHECK-CVT-NEXT: cmp w18, w8 +; CHECK-CVT-NEXT: mov v2.s[1], w12 ; CHECK-CVT-NEXT: csel w18, w18, w8, lt -; CHECK-CVT-NEXT: mov v2.s[1], w11 ; CHECK-CVT-NEXT: cmn w18, #8, lsl #12 // =32768 ; 
CHECK-CVT-NEXT: csel w18, w18, w9, gt ; CHECK-CVT-NEXT: cmp w0, w8 ; CHECK-CVT-NEXT: csel w0, w0, w8, lt +; CHECK-CVT-NEXT: mov v2.s[2], w11 ; CHECK-CVT-NEXT: cmn w0, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: mov v2.s[2], w10 ; CHECK-CVT-NEXT: csel w0, w0, w9, gt ; CHECK-CVT-NEXT: cmp w1, w8 ; CHECK-CVT-NEXT: csel w1, w1, w8, lt ; CHECK-CVT-NEXT: fmov s3, w0 ; CHECK-CVT-NEXT: cmn w1, #8, lsl #12 // =32768 +; CHECK-CVT-NEXT: mov v2.s[3], w10 ; CHECK-CVT-NEXT: csel w1, w1, w9, gt ; CHECK-CVT-NEXT: cmp w2, w8 -; CHECK-CVT-NEXT: mov v2.s[3], w12 ; CHECK-CVT-NEXT: csel w2, w2, w8, lt ; CHECK-CVT-NEXT: mov v3.s[1], w18 ; CHECK-CVT-NEXT: cmn w2, #8, lsl #12 // =32768 @@ -3253,21 +3253,21 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtzs w4, s0 ; CHECK-CVT-NEXT: mov s0, v1.s[3] ; CHECK-CVT-NEXT: cmn w3, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fmov s1, w16 -; CHECK-CVT-NEXT: csel w11, w3, w9, gt +; CHECK-CVT-NEXT: fmov s1, w15 +; CHECK-CVT-NEXT: csel w12, w3, w9, gt ; CHECK-CVT-NEXT: mov v3.s[3], w2 -; CHECK-CVT-NEXT: fmov s4, w11 +; CHECK-CVT-NEXT: fmov s4, w12 ; CHECK-CVT-NEXT: mov v1.s[1], w14 ; CHECK-CVT-NEXT: cmp w4, w8 -; CHECK-CVT-NEXT: fcvtzs w11, s0 +; CHECK-CVT-NEXT: fcvtzs w12, s0 ; CHECK-CVT-NEXT: mov v4.s[1], w13 ; CHECK-CVT-NEXT: csel w13, w4, w8, lt ; CHECK-CVT-NEXT: cmn w13, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: csel w10, w13, w9, gt -; CHECK-CVT-NEXT: mov v1.s[2], w15 -; CHECK-CVT-NEXT: cmp w11, w8 -; CHECK-CVT-NEXT: csel w8, w11, w8, lt -; CHECK-CVT-NEXT: mov v4.s[2], w10 +; CHECK-CVT-NEXT: csel w11, w13, w9, gt +; CHECK-CVT-NEXT: mov v1.s[2], w16 +; CHECK-CVT-NEXT: cmp w12, w8 +; CHECK-CVT-NEXT: csel w8, w12, w8, lt +; CHECK-CVT-NEXT: mov v4.s[2], w11 ; CHECK-CVT-NEXT: cmn w8, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w8, w8, w9, gt ; CHECK-CVT-NEXT: mov v1.s[3], w17 @@ -3289,8 +3289,9 @@ define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov d4, v3.d[1] -; CHECK-NEXT: fcvtzs w11, d3 -; CHECK-NEXT: mov w9, #127 // =0x7f +; CHECK-NEXT: fcvtzs w9, d3 +; CHECK-NEXT: mov w10, #127 // =0x7f +; CHECK-NEXT: mov w11, #-128 // =0xffffff80 ; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: fcvtzs w13, d2 ; CHECK-NEXT: fcvtzs w15, d1 @@ -3302,48 +3303,47 @@ define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) { ; CHECK-NEXT: cmp w8, #127 ; CHECK-NEXT: fcvtzs w12, d4 ; CHECK-NEXT: fcvtzs w16, d2 -; CHECK-NEXT: csel w10, w8, w9, lt -; CHECK-NEXT: mov w8, #-128 // =0xffffff80 -; CHECK-NEXT: cmn w10, #128 -; CHECK-NEXT: csel w10, w10, w8, gt -; CHECK-NEXT: cmp w11, #127 -; CHECK-NEXT: csel w11, w11, w9, lt -; CHECK-NEXT: cmn w11, #128 -; CHECK-NEXT: csel w11, w11, w8, gt +; CHECK-NEXT: csel w8, w8, w10, lt +; CHECK-NEXT: cmn w8, #128 +; CHECK-NEXT: csel w8, w8, w11, gt +; CHECK-NEXT: cmp w9, #127 +; CHECK-NEXT: csel w9, w9, w10, lt +; CHECK-NEXT: cmn w9, #128 +; CHECK-NEXT: csel w9, w9, w11, gt ; CHECK-NEXT: cmp w12, #127 -; CHECK-NEXT: csel w12, w12, w9, lt -; CHECK-NEXT: fmov s3, w11 +; CHECK-NEXT: csel w12, w12, w10, lt +; CHECK-NEXT: fmov s3, w9 ; CHECK-NEXT: cmn w12, #128 -; CHECK-NEXT: csel w12, w12, w8, gt +; CHECK-NEXT: csel w12, w12, w11, gt ; CHECK-NEXT: cmp w13, #127 -; CHECK-NEXT: csel w13, w13, w9, lt -; CHECK-NEXT: mov v3.s[1], w10 +; CHECK-NEXT: csel w13, w13, w10, lt +; CHECK-NEXT: mov v3.s[1], w8 ; CHECK-NEXT: cmn w13, #128 -; CHECK-NEXT: csel w13, w13, w8, gt +; CHECK-NEXT: csel w13, w13, w11, gt ; CHECK-NEXT: cmp w14, #127 -; CHECK-NEXT: csel w14, w14, 
w9, lt +; CHECK-NEXT: csel w14, w14, w10, lt ; CHECK-NEXT: fmov s2, w13 ; CHECK-NEXT: cmn w14, #128 -; CHECK-NEXT: csel w14, w14, w8, gt +; CHECK-NEXT: csel w14, w14, w11, gt ; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: csel w15, w15, w9, lt +; CHECK-NEXT: csel w15, w15, w10, lt ; CHECK-NEXT: mov v2.s[1], w12 ; CHECK-NEXT: cmn w15, #128 -; CHECK-NEXT: csel w15, w15, w8, gt +; CHECK-NEXT: csel w15, w15, w11, gt ; CHECK-NEXT: cmp w16, #127 -; CHECK-NEXT: csel w11, w16, w9, lt +; CHECK-NEXT: csel w9, w16, w10, lt ; CHECK-NEXT: fmov s1, w15 -; CHECK-NEXT: cmn w11, #128 -; CHECK-NEXT: csel w10, w11, w8, gt +; CHECK-NEXT: cmn w9, #128 +; CHECK-NEXT: csel w8, w9, w11, gt ; CHECK-NEXT: cmp w17, #127 -; CHECK-NEXT: csel w9, w17, w9, lt +; CHECK-NEXT: csel w9, w17, w10, lt ; CHECK-NEXT: mov v1.s[1], w14 ; CHECK-NEXT: cmn w9, #128 -; CHECK-NEXT: csel w8, w9, w8, gt -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: csel w9, w9, w11, gt +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: mov v0.s[1], w8 ; CHECK-NEXT: adrp x8, .LCPI82_0 ; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI82_0] -; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f) @@ -3491,61 +3491,61 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov d4, v3.d[1] -; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: fcvtzs w11, d3 +; CHECK-NEXT: mov w9, #32767 // =0x7fff +; CHECK-NEXT: fcvtzs w10, d3 +; CHECK-NEXT: mov w11, #-32768 // =0xffff8000 ; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: fcvtzs w13, d2 ; CHECK-NEXT: fcvtzs w15, d1 ; CHECK-NEXT: fcvtzs w17, d0 -; CHECK-NEXT: fcvtzs w9, d4 +; CHECK-NEXT: fcvtzs w8, d4 ; CHECK-NEXT: mov d4, v2.d[1] ; CHECK-NEXT: mov d2, v0.d[1] ; CHECK-NEXT: fcvtzs w14, d3 -; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: fcvtzs w12, d4 ; CHECK-NEXT: fcvtzs w16, d2 -; CHECK-NEXT: csel w10, w9, w8, lt -; CHECK-NEXT: mov w9, #-32768 // =0xffff8000 +; CHECK-NEXT: csel w8, w8, w9, lt +; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w8, w8, w11, gt +; CHECK-NEXT: cmp w10, w9 +; CHECK-NEXT: csel w10, w10, w9, lt ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w10, w10, w9, gt -; CHECK-NEXT: cmp w11, w8 -; CHECK-NEXT: csel w11, w11, w8, lt -; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w11, w11, w9, gt -; CHECK-NEXT: cmp w12, w8 -; CHECK-NEXT: csel w12, w12, w8, lt -; CHECK-NEXT: fmov s3, w11 +; CHECK-NEXT: csel w10, w10, w11, gt +; CHECK-NEXT: cmp w12, w9 +; CHECK-NEXT: csel w12, w12, w9, lt +; CHECK-NEXT: fmov s3, w10 ; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w12, w12, w9, gt -; CHECK-NEXT: cmp w13, w8 -; CHECK-NEXT: csel w13, w13, w8, lt -; CHECK-NEXT: mov v3.s[1], w10 +; CHECK-NEXT: csel w12, w12, w11, gt +; CHECK-NEXT: cmp w13, w9 +; CHECK-NEXT: csel w13, w13, w9, lt +; CHECK-NEXT: mov v3.s[1], w8 ; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w13, w13, w9, gt -; CHECK-NEXT: cmp w14, w8 -; CHECK-NEXT: csel w14, w14, w8, lt +; CHECK-NEXT: csel w13, w13, w11, gt +; CHECK-NEXT: cmp w14, w9 +; CHECK-NEXT: csel w14, w14, w9, lt ; CHECK-NEXT: fmov s2, w13 ; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w14, w14, w9, gt -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: csel w15, w15, w8, lt +; CHECK-NEXT: csel w14, w14, w11, gt +; CHECK-NEXT: cmp w15, w9 +; CHECK-NEXT: csel w15, w15, w9, lt ; CHECK-NEXT: mov v2.s[1], 
w12 ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w15, w15, w9, gt -; CHECK-NEXT: cmp w16, w8 -; CHECK-NEXT: csel w11, w16, w8, lt +; CHECK-NEXT: csel w15, w15, w11, gt +; CHECK-NEXT: cmp w16, w9 +; CHECK-NEXT: csel w10, w16, w9, lt ; CHECK-NEXT: fmov s1, w15 -; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w10, w11, w9, gt -; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: csel w8, w17, w8, lt +; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w8, w10, w11, gt +; CHECK-NEXT: cmp w17, w9 +; CHECK-NEXT: csel w9, w17, w9, lt ; CHECK-NEXT: mov v1.s[1], w14 -; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w8, w8, w9, gt -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w9, w9, w11, gt +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: mov v0.s[1], w8 ; CHECK-NEXT: adrp x8, .LCPI84_0 ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI84_0] -; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f) @@ -3562,8 +3562,8 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: fcvtzs w14, d2 ; CHECK-NEXT: fcvtzs w15, d1 ; CHECK-NEXT: mov d1, v7.d[1] -; CHECK-NEXT: fcvtzs w18, d0 -; CHECK-NEXT: fcvtzs w1, d7 +; CHECK-NEXT: fcvtzs w17, d0 +; CHECK-NEXT: fcvtzs w0, d7 ; CHECK-NEXT: fcvtzs w2, d6 ; CHECK-NEXT: fcvtzs w4, d5 ; CHECK-NEXT: fcvtzs w6, d4 @@ -3571,95 +3571,95 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: mov d16, v2.d[1] ; CHECK-NEXT: mov d2, v0.d[1] ; CHECK-NEXT: mov d0, v6.d[1] -; CHECK-NEXT: fcvtzs w0, d1 +; CHECK-NEXT: fcvtzs w18, d1 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: fcvtzs w13, d16 -; CHECK-NEXT: fcvtzs w17, d2 +; CHECK-NEXT: fcvtzs w12, d16 +; CHECK-NEXT: fcvtzs w16, d2 ; CHECK-NEXT: csel w10, w8, w9, lt ; CHECK-NEXT: mov w8, #-32768 // =0xffff8000 +; CHECK-NEXT: fcvtzs w1, d0 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 +; CHECK-NEXT: mov d0, v5.d[1] ; CHECK-NEXT: csel w10, w10, w8, gt ; CHECK-NEXT: cmp w11, w9 ; CHECK-NEXT: csel w11, w11, w9, lt ; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w12, w11, w8, gt -; CHECK-NEXT: cmp w13, w9 -; CHECK-NEXT: csel w11, w13, w9, lt -; CHECK-NEXT: fcvtzs w13, d3 +; CHECK-NEXT: csel w13, w11, w8, gt +; CHECK-NEXT: cmp w12, w9 +; CHECK-NEXT: fcvtzs w3, d0 +; CHECK-NEXT: csel w11, w12, w9, lt +; CHECK-NEXT: fcvtzs w12, d3 +; CHECK-NEXT: mov d0, v4.d[1] ; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w11, w11, w8, gt ; CHECK-NEXT: cmp w14, w9 ; CHECK-NEXT: csel w14, w14, w9, lt ; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 +; CHECK-NEXT: fcvtzs w5, d0 +; CHECK-NEXT: fmov s3, w13 ; CHECK-NEXT: csel w14, w14, w8, gt -; CHECK-NEXT: cmp w13, w9 -; CHECK-NEXT: csel w13, w13, w9, lt -; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w13, w13, w8, gt +; CHECK-NEXT: cmp w12, w9 +; CHECK-NEXT: csel w12, w12, w9, lt +; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 +; CHECK-NEXT: mov v3.s[1], w10 +; CHECK-NEXT: csel w12, w12, w8, gt ; CHECK-NEXT: cmp w15, w9 +; CHECK-NEXT: fmov s2, w14 ; CHECK-NEXT: csel w15, w15, w9, lt ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w16, w15, w8, gt -; CHECK-NEXT: cmp w17, w9 -; CHECK-NEXT: csel w15, w17, w9, lt -; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w15, w15, w8, gt -; CHECK-NEXT: cmp w18, w9 -; CHECK-NEXT: csel w17, w18, w9, lt +; CHECK-NEXT: cmp w16, w9 +; CHECK-NEXT: 
mov v2.s[1], w11 +; CHECK-NEXT: csel w16, w16, w9, lt +; CHECK-NEXT: fmov s1, w15 +; CHECK-NEXT: cmn w16, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w16, w16, w8, gt +; CHECK-NEXT: cmp w17, w9 +; CHECK-NEXT: csel w17, w17, w9, lt +; CHECK-NEXT: mov v1.s[1], w12 ; CHECK-NEXT: cmn w17, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w17, w17, w8, gt -; CHECK-NEXT: cmp w0, w9 -; CHECK-NEXT: csel w18, w0, w9, lt -; CHECK-NEXT: fcvtzs w0, d0 -; CHECK-NEXT: mov d0, v5.d[1] +; CHECK-NEXT: cmp w18, w9 +; CHECK-NEXT: csel w18, w18, w9, lt +; CHECK-NEXT: fmov s0, w17 ; CHECK-NEXT: cmn w18, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w18, w18, w8, gt -; CHECK-NEXT: cmp w1, w9 -; CHECK-NEXT: csel w1, w1, w9, lt -; CHECK-NEXT: cmn w1, #8, lsl #12 // =32768 -; CHECK-NEXT: fcvtzs w3, d0 -; CHECK-NEXT: mov d0, v4.d[1] -; CHECK-NEXT: csel w1, w1, w8, gt ; CHECK-NEXT: cmp w0, w9 ; CHECK-NEXT: csel w0, w0, w9, lt -; CHECK-NEXT: fmov s7, w1 +; CHECK-NEXT: mov v0.s[1], w16 ; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w0, w0, w8, gt +; CHECK-NEXT: cmp w1, w9 +; CHECK-NEXT: csel w1, w1, w9, lt +; CHECK-NEXT: fmov s7, w0 +; CHECK-NEXT: cmn w1, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w1, w1, w8, gt ; CHECK-NEXT: cmp w2, w9 -; CHECK-NEXT: fcvtzs w5, d0 ; CHECK-NEXT: csel w2, w2, w9, lt -; CHECK-NEXT: fmov s3, w12 ; CHECK-NEXT: mov v7.s[1], w18 ; CHECK-NEXT: cmn w2, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w2, w2, w8, gt ; CHECK-NEXT: cmp w3, w9 ; CHECK-NEXT: csel w3, w3, w9, lt -; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: fmov s6, w2 ; CHECK-NEXT: cmn w3, #8, lsl #12 // =32768 -; CHECK-NEXT: fmov s2, w14 ; CHECK-NEXT: csel w3, w3, w8, gt ; CHECK-NEXT: cmp w4, w9 ; CHECK-NEXT: csel w4, w4, w9, lt -; CHECK-NEXT: mov v6.s[1], w0 +; CHECK-NEXT: mov v6.s[1], w1 ; CHECK-NEXT: cmn w4, #8, lsl #12 // =32768 -; CHECK-NEXT: mov v2.s[1], w11 -; CHECK-NEXT: csel w12, w4, w8, gt +; CHECK-NEXT: csel w13, w4, w8, gt ; CHECK-NEXT: cmp w5, w9 -; CHECK-NEXT: fmov s1, w16 ; CHECK-NEXT: csel w10, w5, w9, lt -; CHECK-NEXT: fmov s5, w12 +; CHECK-NEXT: fmov s5, w13 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w10, w10, w8, gt ; CHECK-NEXT: cmp w6, w9 -; CHECK-NEXT: mov v1.s[1], w13 ; CHECK-NEXT: csel w9, w6, w9, lt ; CHECK-NEXT: mov v5.s[1], w3 -; CHECK-NEXT: fmov s0, w17 ; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w8, w9, w8, gt ; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov v0.s[1], w15 ; CHECK-NEXT: adrp x8, .LCPI85_0 ; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI85_0] ; CHECK-NEXT: mov v4.s[1], w10 diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index c94db3484994c3..721b1e26940db9 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -2196,13 +2196,13 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: fmov s9, w8 -; CHECK-NEXT: mov x23, #68719476735 // =0xfffffffff +; CHECK-NEXT: mov x22, #68719476735 // =0xfffffffff ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x10, x23, x8, gt +; CHECK-NEXT: csel x10, x22, x8, gt ; CHECK-NEXT: csinv x8, x9, xzr, le ; CHECK-NEXT: stp x8, x10, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 @@ -2213,7 +2213,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, 
x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: stp x8, x9, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 @@ -2226,7 +2226,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x25, x23, x9, gt +; CHECK-NEXT: csel x25, x22, x9, gt ; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti @@ -2237,7 +2237,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x26, x23, x9, gt +; CHECK-NEXT: csel x26, x22, x9, gt ; CHECK-NEXT: csinv x28, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti @@ -2248,7 +2248,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x29, x23, x9, gt +; CHECK-NEXT: csel x29, x22, x9, gt ; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti @@ -2258,8 +2258,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x21, x23, x9, gt -; CHECK-NEXT: csinv x27, x8, xzr, le +; CHECK-NEXT: csel x27, x22, x9, gt +; CHECK-NEXT: csinv x21, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -2269,15 +2269,15 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x22, x23, x9, gt +; CHECK-NEXT: csel x23, x22, x9, gt ; CHECK-NEXT: csinv x24, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: extr x8, x21, x27, #28 +; CHECK-NEXT: extr x8, x27, x21, #28 ; CHECK-NEXT: extr x9, x29, x20, #28 ; CHECK-NEXT: stur x28, [x19, #75] ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: bfi x22, x20, #36, #28 +; CHECK-NEXT: bfi x23, x20, #36, #28 ; CHECK-NEXT: lsr x11, x29, #28 ; CHECK-NEXT: stur x8, [x19, #41] ; CHECK-NEXT: str x9, [x19, #16] @@ -2285,15 +2285,15 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: stp x24, x22, [x19] +; CHECK-NEXT: stp x24, x23, [x19] ; CHECK-NEXT: stur x10, [x19, #50] -; CHECK-NEXT: lsr x10, x21, #28 +; CHECK-NEXT: lsr x10, x27, #28 ; CHECK-NEXT: strb w11, [x19, #24] ; CHECK-NEXT: strb w10, [x19, #49] -; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: ldp x12, x11, [sp] // 16-byte Folded Reload -; CHECK-NEXT: bfi x9, x27, #36, #28 +; CHECK-NEXT: bfi x9, x21, #36, #28 ; CHECK-NEXT: stur x8, [x19, #25] ; CHECK-NEXT: stur x9, [x19, #33] ; CHECK-NEXT: extr x10, x11, x12, #28 @@ -2835,68 +2835,68 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) { ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: fmov s2, w10 ; CHECK-NEXT: fcvtzu w10, d16 -; CHECK-NEXT: mov d16, v4.d[1] ; CHECK-NEXT: mov v0.b[3], w11 ; CHECK-NEXT: mov v2.s[1], w9 ; CHECK-NEXT: fcvtzu w9, d3 +; CHECK-NEXT: mov d3, v4.d[1] ; CHECK-NEXT: cmp w10, #255 ; 
CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: mov w11, v2.s[1] ; CHECK-NEXT: mov v0.b[4], v2.b[0] ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fcvtzu w9, d16 -; CHECK-NEXT: mov d16, v5.d[1] +; CHECK-NEXT: fmov s16, w9 +; CHECK-NEXT: fcvtzu w9, d3 +; CHECK-NEXT: mov d3, v5.d[1] ; CHECK-NEXT: mov v0.b[5], w11 -; CHECK-NEXT: mov v3.s[1], w10 +; CHECK-NEXT: mov v16.s[1], w10 ; CHECK-NEXT: fcvtzu w10, d4 ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: mov w11, v3.s[1] -; CHECK-NEXT: mov v0.b[6], v3.b[0] +; CHECK-NEXT: mov w11, v16.s[1] +; CHECK-NEXT: mov v0.b[6], v16.b[0] ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: fmov s4, w10 -; CHECK-NEXT: fcvtzu w10, d16 +; CHECK-NEXT: fcvtzu w10, d3 +; CHECK-NEXT: mov d3, v6.d[1] ; CHECK-NEXT: mov v0.b[7], w11 ; CHECK-NEXT: mov v4.s[1], w9 ; CHECK-NEXT: fcvtzu w9, d5 -; CHECK-NEXT: mov d5, v6.d[1] ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: mov w11, v4.s[1] ; CHECK-NEXT: mov v0.b[8], v4.b[0] ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: fmov s16, w9 -; CHECK-NEXT: fcvtzu w9, d5 -; CHECK-NEXT: mov d5, v7.d[1] +; CHECK-NEXT: fmov s5, w9 +; CHECK-NEXT: fcvtzu w9, d3 +; CHECK-NEXT: mov d3, v7.d[1] ; CHECK-NEXT: mov v0.b[9], w11 -; CHECK-NEXT: mov v16.s[1], w10 +; CHECK-NEXT: mov v5.s[1], w10 ; CHECK-NEXT: fcvtzu w10, d6 ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: mov v0.b[10], v16.b[0] -; CHECK-NEXT: mov w11, v16.s[1] +; CHECK-NEXT: mov v0.b[10], v5.b[0] +; CHECK-NEXT: mov w11, v5.s[1] ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: fmov s6, w10 ; CHECK-NEXT: fcvtzu w10, d7 ; CHECK-NEXT: mov v0.b[11], w11 ; CHECK-NEXT: mov v6.s[1], w9 -; CHECK-NEXT: fcvtzu w9, d5 +; CHECK-NEXT: fcvtzu w9, d3 ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: mov v0.b[12], v6.b[0] ; CHECK-NEXT: mov w11, v6.s[1] ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: csel w8, w10, w8, lo -; CHECK-NEXT: fmov s5, w8 +; CHECK-NEXT: fmov s3, w8 ; CHECK-NEXT: mov v0.b[13], w11 -; CHECK-NEXT: mov v5.s[1], w9 -; CHECK-NEXT: mov v0.b[14], v5.b[0] -; CHECK-NEXT: mov w8, v5.s[1] +; CHECK-NEXT: mov v3.s[1], w9 +; CHECK-NEXT: mov v0.b[14], v3.b[0] +; CHECK-NEXT: mov w8, v3.s[1] ; CHECK-NEXT: mov v0.b[15], w8 ; CHECK-NEXT: ret %x = call <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f) @@ -2959,68 +2959,68 @@ define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: mov w8, #65535 // =0xffff ; CHECK-NEXT: fcvtzu w9, d3 ; CHECK-NEXT: mov d3, v1.d[1] -; CHECK-NEXT: fcvtzu w10, d1 +; CHECK-NEXT: fcvtzu w12, d1 ; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: fcvtzu w11, d2 -; CHECK-NEXT: fcvtzu w12, d0 +; CHECK-NEXT: fcvtzu w13, d0 ; CHECK-NEXT: mov d0, v7.d[1] ; CHECK-NEXT: mov d2, v6.d[1] -; CHECK-NEXT: fcvtzu w14, d7 -; CHECK-NEXT: fcvtzu w13, d16 -; CHECK-NEXT: fcvtzu w16, d17 -; CHECK-NEXT: fcvtzu w15, d6 +; CHECK-NEXT: fcvtzu w15, d7 +; CHECK-NEXT: fcvtzu w10, d16 +; CHECK-NEXT: fcvtzu w14, d17 +; CHECK-NEXT: fcvtzu w16, d6 ; CHECK-NEXT: fcvtzu w17, d3 ; CHECK-NEXT: mov d6, v5.d[1] ; CHECK-NEXT: mov d3, v4.d[1] ; CHECK-NEXT: fcvtzu w18, d1 -; CHECK-NEXT: cmp w13, w8 -; CHECK-NEXT: csel w13, w13, w8, lo +; CHECK-NEXT: cmp w10, w8 +; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: cmp w14, w8 ; CHECK-NEXT: fmov s19, w9 -; 
CHECK-NEXT: csel w9, w16, w8, lo +; CHECK-NEXT: csel w9, w14, w8, lo ; CHECK-NEXT: cmp w11, w8 -; CHECK-NEXT: fcvtzu w16, d0 +; CHECK-NEXT: fcvtzu w14, d0 ; CHECK-NEXT: csel w11, w11, w8, lo ; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: mov v19.s[1], w13 -; CHECK-NEXT: csel w13, w17, w8, lo -; CHECK-NEXT: cmp w10, w8 -; CHECK-NEXT: csel w10, w10, w8, lo +; CHECK-NEXT: mov v19.s[1], w10 +; CHECK-NEXT: csel w10, w17, w8, lo +; CHECK-NEXT: cmp w12, w8 +; CHECK-NEXT: csel w12, w12, w8, lo ; CHECK-NEXT: cmp w18, w8 ; CHECK-NEXT: fmov s18, w11 ; CHECK-NEXT: csel w11, w18, w8, lo -; CHECK-NEXT: cmp w12, w8 +; CHECK-NEXT: cmp w13, w8 ; CHECK-NEXT: fcvtzu w17, d2 -; CHECK-NEXT: csel w12, w12, w8, lo -; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: csel w13, w13, w8, lo +; CHECK-NEXT: cmp w14, w8 ; CHECK-NEXT: fcvtzu w18, d6 ; CHECK-NEXT: mov v18.s[1], w9 -; CHECK-NEXT: csel w9, w16, w8, lo -; CHECK-NEXT: cmp w14, w8 -; CHECK-NEXT: fmov s17, w10 -; CHECK-NEXT: csel w10, w14, w8, lo -; CHECK-NEXT: fcvtzu w16, d5 -; CHECK-NEXT: fmov s23, w10 -; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: fcvtzu w14, d3 -; CHECK-NEXT: csel w10, w17, w8, lo +; CHECK-NEXT: csel w9, w14, w8, lo ; CHECK-NEXT: cmp w15, w8 +; CHECK-NEXT: fmov s17, w12 +; CHECK-NEXT: csel w12, w15, w8, lo +; CHECK-NEXT: fcvtzu w14, d5 +; CHECK-NEXT: fmov s23, w12 +; CHECK-NEXT: cmp w17, w8 +; CHECK-NEXT: fcvtzu w15, d3 +; CHECK-NEXT: csel w12, w17, w8, lo +; CHECK-NEXT: cmp w16, w8 ; CHECK-NEXT: fcvtzu w17, d4 -; CHECK-NEXT: mov v17.s[1], w13 +; CHECK-NEXT: mov v17.s[1], w10 ; CHECK-NEXT: mov v23.s[1], w9 -; CHECK-NEXT: csel w9, w15, w8, lo +; CHECK-NEXT: csel w9, w16, w8, lo ; CHECK-NEXT: cmp w18, w8 ; CHECK-NEXT: fmov s22, w9 ; CHECK-NEXT: csel w9, w18, w8, lo -; CHECK-NEXT: cmp w16, w8 -; CHECK-NEXT: fmov s16, w12 -; CHECK-NEXT: mov v22.s[1], w10 -; CHECK-NEXT: csel w10, w16, w8, lo ; CHECK-NEXT: cmp w14, w8 -; CHECK-NEXT: fmov s21, w10 -; CHECK-NEXT: csel w10, w14, w8, lo +; CHECK-NEXT: fmov s16, w13 +; CHECK-NEXT: mov v22.s[1], w12 +; CHECK-NEXT: csel w12, w14, w8, lo +; CHECK-NEXT: cmp w15, w8 +; CHECK-NEXT: fmov s21, w12 +; CHECK-NEXT: csel w12, w15, w8, lo ; CHECK-NEXT: cmp w17, w8 ; CHECK-NEXT: csel w8, w17, w8, lo ; CHECK-NEXT: mov v16.s[1], w11 @@ -3028,7 +3028,7 @@ define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: fmov s20, w8 ; CHECK-NEXT: adrp x8, .LCPI85_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI85_0] -; CHECK-NEXT: mov v20.s[1], w10 +; CHECK-NEXT: mov v20.s[1], w12 ; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b ; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll index 16a6ba3f8cc93f..7247ee9c05b6b6 100644 --- a/llvm/test/CodeGen/AArch64/frem.ll +++ b/llvm/test/CodeGen/AArch64/frem.ll @@ -64,16 +64,16 @@ define <2 x double> @frem_v2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 ; CHECK-SD-NEXT: .cfi_offset w30, -16 -; CHECK-SD-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q0, q1, [sp, #16] // 32-byte Folded Spill ; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: mov d1, v1.d[1] ; CHECK-SD-NEXT: bl fmod -; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-SD-NEXT: // 
kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: bl fmod -; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] @@ -195,17 +195,17 @@ define <4 x double> @frem_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-SD-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 96 ; CHECK-SD-NEXT: .cfi_offset w30, -16 -; CHECK-SD-NEXT: stp q0, q2, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q0, q2, [sp, #16] // 32-byte Folded Spill ; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: stp q1, q3, [sp, #48] // 32-byte Folded Spill ; CHECK-SD-NEXT: mov d1, v2.d[1] ; CHECK-SD-NEXT: bl fmod -; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: bl fmod -; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill @@ -238,23 +238,21 @@ define <4 x double> @frem_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-GI-NEXT: .cfi_offset b10, -40 ; CHECK-GI-NEXT: .cfi_offset b11, -48 ; CHECK-GI-NEXT: mov v4.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp q3, q1, [sp, #16] // 32-byte Folded Spill ; CHECK-GI-NEXT: mov v1.16b, v2.16b -; CHECK-GI-NEXT: str q3, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov d8, v0.d[1] ; CHECK-GI-NEXT: mov d10, v2.d[1] -; CHECK-GI-NEXT: mov d11, v3.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-GI-NEXT: mov d11, v3.d[1] ; CHECK-GI-NEXT: mov d9, v4.d[1] ; CHECK-GI-NEXT: bl fmod ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: fmov d1, d10 ; CHECK-GI-NEXT: bl fmod -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldp q1, q0, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-GI-NEXT: bl fmod @@ -262,16 +260,15 @@ define <4 x double> @frem_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: fmov d1, d11 ; CHECK-GI-NEXT: bl fmod -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp, #32] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr 
x30, [sp, #96] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #112 ; CHECK-GI-NEXT: ret entry: @@ -288,16 +285,16 @@ define <2 x float> @frem_v2f32(<2 x float> %a, <2 x float> %b) { ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q0, q1, [sp, #16] // 32-byte Folded Spill ; CHECK-SD-NEXT: mov s0, v0.s[1] ; CHECK-SD-NEXT: mov s1, v1.s[1] ; CHECK-SD-NEXT: bl fmodf -; CHECK-SD-NEXT: str d0, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-SD-NEXT: bl fmodf -; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] @@ -597,22 +594,23 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov v4.16b, v1.16b ; CHECK-GI-NEXT: mov v1.16b, v2.16b +; CHECK-GI-NEXT: str q3, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov s8, v0.s[1] ; CHECK-GI-NEXT: mov s9, v0.s[2] ; CHECK-GI-NEXT: mov s10, v0.s[3] -; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: mov s12, v3.s[1] -; CHECK-GI-NEXT: mov s11, v3.s[2] -; CHECK-GI-NEXT: mov s2, v4.s[1] -; CHECK-GI-NEXT: stp q3, q4, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: mov s2, v4.s[2] ; CHECK-GI-NEXT: mov s5, v4.s[3] +; CHECK-GI-NEXT: str q4, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov s11, v4.s[1] ; CHECK-GI-NEXT: mov s14, v1.s[1] ; CHECK-GI-NEXT: mov s15, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-GI-NEXT: str s2, [sp, #48] // 4-byte Folded Spill -; CHECK-GI-NEXT: mov s2, v4.s[2] ; CHECK-GI-NEXT: str s2, [sp, #112] // 4-byte Folded Spill +; CHECK-GI-NEXT: mov s2, v3.s[2] +; CHECK-GI-NEXT: str s2, [sp, #92] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v3.s[3] ; CHECK-GI-NEXT: stp s2, s5, [sp, #200] // 8-byte Folded Spill ; CHECK-GI-NEXT: bl fmodf @@ -620,46 +618,48 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: fmov s0, s8 ; CHECK-GI-NEXT: fmov s1, s14 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: str d0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s9 ; CHECK-GI-NEXT: fmov s1, s15 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: fmov s1, s13 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: str d0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; 
CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-GI-NEXT: bl fmodf +; CHECK-GI-NEXT: str d0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s11 ; CHECK-GI-NEXT: fmov s1, s12 -; CHECK-GI-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr s0, [sp, #48] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fmov s1, s11 -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr s0, [sp, #112] // 4-byte Folded Reload +; CHECK-GI-NEXT: ldr s1, [sp, #92] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: str d0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldp s1, s0, [sp, #200] // 8-byte Folded Reload ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldr q3, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #176] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #192] // 8-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d15, d14, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d15, d14, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -777,14 +777,14 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h9, v0.h[1] -; CHECK-GI-NEXT: mov h10, v0.h[2] -; CHECK-GI-NEXT: mov h11, v0.h[3] -; CHECK-GI-NEXT: mov h12, v0.h[4] +; CHECK-GI-NEXT: mov h8, v0.h[1] +; CHECK-GI-NEXT: mov h9, v0.h[2] +; CHECK-GI-NEXT: mov h10, v0.h[3] +; CHECK-GI-NEXT: mov h11, v0.h[4] ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h15, v1.h[2] -; CHECK-GI-NEXT: mov h8, v1.h[3] -; CHECK-GI-NEXT: mov h13, v1.h[4] +; CHECK-GI-NEXT: mov h13, v1.h[3] +; CHECK-GI-NEXT: mov h12, v1.h[4] ; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 @@ -795,27 +795,27 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h9 +; CHECK-GI-NEXT: fcvt s2, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded 
Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h8 +; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf @@ -968,8 +968,7 @@ define <4 x half> @frem_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-GI-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #112 ; CHECK-GI-NEXT: ret entry: @@ -1082,17 +1081,17 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h11, v0.h[1] -; CHECK-GI-NEXT: mov h12, v0.h[2] -; CHECK-GI-NEXT: mov h13, v0.h[3] -; CHECK-GI-NEXT: mov h14, v0.h[4] +; CHECK-GI-NEXT: mov h10, v0.h[1] +; CHECK-GI-NEXT: mov h11, v0.h[2] +; CHECK-GI-NEXT: mov h12, v0.h[3] +; CHECK-GI-NEXT: mov h13, v0.h[4] ; CHECK-GI-NEXT: mov h8, v1.h[1] ; CHECK-GI-NEXT: mov h9, v1.h[2] -; CHECK-GI-NEXT: mov h10, v1.h[3] -; CHECK-GI-NEXT: mov h15, v1.h[4] +; CHECK-GI-NEXT: mov h15, v1.h[3] +; CHECK-GI-NEXT: mov h14, v1.h[4] ; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] -; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #96] // 2-byte Folded Spill @@ -1104,27 +1103,27 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #190] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h10 +; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h14 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h15 +; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf @@ -1136,10 +1135,10 @@ define <8 x half> 
@frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #188] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -1152,7 +1151,7 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload @@ -1167,7 +1166,7 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] @@ -1353,13 +1352,13 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; ; CHECK-GI-LABEL: frem_v16f16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sub sp, sp, #448 -; CHECK-GI-NEXT: stp d15, d14, [sp, #368] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d13, d12, [sp, #384] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d11, d10, [sp, #400] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #416] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x29, x30, [sp, #432] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 448 +; CHECK-GI-NEXT: sub sp, sp, #464 +; CHECK-GI-NEXT: stp d15, d14, [sp, #384] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d13, d12, [sp, #400] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #416] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #432] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x29, x30, [sp, #448] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 464 ; CHECK-GI-NEXT: .cfi_offset w30, -8 ; CHECK-GI-NEXT: .cfi_offset w29, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 @@ -1370,209 +1369,210 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b13, -64 ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 -; CHECK-GI-NEXT: mov v4.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v0.h[4] -; CHECK-GI-NEXT: mov h12, v0.h[1] -; CHECK-GI-NEXT: mov h13, v0.h[2] +; CHECK-GI-NEXT: mov h4, v0.h[4] +; CHECK-GI-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov h10, v0.h[1] +; CHECK-GI-NEXT: mov h11, v0.h[2] +; CHECK-GI-NEXT: mov h12, v0.h[3] ; CHECK-GI-NEXT: str q3, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: mov h14, v0.h[3] -; CHECK-GI-NEXT: mov h15, v2.h[1] -; CHECK-GI-NEXT: mov h8, v2.h[2] -; CHECK-GI-NEXT: mov h9, v2.h[3] -; CHECK-GI-NEXT: mov h10, v2.h[4] -; CHECK-GI-NEXT: mov h11, v2.h[5] -; CHECK-GI-NEXT: str h1, [sp, #272] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v0.h[5] -; CHECK-GI-NEXT: str h1, [sp, 
#240] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v0.h[6] -; CHECK-GI-NEXT: str h1, [sp, #176] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v0.h[7] +; CHECK-GI-NEXT: mov h14, v2.h[1] +; CHECK-GI-NEXT: mov h15, v2.h[2] +; CHECK-GI-NEXT: mov h8, v2.h[3] +; CHECK-GI-NEXT: mov h9, v2.h[4] +; CHECK-GI-NEXT: mov h13, v2.h[5] +; CHECK-GI-NEXT: str h4, [sp, #176] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h4, v0.h[5] +; CHECK-GI-NEXT: str h4, [sp, #128] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h4, v0.h[6] +; CHECK-GI-NEXT: str h4, [sp, #80] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h4, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h1, [sp, #144] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v4.h[1] -; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h4, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov v4.16b, v1.16b +; CHECK-GI-NEXT: mov h1, v1.h[1] +; CHECK-GI-NEXT: str h1, [sp, #112] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[2] -; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[3] -; CHECK-GI-NEXT: str h1, [sp, #128] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #208] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[4] -; CHECK-GI-NEXT: str h1, [sp, #192] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #272] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[5] -; CHECK-GI-NEXT: str h1, [sp, #256] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #304] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[6] -; CHECK-GI-NEXT: str h1, [sp, #336] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v4.h[7] ; CHECK-GI-NEXT: str h1, [sp, #352] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v4.h[7] +; CHECK-GI-NEXT: str h1, [sp, #368] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] ; CHECK-GI-NEXT: str h1, [sp, #12] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[7] ; CHECK-GI-NEXT: str h1, [sp, #14] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[1] -; CHECK-GI-NEXT: str h1, [sp, #44] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #62] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[2] -; CHECK-GI-NEXT: str h1, [sp, #46] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #110] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[3] -; CHECK-GI-NEXT: str h1, [sp, #78] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #158] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[4] -; CHECK-GI-NEXT: str h1, [sp, #110] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #206] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[5] -; CHECK-GI-NEXT: str h1, [sp, #174] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #254] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[6] -; CHECK-GI-NEXT: str h1, [sp, #238] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h1, v3.h[7] ; CHECK-GI-NEXT: str h1, [sp, #302] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v3.h[7] +; CHECK-GI-NEXT: str h1, [sp, #350] // 2-byte Folded Spill ; CHECK-GI-NEXT: fcvt s1, h2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h15 -; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h11 ; 
CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h15 +; CHECK-GI-NEXT: str q0, [sp, #256] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h14 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h9 -; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h8 +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #176] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h10 -; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h9 +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h11 -; CHECK-GI-NEXT: str q0, [sp, #240] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #176] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #12] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #144] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr q1, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #112] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #44] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #62] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #46] // 2-byte 
Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #110] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #208] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #78] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #158] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #192] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #110] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #206] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #256] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #304] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #256] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #174] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #254] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #336] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #352] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #238] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #352] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #302] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #352] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #368] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #352] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #302] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #368] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #350] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr q3, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp x29, x30, [sp, #432] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d9, d8, [sp, #416] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #320] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x29, x30, [sp, #448] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp 
d11, d10, [sp, #400] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d13, d12, [sp, #384] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #432] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #416] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d13, d12, [sp, #400] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d15, d14, [sp, #384] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d15, d14, [sp, #368] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #304] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #352] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #352] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v3.16b -; CHECK-GI-NEXT: add sp, sp, #448 +; CHECK-GI-NEXT: ldr q0, [sp, #368] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: add sp, sp, #464 ; CHECK-GI-NEXT: ret entry: %c = frem <16 x half> %a, %b diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll index 361a1e996fd7ac..a094728f32cbb8 100644 --- a/llvm/test/CodeGen/AArch64/fsincos.ll +++ b/llvm/test/CodeGen/AArch64/fsincos.ll @@ -203,7 +203,7 @@ define <4 x double> @sin_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: .cfi_offset w30, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 ; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov d8, v0.d[1] ; CHECK-GI-NEXT: mov d9, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -211,21 +211,21 @@ define <4 x double> @sin_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded 
Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl sin -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: bl sin -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl sin -; CHECK-GI-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -241,14 +241,14 @@ define <2 x float> @sin_v2f32(<2 x float> %a) { ; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-SD-NEXT: mov s0, v0.s[1] ; CHECK-SD-NEXT: bl sinf -; CHECK-SD-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: bl sinf -; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] @@ -510,16 +510,16 @@ define <8 x float> @sin_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: mov s12, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s8 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s9 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: bl sinf @@ -532,7 +532,7 @@ define <8 x float> @sin_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s13 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-GI-NEXT: ldr 
x30, [sp, #160] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload @@ -541,11 +541,11 @@ define <8 x float> @sin_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -790,8 +790,7 @@ define <4 x half> @sin_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -1111,7 +1110,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov v2.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h1, v1.h[2] ; CHECK-GI-NEXT: mov h15, v0.h[1] @@ -1122,57 +1121,57 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h12, v0.h[6] ; CHECK-GI-NEXT: mov h13, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h1, [sp, #16] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[3] -; CHECK-GI-NEXT: str h1, [sp, #32] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[4] -; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #128] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[5] -; CHECK-GI-NEXT: str h1, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] -; CHECK-GI-NEXT: str h1, [sp, #96] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #192] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[7] -; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #224] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 
16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h14 @@ -1180,77 +1179,79 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #192] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #224] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, 
[sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v3.16b +; CHECK-GI-NEXT: ldr q0, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #320 ; CHECK-GI-NEXT: ret entry: @@ -1459,7 +1460,7 @@ define <4 x double> @cos_v4f64(<4 x double> %a) { ; CHECK-GI-NEXT: .cfi_offset w30, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 ; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov d8, v0.d[1] ; CHECK-GI-NEXT: mov d9, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -1467,21 +1468,21 @@ define <4 x double> @cos_v4f64(<4 x double> %a) { ; 
CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl cos -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: bl cos -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl cos -; CHECK-GI-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -1497,14 +1498,14 @@ define <2 x float> @cos_v2f32(<2 x float> %a) { ; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-SD-NEXT: mov s0, v0.s[1] ; CHECK-SD-NEXT: bl cosf -; CHECK-SD-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: bl cosf -; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] @@ -1766,16 +1767,16 @@ define <8 x float> @cos_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: mov s12, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s8 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s9 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: str d0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: str d0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: bl cosf @@ -1788,7 +1789,7 @@ define <8 x float> @cos_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s13 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: 
// kill: def $s0 killed $s0 def $q0 ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload @@ -1797,11 +1798,11 @@ define <8 x float> @cos_v8f32(<8 x float> %a) { ; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -2046,8 +2047,7 @@ define <4 x half> @cos_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -2367,7 +2367,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov v2.16b, v1.16b -; CHECK-GI-NEXT: str q1, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h1, v1.h[2] ; CHECK-GI-NEXT: mov h15, v0.h[1] @@ -2378,57 +2378,57 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h12, v0.h[6] ; CHECK-GI-NEXT: mov h13, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h1, [sp, #16] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[3] -; CHECK-GI-NEXT: str h1, [sp, #32] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[4] -; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #128] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[5] -; CHECK-GI-NEXT: str h1, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] -; CHECK-GI-NEXT: str h1, [sp, #96] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #192] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[7] -; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #224] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #208] // 
16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h14 @@ -2436,77 +2436,79 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #128] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #192] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #224] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldr q3, 
[sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v3.16b +; CHECK-GI-NEXT: ldr q0, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #320 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll index ed0d0d51948358..ab65bf5d40c21a 100644 --- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll +++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll @@ -110,14 +110,13 @@ define <2 x half> @exp10_v2f16(<2 x half> %x) { ; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: fcvt h0, s0 -; 
GISEL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; GISEL-NEXT: fcvt h1, s0 +; GISEL-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; GISEL-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload ; GISEL-NEXT: ldr d8, [sp, #16] // 8-byte Folded Reload -; GISEL-NEXT: mov v1.h[1], v0.h[0] -; GISEL-NEXT: mov v1.h[2], v0.h[0] -; GISEL-NEXT: mov v1.h[3], v0.h[0] -; GISEL-NEXT: mov v0.16b, v1.16b +; GISEL-NEXT: mov v0.h[1], v1.h[0] +; GISEL-NEXT: mov v0.h[2], v0.h[0] +; GISEL-NEXT: mov v0.h[3], v0.h[0] ; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 ; GISEL-NEXT: add sp, sp, #32 ; GISEL-NEXT: ret @@ -198,8 +197,7 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) { ; GISEL-NEXT: mov v1.h[1], v2.h[0] ; GISEL-NEXT: mov v1.h[2], v0.h[0] ; GISEL-NEXT: mov v1.h[3], v0.h[0] -; GISEL-NEXT: mov v0.16b, v1.16b -; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; GISEL-NEXT: fmov d0, d1 ; GISEL-NEXT: add sp, sp, #64 ; GISEL-NEXT: ret %r = call <3 x half> @llvm.exp10.v3f16(<3 x half> %x) @@ -289,8 +287,7 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { ; GISEL-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; GISEL-NEXT: mov v1.h[2], v2.h[0] ; GISEL-NEXT: mov v1.h[3], v0.h[0] -; GISEL-NEXT: mov v0.16b, v1.16b -; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; GISEL-NEXT: fmov d0, d1 ; GISEL-NEXT: add sp, sp, #80 ; GISEL-NEXT: ret %r = call <4 x half> @llvm.exp10.v4f16(<4 x half> %x) @@ -350,14 +347,14 @@ define <2 x float> @exp10_v2f32(<2 x float> %x) { ; SDAG-NEXT: .cfi_def_cfa_offset 48 ; SDAG-NEXT: .cfi_offset w30, -16 ; SDAG-NEXT: // kill: def $d0 killed $d0 def $q0 -; SDAG-NEXT: str q0, [sp] // 16-byte Folded Spill +; SDAG-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; SDAG-NEXT: mov s0, v0.s[1] ; SDAG-NEXT: bl exp10f -; SDAG-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; SDAG-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; SDAG-NEXT: str d0, [sp] // 16-byte Folded Spill +; SDAG-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; SDAG-NEXT: // kill: def $s0 killed $s0 killed $q0 ; SDAG-NEXT: bl exp10f -; SDAG-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; SDAG-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; SDAG-NEXT: // kill: def $s0 killed $s0 def $q0 ; SDAG-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; SDAG-NEXT: mov v0.s[1], v1.s[0] @@ -701,7 +698,7 @@ define <4 x double> @exp10_v4f64(<4 x double> %x) { ; GISEL-NEXT: .cfi_offset w30, -16 ; GISEL-NEXT: .cfi_offset b8, -24 ; GISEL-NEXT: .cfi_offset b9, -32 -; GISEL-NEXT: str q1, [sp] // 16-byte Folded Spill +; GISEL-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; GISEL-NEXT: mov d8, v0.d[1] ; GISEL-NEXT: mov d9, v1.d[1] ; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -709,21 +706,21 @@ define <4 x double> @exp10_v4f64(<4 x double> %x) { ; GISEL-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; GISEL-NEXT: fmov d0, d8 ; GISEL-NEXT: bl exp10 -; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; GISEL-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 ; GISEL-NEXT: bl exp10 -; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; GISEL-NEXT: fmov d0, d9 ; GISEL-NEXT: bl exp10 -; GISEL-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload +; GISEL-NEXT: ldp q1, q3, [sp, #16] // 32-byte Folded Reload ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; 
GISEL-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; GISEL-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; GISEL-NEXT: mov v2.d[1], v1.d[0] -; GISEL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; GISEL-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; GISEL-NEXT: mov v3.d[1], v2.d[0] ; GISEL-NEXT: mov v1.d[1], v0.d[0] -; GISEL-NEXT: mov v0.16b, v2.16b +; GISEL-NEXT: mov v0.16b, v3.16b ; GISEL-NEXT: add sp, sp, #80 ; GISEL-NEXT: ret %r = call <4 x double> @llvm.exp10.v4f64(<4 x double> %x) diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 74048b8bee3329..8ba63b98049972 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -662,49 +662,49 @@ define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldr b1, [sp, #144] -; CHECK-NEXT: add x10, sp, #152 +; CHECK-NEXT: add x8, sp, #152 ; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: add x8, sp, #168 +; CHECK-NEXT: add x10, sp, #168 ; CHECK-NEXT: ldr b2, [sp, #272] -; CHECK-NEXT: ld1 { v1.b }[1], [x10] -; CHECK-NEXT: add x11, sp, #280 -; CHECK-NEXT: ldr b3, [sp, #80] -; CHECK-NEXT: mov v0.b[1], w1 +; CHECK-NEXT: ld1 { v1.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #280 ; CHECK-NEXT: ldr b4, [sp, #528] -; CHECK-NEXT: add x10, sp, #88 -; CHECK-NEXT: ld1 { v2.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #536 -; CHECK-NEXT: ldr b5, [sp, #336] +; CHECK-NEXT: mov v0.b[1], w1 +; CHECK-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #536 +; CHECK-NEXT: ldr b3, [sp, #80] +; CHECK-NEXT: ld1 { v4.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #176 ; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: ld1 { v3.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #344 -; CHECK-NEXT: ld1 { v4.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #176 ; CHECK-NEXT: ldr b6, [sp, #656] -; CHECK-NEXT: mov v0.b[2], w2 -; CHECK-NEXT: ld1 { v5.b }[1], [x10] +; CHECK-NEXT: add x11, sp, #88 +; CHECK-NEXT: ldr b5, [sp, #336] ; CHECK-NEXT: ldr b7, [sp, #464] -; CHECK-NEXT: ld1 { v1.b }[3], [x8] ; CHECK-NEXT: add x12, sp, #664 +; CHECK-NEXT: mov v0.b[2], w2 +; CHECK-NEXT: ld1 { v3.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #344 +; CHECK-NEXT: ld1 { v1.b }[3], [x10] ; CHECK-NEXT: add x9, sp, #472 ; CHECK-NEXT: ld1 { v6.b }[1], [x12] -; CHECK-NEXT: add x8, sp, #96 -; CHECK-NEXT: add x10, sp, #184 +; CHECK-NEXT: ld1 { v5.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #184 ; CHECK-NEXT: add x12, sp, #288 ; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: ld1 { v3.b }[2], [x8] -; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: add x8, sp, #352 ; CHECK-NEXT: ld1 { v2.b }[2], [x12] +; CHECK-NEXT: add x10, sp, #96 +; CHECK-NEXT: mov v0.b[3], w3 +; CHECK-NEXT: ld1 { v1.b }[4], [x8] ; CHECK-NEXT: add x13, sp, #544 -; CHECK-NEXT: ld1 { v5.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #672 +; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #352 ; CHECK-NEXT: ld1 { v4.b }[2], [x13] -; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: ld1 { v1.b }[5], [x10] ; CHECK-NEXT: ld1 { v6.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #480 +; CHECK-NEXT: ld1 { v1.b }[5], [x11] +; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: ld1 { v5.b }[2], [x10] ; CHECK-NEXT: mov v0.b[4], w4 ; CHECK-NEXT: ld1 { v7.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #296 @@ -732,13 +732,13 @@ define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: mov v0.b[6], w6 ; CHECK-NEXT: ld1 { 
v1.b }[8], [x13] ; CHECK-NEXT: ld1 { v4.b }[5], [x8] -; CHECK-NEXT: add x14, sp, #216 +; CHECK-NEXT: add x10, sp, #216 ; CHECK-NEXT: ld1 { v5.b }[5], [x17] ; CHECK-NEXT: add x13, sp, #576 -; CHECK-NEXT: add x11, sp, #224 -; CHECK-NEXT: add x10, sp, #232 +; CHECK-NEXT: add x14, sp, #224 +; CHECK-NEXT: add x11, sp, #232 ; CHECK-NEXT: add x15, sp, #240 -; CHECK-NEXT: ld1 { v1.b }[9], [x14] +; CHECK-NEXT: ld1 { v1.b }[9], [x10] ; CHECK-NEXT: ld1 { v4.b }[6], [x13] ; CHECK-NEXT: add x13, sp, #384 ; CHECK-NEXT: mov v0.b[7], w7 @@ -746,22 +746,22 @@ define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: add x13, sp, #112 ; CHECK-NEXT: ld1 { v3.b }[4], [x13] ; CHECK-NEXT: add x13, sp, #32 -; CHECK-NEXT: add x14, sp, #584 -; CHECK-NEXT: ld1 { v1.b }[10], [x11] -; CHECK-NEXT: ld1 { v4.b }[7], [x14] -; CHECK-NEXT: add x11, sp, #312 -; CHECK-NEXT: add x14, sp, #40 -; CHECK-NEXT: ld1 { v2.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #592 +; CHECK-NEXT: add x10, sp, #584 +; CHECK-NEXT: ld1 { v1.b }[10], [x14] +; CHECK-NEXT: ld1 { v4.b }[7], [x10] +; CHECK-NEXT: add x14, sp, #312 +; CHECK-NEXT: add x10, sp, #40 +; CHECK-NEXT: ld1 { v2.b }[5], [x14] +; CHECK-NEXT: add x14, sp, #592 ; CHECK-NEXT: ld1 { v0.b }[8], [x12] ; CHECK-NEXT: add x12, sp, #24 ; CHECK-NEXT: add x16, sp, #248 -; CHECK-NEXT: ld1 { v1.b }[11], [x10] -; CHECK-NEXT: ld1 { v4.b }[8], [x11] -; CHECK-NEXT: add x11, sp, #400 +; CHECK-NEXT: ld1 { v1.b }[11], [x11] +; CHECK-NEXT: ld1 { v4.b }[8], [x14] +; CHECK-NEXT: add x14, sp, #400 ; CHECK-NEXT: add x9, sp, #256 ; CHECK-NEXT: add x8, sp, #264 -; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: add x11, sp, #72 ; CHECK-NEXT: ld1 { v0.b }[9], [x12] ; CHECK-NEXT: add x12, sp, #392 ; CHECK-NEXT: movi v16.2d, #0000000000000000 @@ -774,25 +774,25 @@ define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: ld1 { v0.b }[10], [x13] ; CHECK-NEXT: ld1 { v3.b }[5], [x15] ; CHECK-NEXT: add x15, sp, #408 -; CHECK-NEXT: ld1 { v5.b }[8], [x11] +; CHECK-NEXT: ld1 { v5.b }[8], [x14] ; CHECK-NEXT: add x13, sp, #56 ; CHECK-NEXT: ld1 { v1.b }[13], [x16] -; CHECK-NEXT: add x11, sp, #64 +; CHECK-NEXT: add x14, sp, #64 ; CHECK-NEXT: add x16, sp, #616 ; CHECK-NEXT: movi v19.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v0.b }[11], [x14] -; CHECK-NEXT: add x14, sp, #600 -; CHECK-NEXT: ld1 { v4.b }[9], [x14] +; CHECK-NEXT: ld1 { v0.b }[11], [x10] +; CHECK-NEXT: add x10, sp, #600 +; CHECK-NEXT: ld1 { v4.b }[9], [x10] ; CHECK-NEXT: ld1 { v5.b }[9], [x15] ; CHECK-NEXT: add x15, sp, #608 ; CHECK-NEXT: ld1 { v1.b }[14], [x9] ; CHECK-NEXT: add x9, sp, #488 -; CHECK-NEXT: add x14, sp, #320 +; CHECK-NEXT: add x10, sp, #320 ; CHECK-NEXT: ld1 { v0.b }[12], [x12] ; CHECK-NEXT: ld1 { v7.b }[3], [x9] -; CHECK-NEXT: ld1 { v2.b }[6], [x14] +; CHECK-NEXT: ld1 { v2.b }[6], [x10] ; CHECK-NEXT: ld1 { v4.b }[10], [x15] -; CHECK-NEXT: add x14, sp, #624 +; CHECK-NEXT: add x10, sp, #624 ; CHECK-NEXT: add x9, sp, #688 ; CHECK-NEXT: ld1 { v1.b }[15], [x8] ; CHECK-NEXT: add x8, sp, #432 @@ -806,54 +806,54 @@ define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: ld1 { v6.b }[3], [x16] ; CHECK-NEXT: add x13, sp, #632 ; CHECK-NEXT: add x12, sp, #504 -; CHECK-NEXT: ld1 { v0.b }[14], [x11] -; CHECK-NEXT: add x11, sp, #424 +; CHECK-NEXT: ld1 { v0.b }[14], [x14] +; CHECK-NEXT: add x14, sp, #424 ; CHECK-NEXT: add x15, sp, #128 -; CHECK-NEXT: ld1 { v5.b }[11], [x11] -; CHECK-NEXT: ld1 { v4.b }[12], [x14] -; CHECK-NEXT: add x11, sp, #696 +; 
CHECK-NEXT: ld1 { v5.b }[11], [x14] +; CHECK-NEXT: ld1 { v4.b }[12], [x10] +; CHECK-NEXT: add x10, sp, #496 ; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: ld1 { v7.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #440 +; CHECK-NEXT: ld1 { v0.b }[15], [x11] +; CHECK-NEXT: add x11, sp, #696 ; CHECK-NEXT: ld1 { v3.b }[6], [x15] -; CHECK-NEXT: add x9, sp, #640 -; CHECK-NEXT: ld1 { v0.b }[15], [x10] -; CHECK-NEXT: add x10, sp, #496 ; CHECK-NEXT: ld1 { v5.b }[12], [x8] -; CHECK-NEXT: ld1 { v7.b }[4], [x10] ; CHECK-NEXT: ld1 { v4.b }[13], [x13] -; CHECK-NEXT: add x10, sp, #440 +; CHECK-NEXT: add x9, sp, #640 ; CHECK-NEXT: ld1 { v6.b }[5], [x11] +; CHECK-NEXT: ld1 { v7.b }[5], [x12] ; CHECK-NEXT: add x11, sp, #512 ; CHECK-NEXT: add x8, sp, #136 ; CHECK-NEXT: sdot v17.4s, v0.16b, v1.16b ; CHECK-NEXT: ld1 { v5.b }[13], [x10] -; CHECK-NEXT: ld1 { v7.b }[5], [x12] ; CHECK-NEXT: ld1 { v4.b }[14], [x9] ; CHECK-NEXT: add x9, sp, #448 ; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: ld1 { v7.b }[6], [x11] ; CHECK-NEXT: ld1 { v3.b }[7], [x8] ; CHECK-NEXT: ld1 { v6.b }[6], [x10] ; CHECK-NEXT: add x8, sp, #648 ; CHECK-NEXT: add x10, sp, #520 ; CHECK-NEXT: ld1 { v5.b }[14], [x9] -; CHECK-NEXT: ld1 { v7.b }[6], [x11] ; CHECK-NEXT: ld1 { v4.b }[15], [x8] ; CHECK-NEXT: add x8, sp, #456 ; CHECK-NEXT: add x9, sp, #712 +; CHECK-NEXT: ld1 { v7.b }[7], [x10] ; CHECK-NEXT: sdot v19.2s, v3.8b, v2.8b ; CHECK-NEXT: ld1 { v6.b }[7], [x9] ; CHECK-NEXT: addv s0, v17.4s ; CHECK-NEXT: ld1 { v5.b }[15], [x8] -; CHECK-NEXT: ld1 { v7.b }[7], [x10] +; CHECK-NEXT: sdot v18.2s, v7.8b, v6.8b ; CHECK-NEXT: addp v1.2s, v19.2s, v19.2s ; CHECK-NEXT: sdot v16.4s, v5.16b, v4.16b -; CHECK-NEXT: sdot v18.2s, v7.8b, v6.8b ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: addp v3.2s, v18.2s, v18.2s ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: addv s2, v16.4s -; CHECK-NEXT: addp v3.2s, v18.2s, v18.2s ; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: fmov w10, s2 ; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: fmov w10, s2 ; CHECK-NEXT: add w9, w10, w11 ; CHECK-NEXT: add w0, w8, w9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1109,215 +1109,215 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 ; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b1, [sp, #16] -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: ldr b2, [sp, #280] -; CHECK-NEXT: ld1 { v0.b }[1], [x8] ; CHECK-NEXT: ldr b3, [sp, #216] ; CHECK-NEXT: add x11, sp, #224 -; CHECK-NEXT: mov v4.b[1], w1 -; CHECK-NEXT: ld1 { v1.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #288 +; CHECK-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #24 ; CHECK-NEXT: ldr b5, [sp, #152] -; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: mov v4.b[1], w1 +; CHECK-NEXT: ld1 { v1.b }[1], [x8] +; CHECK-NEXT: ldr b2, [sp, #280] ; CHECK-NEXT: ld1 { v3.b }[1], [x11] -; CHECK-NEXT: add x10, sp, #160 -; CHECK-NEXT: ld1 { v0.b }[2], [x9] -; CHECK-NEXT: ld1 { v5.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: add x11, sp, #296 +; CHECK-NEXT: add x11, sp, #160 +; CHECK-NEXT: add x8, sp, #288 +; CHECK-NEXT: ld1 { v5.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #32 +; CHECK-NEXT: add x9, sp, #96 +; CHECK-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-NEXT: ld1 { v1.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #232 ; CHECK-NEXT: mov v4.b[2], w2 -; CHECK-NEXT: ld1 { v1.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #232 -; CHECK-NEXT: add x8, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[2], [x11] -; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: ld1 { v0.b 
}[2], [x9] +; CHECK-NEXT: ld1 { v3.b }[2], [x11] ; CHECK-NEXT: add x11, sp, #168 -; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: add x12, sp, #296 +; CHECK-NEXT: add x10, sp, #104 ; CHECK-NEXT: ld1 { v5.b }[2], [x11] -; CHECK-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #240 +; CHECK-NEXT: add x11, sp, #40 +; CHECK-NEXT: ld1 { v2.b }[2], [x12] +; CHECK-NEXT: ld1 { v1.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #240 +; CHECK-NEXT: ld1 { v0.b }[3], [x10] ; CHECK-NEXT: mov v4.b[3], w3 -; CHECK-NEXT: ld1 { v3.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: add x12, sp, #112 +; CHECK-NEXT: ld1 { v3.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #176 +; CHECK-NEXT: add x13, sp, #304 +; CHECK-NEXT: ld1 { v5.b }[3], [x11] +; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: ld1 { v2.b }[3], [x13] ; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ld1 { v5.b }[3], [x8] -; CHECK-NEXT: ld1 { v0.b }[4], [x12] -; CHECK-NEXT: add x12, sp, #184 +; CHECK-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-NEXT: ld1 { v1.b }[4], [x13] -; CHECK-NEXT: add x15, sp, #56 -; CHECK-NEXT: add x14, sp, #128 +; CHECK-NEXT: add x13, sp, #184 +; CHECK-NEXT: add x9, sp, #120 ; CHECK-NEXT: mov v4.b[4], w4 -; CHECK-NEXT: add x11, sp, #304 -; CHECK-NEXT: add x13, sp, #256 -; CHECK-NEXT: ld1 { v5.b }[4], [x12] +; CHECK-NEXT: ld1 { v5.b }[4], [x13] +; CHECK-NEXT: add x8, sp, #56 ; CHECK-NEXT: ld1 { v0.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: add x12, sp, #248 -; CHECK-NEXT: ld1 { v1.b }[5], [x15] -; CHECK-NEXT: add x15, sp, #200 -; CHECK-NEXT: ld1 { v3.b }[4], [x12] -; CHECK-NEXT: ld1 { v2.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #64 -; CHECK-NEXT: mov v4.b[5], w5 +; CHECK-NEXT: add x13, sp, #248 +; CHECK-NEXT: ld1 { v3.b }[4], [x13] +; CHECK-NEXT: ld1 { v1.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #200 ; CHECK-NEXT: ld1 { v5.b }[5], [x9] -; CHECK-NEXT: ld1 { v0.b }[6], [x14] -; CHECK-NEXT: ldr b6, [sp, #352] +; CHECK-NEXT: ldr b17, [sp, #352] +; CHECK-NEXT: add x12, sp, #128 +; CHECK-NEXT: mov v4.b[5], w5 +; CHECK-NEXT: add x9, sp, #256 +; CHECK-NEXT: ldr b20, [sp, #552] +; CHECK-NEXT: ld1 { v0.b }[6], [x12] +; CHECK-NEXT: add x12, sp, #208 +; CHECK-NEXT: ld1 { v3.b }[5], [x9] +; CHECK-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #360 +; CHECK-NEXT: add x9, sp, #560 +; CHECK-NEXT: ld1 { v17.b }[1], [x8] +; CHECK-NEXT: ld1 { v20.b }[1], [x9] ; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #360 -; CHECK-NEXT: ld1 { v3.b }[5], [x13] -; CHECK-NEXT: ldr b18, [sp, #552] -; CHECK-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-NEXT: add x14, sp, #208 -; CHECK-NEXT: ld1 { v6.b }[1], [x11] ; CHECK-NEXT: mov v4.b[6], w6 +; CHECK-NEXT: add x9, sp, #368 ; CHECK-NEXT: ld1 { v0.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #560 -; CHECK-NEXT: add x9, sp, #264 -; CHECK-NEXT: ld1 { v18.b }[1], [x10] +; CHECK-NEXT: ld1 { v5.b }[7], [x12] ; CHECK-NEXT: add x10, sp, #568 -; CHECK-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-NEXT: ld1 { v3.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #368 -; CHECK-NEXT: ld1 { v6.b }[2], [x9] +; CHECK-NEXT: add x11, sp, #64 +; CHECK-NEXT: ld1 { v17.b }[2], [x9] +; CHECK-NEXT: ldr b6, [sp, #144] +; CHECK-NEXT: ld1 { v20.b }[2], [x10] +; CHECK-NEXT: ld1 { v1.b }[6], [x11] ; CHECK-NEXT: add x11, sp, #488 -; CHECK-NEXT: ldr b7, [sp, #144] -; CHECK-NEXT: mov v4.b[7], w7 -; CHECK-NEXT: ld1 { v18.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #376 -; CHECK-NEXT: sshll v17.8h, v5.8b, #0 +; CHECK-NEXT: mov v4.b[7], w7 +; 
CHECK-NEXT: sshll v16.8h, v5.8b, #0 ; CHECK-NEXT: ldr b5, [sp, #480] -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v6.b }[3], [x10] +; CHECK-NEXT: sshll v7.8h, v6.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[3], [x10] ; CHECK-NEXT: add x10, sp, #576 -; CHECK-NEXT: add x8, sp, #312 ; CHECK-NEXT: ld1 { v5.b }[1], [x11] -; CHECK-NEXT: ld1 { v18.b }[3], [x10] +; CHECK-NEXT: ld1 { v20.b }[3], [x10] ; CHECK-NEXT: add x11, sp, #496 -; CHECK-NEXT: sshll v16.8h, v4.8b, #0 -; CHECK-NEXT: ldr b4, [sp, #344] ; CHECK-NEXT: add x10, sp, #384 -; CHECK-NEXT: ld1 { v6.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #584 -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: sshll v19.8h, v4.8b, #0 +; CHECK-NEXT: ldr b19, [sp, #616] +; CHECK-NEXT: add x12, sp, #624 +; CHECK-NEXT: sshll v6.8h, v4.8b, #0 +; CHECK-NEXT: ldr b4, [sp, #344] +; CHECK-NEXT: ld1 { v17.b }[4], [x10] ; CHECK-NEXT: ld1 { v5.b }[2], [x11] -; CHECK-NEXT: ld1 { v18.b }[4], [x10] -; CHECK-NEXT: smull2 v4.4s, v16.8h, v17.8h -; CHECK-NEXT: smull v16.4s, v16.4h, v17.4h -; CHECK-NEXT: ldr b17, [sp, #416] +; CHECK-NEXT: add x10, sp, #584 ; CHECK-NEXT: add x11, sp, #504 +; CHECK-NEXT: sshll v18.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[4], [x10] ; CHECK-NEXT: add x10, sp, #424 -; CHECK-NEXT: add x16, sp, #320 -; CHECK-NEXT: smull v19.4s, v7.4h, v19.4h -; CHECK-NEXT: movi v7.2d, #0000000000000000 +; CHECK-NEXT: smull2 v4.4s, v6.8h, v16.8h +; CHECK-NEXT: smull v6.4s, v6.4h, v16.4h +; CHECK-NEXT: ldr b16, [sp, #416] ; CHECK-NEXT: ld1 { v5.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #392 -; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: ld1 { v19.b }[1], [x12] +; CHECK-NEXT: smull v18.4s, v7.4h, v18.4h +; CHECK-NEXT: movi v7.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v16.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #592 -; CHECK-NEXT: ld1 { v2.b }[5], [x16] -; CHECK-NEXT: ld1 { v6.b }[5], [x11] -; CHECK-NEXT: ld1 { v18.b }[5], [x10] +; CHECK-NEXT: ld1 { v17.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #512 +; CHECK-NEXT: ld1 { v20.b }[5], [x10] ; CHECK-NEXT: add x10, sp, #432 -; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: mov v7.s[0], v19.s[0] ; CHECK-NEXT: ld1 { v5.b }[4], [x11] ; CHECK-NEXT: add x11, sp, #400 -; CHECK-NEXT: ld1 { v17.b }[2], [x10] +; CHECK-NEXT: ld1 { v16.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #600 -; CHECK-NEXT: ldr b19, [sp, #680] -; CHECK-NEXT: ldr b20, [sp, #616] -; CHECK-NEXT: ld1 { v2.b }[6], [x12] -; CHECK-NEXT: ld1 { v6.b }[6], [x11] -; CHECK-NEXT: ld1 { v18.b }[6], [x10] +; CHECK-NEXT: mov v7.s[0], v18.s[0] +; CHECK-NEXT: ldr b18, [sp, #680] +; CHECK-NEXT: ld1 { v17.b }[6], [x11] +; CHECK-NEXT: ld1 { v20.b }[6], [x10] ; CHECK-NEXT: add x11, sp, #688 -; CHECK-NEXT: add x12, sp, #624 -; CHECK-NEXT: ld1 { v19.b }[1], [x11] -; CHECK-NEXT: ld1 { v20.b }[1], [x12] ; CHECK-NEXT: add x10, sp, #408 +; CHECK-NEXT: ld1 { v18.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #608 ; CHECK-NEXT: add x12, sp, #440 -; CHECK-NEXT: ld1 { v6.b }[7], [x10] -; CHECK-NEXT: ld1 { v18.b }[7], [x11] -; CHECK-NEXT: ld1 { v17.b }[3], [x12] +; CHECK-NEXT: ld1 { v17.b }[7], [x10] +; CHECK-NEXT: ld1 { v16.b }[3], [x12] ; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v20.b }[7], [x11] ; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v19.b }[2], [x10] +; CHECK-NEXT: add x14, sp, #312 +; CHECK-NEXT: ld1 { v18.b }[2], [x10] +; CHECK-NEXT: ld1 { v19.b }[2], [x11] ; CHECK-NEXT: add x10, sp, #448 -; CHECK-NEXT: ld1 { v20.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #640 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: ld1 { 
v16.b }[4], [x10] ; CHECK-NEXT: add x10, sp, #704 -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: ld1 { v19.b }[3], [x10] +; CHECK-NEXT: add x11, sp, #640 +; CHECK-NEXT: ld1 { v2.b }[4], [x14] +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 +; CHECK-NEXT: sshll v20.8h, v20.8b, #0 +; CHECK-NEXT: ld1 { v18.b }[3], [x10] +; CHECK-NEXT: ld1 { v19.b }[3], [x11] ; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: add x12, sp, #520 -; CHECK-NEXT: ld1 { v20.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #648 +; CHECK-NEXT: add x13, sp, #320 +; CHECK-NEXT: add x12, sp, #520 +; CHECK-NEXT: ld1 { v2.b }[5], [x13] ; CHECK-NEXT: ldr b21, [sp, #544] -; CHECK-NEXT: smull2 v22.4s, v6.8h, v18.8h -; CHECK-NEXT: smull v6.4s, v6.4h, v18.4h -; CHECK-NEXT: ldr b18, [sp, #744] -; CHECK-NEXT: ld1 { v19.b }[4], [x10] +; CHECK-NEXT: smull2 v22.4s, v17.8h, v20.8h +; CHECK-NEXT: ld1 { v18.b }[4], [x10] +; CHECK-NEXT: ld1 { v19.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #456 +; CHECK-NEXT: smull v17.4s, v17.4h, v20.4h +; CHECK-NEXT: ldr b20, [sp, #744] ; CHECK-NEXT: ld1 { v5.b }[5], [x12] +; CHECK-NEXT: ld1 { v16.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #720 ; CHECK-NEXT: add x12, sp, #656 -; CHECK-NEXT: ld1 { v20.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #456 +; CHECK-NEXT: ld1 { v18.b }[5], [x11] +; CHECK-NEXT: ld1 { v19.b }[5], [x12] +; CHECK-NEXT: add x15, sp, #328 +; CHECK-NEXT: add x13, sp, #264 ; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #720 -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: ld1 { v19.b }[5], [x11] +; CHECK-NEXT: sshll v20.8h, v20.8b, #0 ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: add x11, sp, #464 -; CHECK-NEXT: ld1 { v20.b }[5], [x12] +; CHECK-NEXT: ld1 { v2.b }[6], [x15] +; CHECK-NEXT: ld1 { v3.b }[6], [x13] ; CHECK-NEXT: ld1 { v5.b }[6], [x10] +; CHECK-NEXT: add x11, sp, #464 ; CHECK-NEXT: add x12, sp, #728 ; CHECK-NEXT: add x13, sp, #664 -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: ld1 { v17.b }[6], [x11] -; CHECK-NEXT: ld1 { v19.b }[6], [x12] -; CHECK-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-NEXT: ld1 { v16.b }[6], [x11] +; CHECK-NEXT: ld1 { v18.b }[6], [x12] +; CHECK-NEXT: ld1 { v19.b }[6], [x13] ; CHECK-NEXT: add x8, sp, #336 -; CHECK-NEXT: ld1 { v20.b }[6], [x13] ; CHECK-NEXT: add x9, sp, #272 -; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h +; CHECK-NEXT: smull v20.4s, v21.4h, v20.4h ; CHECK-NEXT: movi v21.2d, #0000000000000000 ; CHECK-NEXT: add x10, sp, #536 +; CHECK-NEXT: add x14, sp, #72 ; CHECK-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-NEXT: ld1 { v3.b }[7], [x9] ; CHECK-NEXT: ld1 { v5.b }[7], [x10] ; CHECK-NEXT: add x8, sp, #472 ; CHECK-NEXT: add x9, sp, #736 ; CHECK-NEXT: add x10, sp, #672 -; CHECK-NEXT: ld1 { v17.b }[7], [x8] -; CHECK-NEXT: ld1 { v19.b }[7], [x9] -; CHECK-NEXT: ld1 { v20.b }[7], [x10] +; CHECK-NEXT: ld1 { v1.b }[7], [x14] +; CHECK-NEXT: ld1 { v16.b }[7], [x8] +; CHECK-NEXT: ld1 { v18.b }[7], [x9] +; CHECK-NEXT: ld1 { v19.b }[7], [x10] +; CHECK-NEXT: mov v21.s[0], v20.s[0] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: mov v21.s[0], v18.s[0] -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: sshll v18.8h, v19.8b, #0 -; CHECK-NEXT: sshll v19.8h, v20.8b, #0 -; CHECK-NEXT: smlal v16.4s, v0.4h, v2.4h +; CHECK-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: sshll v19.8h, v19.8b, #0 +; 
CHECK-NEXT: smlal v6.4s, v0.4h, v2.4h ; CHECK-NEXT: smlal2 v4.4s, v0.8h, v2.8h ; CHECK-NEXT: smlal v7.4s, v1.4h, v3.4h -; CHECK-NEXT: smlal v6.4s, v5.4h, v18.4h +; CHECK-NEXT: smlal v17.4s, v5.4h, v18.4h ; CHECK-NEXT: smlal2 v22.4s, v5.8h, v18.8h -; CHECK-NEXT: smlal v21.4s, v17.4h, v19.4h +; CHECK-NEXT: smlal v21.4s, v16.4h, v19.4h ; CHECK-NEXT: smlal2 v4.4s, v1.8h, v3.8h -; CHECK-NEXT: add v0.4s, v16.4s, v7.4s -; CHECK-NEXT: add v1.4s, v6.4s, v21.4s -; CHECK-NEXT: smlal2 v22.4s, v17.8h, v19.8h +; CHECK-NEXT: add v0.4s, v6.4s, v7.4s +; CHECK-NEXT: add v1.4s, v17.4s, v21.4s +; CHECK-NEXT: smlal2 v22.4s, v16.8h, v19.8h ; CHECK-NEXT: add v0.4s, v0.4s, v4.4s ; CHECK-NEXT: add v1.4s, v1.4s, v22.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s @@ -1346,70 +1346,70 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldr b1, [sp, #80] -; CHECK-NEXT: add x10, sp, #88 +; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #16] ; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: ldr b3, [sp, #480] -; CHECK-NEXT: ld1 { v1.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #24 +; CHECK-NEXT: ld1 { v1.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #24 ; CHECK-NEXT: ldr b4, [sp, #352] ; CHECK-NEXT: mov v0.b[1], w1 -; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: ld1 { v2.b }[1], [x8] ; CHECK-NEXT: add x11, sp, #488 -; CHECK-NEXT: add x10, sp, #360 +; CHECK-NEXT: add x8, sp, #360 ; CHECK-NEXT: ldr b5, [sp, #416] -; CHECK-NEXT: add x8, sp, #104 +; CHECK-NEXT: add x10, sp, #104 ; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #32 ; CHECK-NEXT: ld1 { v3.b }[1], [x11] ; CHECK-NEXT: ld1 { v2.b }[2], [x9] ; CHECK-NEXT: add x11, sp, #424 -; CHECK-NEXT: ld1 { v4.b }[1], [x10] +; CHECK-NEXT: ld1 { v4.b }[1], [x8] ; CHECK-NEXT: mov v0.b[2], w2 ; CHECK-NEXT: ld1 { v5.b }[1], [x11] ; CHECK-NEXT: add x9, sp, #368 -; CHECK-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: ld1 { v1.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #40 ; CHECK-NEXT: add x12, sp, #496 -; CHECK-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-NEXT: ld1 { v2.b }[3], [x10] ; CHECK-NEXT: ld1 { v4.b }[2], [x9] -; CHECK-NEXT: add x8, sp, #432 +; CHECK-NEXT: add x10, sp, #432 ; CHECK-NEXT: ld1 { v3.b }[2], [x12] ; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-NEXT: ld1 { v5.b }[2], [x10] ; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: add x8, sp, #504 +; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: add x10, sp, #504 ; CHECK-NEXT: ld1 { v2.b }[4], [x13] ; CHECK-NEXT: add x13, sp, #376 -; CHECK-NEXT: ld1 { v1.b }[4], [x10] +; CHECK-NEXT: ld1 { v1.b }[4], [x8] ; CHECK-NEXT: ld1 { v4.b }[3], [x13] ; CHECK-NEXT: add x13, sp, #440 -; CHECK-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-NEXT: ld1 { v3.b }[3], [x10] ; CHECK-NEXT: ld1 { v5.b }[3], [x13] ; CHECK-NEXT: add x11, sp, #120 -; CHECK-NEXT: add x8, sp, #56 +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: mov v0.b[4], w4 ; CHECK-NEXT: add x13, sp, #512 ; CHECK-NEXT: ld1 { v1.b }[5], [x11] -; CHECK-NEXT: ld1 { v2.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #384 +; CHECK-NEXT: ld1 { v2.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #384 ; CHECK-NEXT: add x11, sp, #448 ; CHECK-NEXT: ld1 { v3.b }[4], [x13] -; CHECK-NEXT: ld1 { v4.b }[4], [x8] +; CHECK-NEXT: ld1 { v4.b }[4], [x10] ; CHECK-NEXT: ld1 { v5.b }[4], [x11] ; CHECK-NEXT: add x12, sp, #128 -; CHECK-NEXT: add x10, sp, #64 -; CHECK-NEXT: add x8, sp, #520 +; CHECK-NEXT: add x8, sp, #64 +; CHECK-NEXT: add x10, 
sp, #520 ; CHECK-NEXT: mov v0.b[5], w5 ; CHECK-NEXT: ld1 { v1.b }[6], [x12] -; CHECK-NEXT: ld1 { v2.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #392 +; CHECK-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #392 ; CHECK-NEXT: add x11, sp, #456 ; CHECK-NEXT: ldr b6, [sp, #144] ; CHECK-NEXT: ldr b7, [sp, #544] -; CHECK-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-NEXT: ld1 { v4.b }[5], [x10] +; CHECK-NEXT: ld1 { v3.b }[5], [x10] +; CHECK-NEXT: ld1 { v4.b }[5], [x8] ; CHECK-NEXT: ld1 { v5.b }[5], [x11] ; CHECK-NEXT: add x9, sp, #136 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0 @@ -1717,286 +1717,286 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #144] -; CHECK-NEXT: fmov s4, w0 -; CHECK-NEXT: add x10, sp, #152 ; CHECK-NEXT: ldr b3, [sp, #16] -; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: ld1 { v2.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #24 ; CHECK-NEXT: ldr b1, [sp, #344] ; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ld1 { v3.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #352 -; CHECK-NEXT: mov v4.b[1], w1 -; CHECK-NEXT: add x8, sp, #104 +; CHECK-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #152 +; CHECK-NEXT: add x10, sp, #104 +; CHECK-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #24 +; CHECK-NEXT: fmov s4, w0 +; CHECK-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #352 +; CHECK-NEXT: add x11, sp, #112 ; CHECK-NEXT: ld1 { v0.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: ld1 { v1.b }[1], [x10] +; CHECK-NEXT: ld1 { v1.b }[1], [x8] ; CHECK-NEXT: ld1 { v2.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: add x12, sp, #360 +; CHECK-NEXT: mov v4.b[1], w1 ; CHECK-NEXT: ld1 { v3.b }[2], [x9] -; CHECK-NEXT: add x11, sp, #112 -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: ld1 { v1.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #168 -; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: mov v4.b[2], w2 -; CHECK-NEXT: ld1 { v2.b }[3], [x12] -; CHECK-NEXT: add x12, sp, #40 -; CHECK-NEXT: ld1 { v3.b }[3], [x12] +; CHECK-NEXT: add x9, sp, #360 +; CHECK-NEXT: add x8, sp, #120 +; CHECK-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #168 +; CHECK-NEXT: ld1 { v0.b }[3], [x10] +; CHECK-NEXT: ld1 { v2.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #40 ; CHECK-NEXT: add x13, sp, #176 -; CHECK-NEXT: ldr b16, [sp, #216] +; CHECK-NEXT: ld1 { v3.b }[3], [x9] +; CHECK-NEXT: ldr b6, [sp, #216] +; CHECK-NEXT: add x14, sp, #224 ; CHECK-NEXT: ld1 { v0.b }[4], [x11] ; CHECK-NEXT: add x11, sp, #48 -; CHECK-NEXT: add x12, sp, #368 +; CHECK-NEXT: mov v4.b[2], w2 ; CHECK-NEXT: ld1 { v2.b }[4], [x13] -; CHECK-NEXT: add x13, sp, #224 -; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: mov v4.b[3], w3 +; CHECK-NEXT: add x9, sp, #368 +; CHECK-NEXT: ld1 { v6.b }[1], [x14] ; CHECK-NEXT: ld1 { v3.b }[4], [x11] -; CHECK-NEXT: ld1 { v16.b }[1], [x13] -; CHECK-NEXT: ld1 { v0.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #56 -; CHECK-NEXT: ld1 { v1.b }[3], [x12] -; CHECK-NEXT: add x12, sp, #184 -; CHECK-NEXT: ldr b5, [sp, #280] +; CHECK-NEXT: add x12, sp, #128 +; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: ld1 { v0.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #56 +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x14, sp, #64 +; CHECK-NEXT: ldr b7, [sp, #280] +; CHECK-NEXT: ld1 { v2.b }[5], [x9] +; CHECK-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #232 ; CHECK-NEXT: add x11, sp, #376 -; CHECK-NEXT: ld1 { v3.b }[5], [x10] -; CHECK-NEXT: ld1 { v2.b }[5], [x12] -; CHECK-NEXT: add 
x10, sp, #232 -; CHECK-NEXT: mov v4.b[4], w4 -; CHECK-NEXT: ld1 { v0.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #288 -; CHECK-NEXT: add x15, sp, #64 -; CHECK-NEXT: ld1 { v16.b }[2], [x10] -; CHECK-NEXT: ldr b17, [sp, #408] -; CHECK-NEXT: ld1 { v5.b }[1], [x9] -; CHECK-NEXT: add x14, sp, #192 +; CHECK-NEXT: ld1 { v0.b }[6], [x12] +; CHECK-NEXT: add x12, sp, #288 +; CHECK-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-NEXT: ldr b16, [sp, #408] +; CHECK-NEXT: ld1 { v7.b }[1], [x12] +; CHECK-NEXT: mov v4.b[3], w3 +; CHECK-NEXT: ld1 { v3.b }[6], [x14] +; CHECK-NEXT: add x13, sp, #192 ; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: ld1 { v3.b }[6], [x15] -; CHECK-NEXT: add x15, sp, #416 -; CHECK-NEXT: ld1 { v2.b }[6], [x14] -; CHECK-NEXT: add x14, sp, #240 -; CHECK-NEXT: ld1 { v17.b }[1], [x15] +; CHECK-NEXT: add x9, sp, #72 +; CHECK-NEXT: add x14, sp, #416 +; CHECK-NEXT: ld1 { v2.b }[6], [x13] +; CHECK-NEXT: add x13, sp, #240 +; CHECK-NEXT: ld1 { v16.b }[1], [x14] +; CHECK-NEXT: add x15, sp, #384 +; CHECK-NEXT: ld1 { v3.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #296 -; CHECK-NEXT: add x8, sp, #136 -; CHECK-NEXT: mov v4.b[5], w5 -; CHECK-NEXT: add x13, sp, #384 -; CHECK-NEXT: ld1 { v16.b }[3], [x14] -; CHECK-NEXT: ld1 { v5.b }[2], [x9] -; CHECK-NEXT: ld1 { v1.b }[5], [x13] -; CHECK-NEXT: ld1 { v0.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #424 -; CHECK-NEXT: add x9, sp, #248 -; CHECK-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #304 -; CHECK-NEXT: add x10, sp, #392 -; CHECK-NEXT: ld1 { v16.b }[4], [x9] -; CHECK-NEXT: ld1 { v5.b }[3], [x8] -; CHECK-NEXT: mov v4.b[6], w6 -; CHECK-NEXT: ld1 { v1.b }[6], [x10] +; CHECK-NEXT: ld1 { v6.b }[3], [x13] +; CHECK-NEXT: ld1 { v7.b }[2], [x9] +; CHECK-NEXT: add x10, sp, #136 +; CHECK-NEXT: ld1 { v1.b }[5], [x15] +; CHECK-NEXT: add x9, sp, #424 +; CHECK-NEXT: mov v4.b[4], w4 +; CHECK-NEXT: ld1 { v0.b }[7], [x10] +; CHECK-NEXT: add x10, sp, #248 +; CHECK-NEXT: ld1 { v16.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #304 +; CHECK-NEXT: add x8, sp, #392 +; CHECK-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-NEXT: ld1 { v7.b }[3], [x9] +; CHECK-NEXT: ld1 { v1.b }[6], [x8] ; CHECK-NEXT: add x10, sp, #432 ; CHECK-NEXT: add x9, sp, #256 -; CHECK-NEXT: ld1 { v17.b }[3], [x10] +; CHECK-NEXT: ld1 { v16.b }[3], [x10] ; CHECK-NEXT: add x10, sp, #312 ; CHECK-NEXT: ldr b22, [sp, #608] +; CHECK-NEXT: mov v4.b[5], w5 ; CHECK-NEXT: add x8, sp, #400 -; CHECK-NEXT: ld1 { v16.b }[5], [x9] -; CHECK-NEXT: ld1 { v5.b }[4], [x10] +; CHECK-NEXT: ld1 { v6.b }[5], [x9] +; CHECK-NEXT: ld1 { v7.b }[4], [x10] ; CHECK-NEXT: add x9, sp, #616 ; CHECK-NEXT: ld1 { v1.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #440 ; CHECK-NEXT: ld1 { v22.b }[1], [x9] -; CHECK-NEXT: mov v4.b[7], w7 -; CHECK-NEXT: ld1 { v17.b }[4], [x8] +; CHECK-NEXT: ldr b5, [sp, #208] +; CHECK-NEXT: ld1 { v16.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #320 ; CHECK-NEXT: add x10, sp, #448 -; CHECK-NEXT: ldr b6, [sp, #208] -; CHECK-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #624 -; CHECK-NEXT: ldr b7, [sp, #472] +; CHECK-NEXT: mov v4.b[6], w6 ; CHECK-NEXT: ld1 { v22.b }[2], [x8] -; CHECK-NEXT: ld1 { v17.b }[5], [x10] +; CHECK-NEXT: sshll v18.8h, v5.8b, #0 +; CHECK-NEXT: ldr b5, [sp, #480] +; CHECK-NEXT: ld1 { v16.b }[5], [x10] ; CHECK-NEXT: add x10, sp, #328 -; CHECK-NEXT: sshll v20.8h, v4.8b, #0 -; CHECK-NEXT: ldr b4, [sp, #480] ; CHECK-NEXT: add x8, sp, #456 -; CHECK-NEXT: ld1 { v5.b }[6], [x10] +; CHECK-NEXT: ld1 { v7.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #632 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; 
CHECK-NEXT: add x9, sp, #264 ; CHECK-NEXT: ld1 { v22.b }[3], [x10] ; CHECK-NEXT: add x10, sp, #488 -; CHECK-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-NEXT: mov v4.b[7], w7 +; CHECK-NEXT: ld1 { v16.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #336 -; CHECK-NEXT: ld1 { v4.b }[1], [x10] -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-NEXT: ld1 { v5.b }[1], [x10] +; CHECK-NEXT: ld1 { v7.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #640 -; CHECK-NEXT: add x9, sp, #264 +; CHECK-NEXT: ld1 { v6.b }[6], [x9] ; CHECK-NEXT: ld1 { v22.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #496 -; CHECK-NEXT: ld1 { v16.b }[6], [x9] -; CHECK-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-NEXT: add x9, sp, #272 +; CHECK-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-NEXT: sshll v20.8h, v4.8b, #0 +; CHECK-NEXT: ldr b4, [sp, #472] ; CHECK-NEXT: add x8, sp, #648 -; CHECK-NEXT: smull v18.4s, v6.4h, v7.4h +; CHECK-NEXT: sshll v17.8h, v7.8b, #0 ; CHECK-NEXT: ldr b7, [sp, #544] -; CHECK-NEXT: add x9, sp, #272 -; CHECK-NEXT: movi v6.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v22.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #504 -; CHECK-NEXT: ld1 { v16.b }[7], [x9] -; CHECK-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v5.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #552 -; CHECK-NEXT: add x9, sp, #656 +; CHECK-NEXT: ld1 { v6.b }[7], [x9] ; CHECK-NEXT: ld1 { v7.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #512 -; CHECK-NEXT: ldr b21, [sp, #672] +; CHECK-NEXT: add x9, sp, #656 +; CHECK-NEXT: smull v18.4s, v18.4h, v4.4h +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v22.b }[6], [x9] -; CHECK-NEXT: mov v6.s[0], v18.s[0] -; CHECK-NEXT: add x9, sp, #664 -; CHECK-NEXT: ld1 { v4.b }[4], [x8] +; CHECK-NEXT: ld1 { v5.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #560 -; CHECK-NEXT: sshll v23.8h, v16.8b, #0 +; CHECK-NEXT: ldr b21, [sp, #672] ; CHECK-NEXT: ld1 { v7.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #520 -; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: add x9, sp, #664 +; CHECK-NEXT: sshll v23.8h, v6.8b, #0 ; CHECK-NEXT: ld1 { v22.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #528 -; CHECK-NEXT: add x10, sp, #464 -; CHECK-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-NEXT: ld1 { v5.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #568 -; CHECK-NEXT: smull2 v18.4s, v20.8h, v23.8h +; CHECK-NEXT: mov v4.s[0], v18.s[0] ; CHECK-NEXT: ld1 { v7.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #680 -; CHECK-NEXT: smlal v6.4s, v20.4h, v23.4h +; CHECK-NEXT: movi v19.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v21.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #576 +; CHECK-NEXT: smull2 v18.4s, v20.8h, v23.8h +; CHECK-NEXT: ld1 { v5.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: ldr b25, [sp, #936] +; CHECK-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #688 +; CHECK-NEXT: smlal v4.4s, v20.4h, v23.4h ; CHECK-NEXT: sshll v20.8h, v22.8b, #0 +; CHECK-NEXT: ld1 { v21.b }[2], [x8] ; CHECK-NEXT: ldr b22, [sp, #736] -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #576 ; CHECK-NEXT: ldr b23, [sp, #1000] -; CHECK-NEXT: ld1 { v7.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #688 -; CHECK-NEXT: sshll v24.8h, v22.8b, #0 -; CHECK-NEXT: ld1 { v21.b }[2], [x9] +; CHECK-NEXT: ld1 { v5.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #696 -; CHECK-NEXT: sshll v25.8h, v23.8b, #0 -; CHECK-NEXT: add x8, sp, #536 -; CHECK-NEXT: ldr b22, [sp, #872] -; CHECK-NEXT: ldr b23, [sp, #936] -; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: sshll v24.8h, v22.8b, #0 ; CHECK-NEXT: add x8, sp, #584 -; CHECK-NEXT: ld1 { v17.b }[7], [x10] +; CHECK-NEXT: ldr 
b22, [sp, #872] +; CHECK-NEXT: sshll v23.8h, v23.8b, #0 ; CHECK-NEXT: ld1 { v21.b }[3], [x9] ; CHECK-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #880 ; CHECK-NEXT: add x9, sp, #704 -; CHECK-NEXT: smull v25.4s, v24.4h, v25.4h -; CHECK-NEXT: ldr b24, [sp, #744] +; CHECK-NEXT: add x10, sp, #464 ; CHECK-NEXT: ld1 { v22.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #944 -; CHECK-NEXT: add x10, sp, #888 +; CHECK-NEXT: ld1 { v16.b }[7], [x10] +; CHECK-NEXT: smull v24.4s, v24.4h, v23.4h +; CHECK-NEXT: ldr b23, [sp, #744] ; CHECK-NEXT: ld1 { v21.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #752 -; CHECK-NEXT: ld1 { v23.b }[1], [x8] -; CHECK-NEXT: ld1 { v24.b }[1], [x9] +; CHECK-NEXT: ld1 { v25.b }[1], [x8] +; CHECK-NEXT: add x10, sp, #888 +; CHECK-NEXT: ld1 { v23.b }[1], [x9] ; CHECK-NEXT: add x8, sp, #712 -; CHECK-NEXT: add x9, sp, #760 ; CHECK-NEXT: ld1 { v22.b }[2], [x10] +; CHECK-NEXT: add x9, sp, #760 ; CHECK-NEXT: add x10, sp, #952 -; CHECK-NEXT: mov v19.s[0], v25.s[0] -; CHECK-NEXT: ldr b25, [sp, #808] -; CHECK-NEXT: ld1 { v23.b }[2], [x10] ; CHECK-NEXT: ld1 { v21.b }[5], [x8] -; CHECK-NEXT: ld1 { v24.b }[2], [x9] +; CHECK-NEXT: mov v19.s[0], v24.s[0] +; CHECK-NEXT: ldr b24, [sp, #808] +; CHECK-NEXT: ld1 { v25.b }[2], [x10] +; CHECK-NEXT: ld1 { v23.b }[2], [x9] ; CHECK-NEXT: add x8, sp, #816 ; CHECK-NEXT: add x9, sp, #896 -; CHECK-NEXT: ld1 { v25.b }[1], [x8] +; CHECK-NEXT: ld1 { v24.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #960 ; CHECK-NEXT: ld1 { v22.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #768 -; CHECK-NEXT: ld1 { v23.b }[3], [x8] +; CHECK-NEXT: ld1 { v25.b }[3], [x8] ; CHECK-NEXT: add x10, sp, #904 -; CHECK-NEXT: ld1 { v24.b }[3], [x9] +; CHECK-NEXT: ld1 { v23.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #824 ; CHECK-NEXT: add x8, sp, #720 -; CHECK-NEXT: ld1 { v25.b }[2], [x9] +; CHECK-NEXT: ld1 { v24.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #968 ; CHECK-NEXT: ld1 { v22.b }[4], [x10] ; CHECK-NEXT: add x10, sp, #776 -; CHECK-NEXT: ld1 { v23.b }[4], [x9] +; CHECK-NEXT: ld1 { v25.b }[4], [x9] ; CHECK-NEXT: ld1 { v21.b }[6], [x8] -; CHECK-NEXT: ld1 { v24.b }[4], [x10] +; CHECK-NEXT: ld1 { v23.b }[4], [x10] ; CHECK-NEXT: add x8, sp, #832 ; CHECK-NEXT: add x9, sp, #912 -; CHECK-NEXT: ld1 { v25.b }[3], [x8] +; CHECK-NEXT: ld1 { v24.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #976 ; CHECK-NEXT: ld1 { v22.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #784 -; CHECK-NEXT: ld1 { v23.b }[5], [x8] +; CHECK-NEXT: ld1 { v25.b }[5], [x8] ; CHECK-NEXT: add x10, sp, #920 -; CHECK-NEXT: ld1 { v24.b }[5], [x9] +; CHECK-NEXT: ld1 { v23.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #840 ; CHECK-NEXT: add x8, sp, #728 -; CHECK-NEXT: ld1 { v25.b }[4], [x9] +; CHECK-NEXT: ld1 { v24.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #984 ; CHECK-NEXT: ld1 { v22.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #792 -; CHECK-NEXT: ld1 { v23.b }[6], [x9] +; CHECK-NEXT: ld1 { v25.b }[6], [x9] ; CHECK-NEXT: ld1 { v21.b }[7], [x8] -; CHECK-NEXT: ld1 { v24.b }[6], [x10] +; CHECK-NEXT: ld1 { v23.b }[6], [x10] ; CHECK-NEXT: add x8, sp, #848 ; CHECK-NEXT: add x9, sp, #928 -; CHECK-NEXT: ld1 { v25.b }[5], [x8] -; CHECK-NEXT: add x12, sp, #72 +; CHECK-NEXT: ld1 { v24.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #992 ; CHECK-NEXT: ld1 { v22.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #800 -; CHECK-NEXT: ld1 { v3.b }[7], [x12] -; CHECK-NEXT: ld1 { v23.b }[7], [x8] +; CHECK-NEXT: ld1 { v25.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #592 -; CHECK-NEXT: ld1 { v24.b }[7], [x9] +; CHECK-NEXT: ld1 { v23.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #856 ; CHECK-NEXT: ld1 { v7.b }[6], [x8] +; CHECK-NEXT: ld1 
{ v24.b }[6], [x9] ; CHECK-NEXT: add x11, sp, #200 -; CHECK-NEXT: ld1 { v25.b }[6], [x9] ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 ; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 ; CHECK-NEXT: sshll v21.8h, v21.8b, #0 ; CHECK-NEXT: sshll v22.8h, v22.8b, #0 -; CHECK-NEXT: sshll v23.8h, v23.8b, #0 +; CHECK-NEXT: sshll v25.8h, v25.8b, #0 ; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: sshll v24.8h, v24.8b, #0 +; CHECK-NEXT: sshll v23.8h, v23.8b, #0 ; CHECK-NEXT: add x9, sp, #864 ; CHECK-NEXT: ld1 { v2.b }[7], [x11] ; CHECK-NEXT: ld1 { v7.b }[7], [x8] -; CHECK-NEXT: ld1 { v25.b }[7], [x9] -; CHECK-NEXT: smull v16.4s, v3.4h, v5.4h -; CHECK-NEXT: smull2 v3.4s, v3.8h, v5.8h -; CHECK-NEXT: smull v5.4s, v21.4h, v23.4h -; CHECK-NEXT: smull2 v21.4s, v21.8h, v23.8h -; CHECK-NEXT: smull2 v23.4s, v20.8h, v22.8h -; CHECK-NEXT: smlal v19.4s, v4.4h, v24.4h +; CHECK-NEXT: ld1 { v24.b }[7], [x9] +; CHECK-NEXT: smull v6.4s, v3.4h, v17.4h +; CHECK-NEXT: smull2 v3.4s, v3.8h, v17.8h +; CHECK-NEXT: smull v17.4s, v21.4h, v25.4h +; CHECK-NEXT: smull2 v21.4s, v21.8h, v25.8h +; CHECK-NEXT: smull2 v25.4s, v20.8h, v22.8h +; CHECK-NEXT: smlal v19.4s, v5.4h, v23.4h ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 +; CHECK-NEXT: sshll v16.8h, v16.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: sshll v25.8h, v25.8b, #0 -; CHECK-NEXT: smlal2 v3.4s, v2.8h, v17.8h -; CHECK-NEXT: smlal v16.4s, v2.4h, v17.4h -; CHECK-NEXT: smlal2 v23.4s, v4.8h, v24.8h +; CHECK-NEXT: sshll v24.8h, v24.8b, #0 +; CHECK-NEXT: smlal2 v3.4s, v2.8h, v16.8h +; CHECK-NEXT: smlal v6.4s, v2.4h, v16.4h +; CHECK-NEXT: smlal2 v25.4s, v5.8h, v23.8h ; CHECK-NEXT: smlal2 v18.4s, v0.8h, v1.8h -; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h +; CHECK-NEXT: smlal v4.4s, v0.4h, v1.4h ; CHECK-NEXT: smlal v19.4s, v20.4h, v22.4h -; CHECK-NEXT: smlal2 v21.4s, v7.8h, v25.8h -; CHECK-NEXT: smlal v5.4s, v7.4h, v25.4h +; CHECK-NEXT: smlal2 v21.4s, v7.8h, v24.8h +; CHECK-NEXT: smlal v17.4s, v7.4h, v24.4h ; CHECK-NEXT: add v0.4s, v18.4s, v3.4s -; CHECK-NEXT: add v1.4s, v6.4s, v16.4s -; CHECK-NEXT: add v2.4s, v23.4s, v21.4s -; CHECK-NEXT: add v3.4s, v19.4s, v5.4s +; CHECK-NEXT: add v1.4s, v4.4s, v6.4s +; CHECK-NEXT: add v2.4s, v25.4s, v21.4s +; CHECK-NEXT: add v3.4s, v19.4s, v17.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: add v1.4s, v3.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s @@ -2028,146 +2028,146 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-NEXT: ldr b2, [sp, #144] ; CHECK-NEXT: add x9, sp, #152 ; CHECK-NEXT: ldr b3, [sp, #16] -; CHECK-NEXT: add x12, sp, #32 +; CHECK-NEXT: add x10, sp, #32 ; CHECK-NEXT: ld1 { v1.b }[1], [x8] ; CHECK-NEXT: ld1 { v2.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: add x8, sp, #24 -; CHECK-NEXT: add x11, sp, #112 ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: add x12, sp, #40 ; CHECK-NEXT: ld1 { v3.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #160 -; CHECK-NEXT: ldr b4, [sp, #480] +; CHECK-NEXT: ldr b5, [sp, #480] ; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #104 ; CHECK-NEXT: ld1 { v2.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #168 -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: ld1 { v3.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #40 -; CHECK-NEXT: ldr b5, [sp, #608] -; CHECK-NEXT: ld1 { v1.b }[3], [x9] -; CHECK-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: add x11, sp, #120 +; 
CHECK-NEXT: ldr b4, [sp, #608] +; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #168 ; CHECK-NEXT: mov v0.b[1], w1 +; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: ld1 { v2.b }[3], [x10] ; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: add x14, sp, #184 +; CHECK-NEXT: add x13, sp, #184 ; CHECK-NEXT: ldr b16, [sp, #544] +; CHECK-NEXT: ldr b17, [sp, #672] ; CHECK-NEXT: ld1 { v3.b }[3], [x12] ; CHECK-NEXT: add x12, sp, #176 -; CHECK-NEXT: ldr b17, [sp, #672] -; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #488 +; CHECK-NEXT: add x10, sp, #136 +; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #488 ; CHECK-NEXT: ld1 { v2.b }[4], [x12] -; CHECK-NEXT: ld1 { v4.b }[1], [x11] +; CHECK-NEXT: ld1 { v5.b }[1], [x8] ; CHECK-NEXT: mov v0.b[2], w2 -; CHECK-NEXT: add x11, sp, #192 -; CHECK-NEXT: ld1 { v3.b }[4], [x13] -; CHECK-NEXT: add x13, sp, #616 -; CHECK-NEXT: add x12, sp, #56 -; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: ld1 { v5.b }[1], [x13] -; CHECK-NEXT: add x13, sp, #496 -; CHECK-NEXT: ld1 { v4.b }[2], [x13] -; CHECK-NEXT: ld1 { v2.b }[5], [x14] -; CHECK-NEXT: add x14, sp, #680 -; CHECK-NEXT: ld1 { v17.b }[1], [x14] -; CHECK-NEXT: add x13, sp, #504 -; CHECK-NEXT: ld1 { v3.b }[5], [x12] +; CHECK-NEXT: add x12, sp, #192 +; CHECK-NEXT: ldr b18, [sp, #736] +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: movi v7.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v1.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #616 +; CHECK-NEXT: ld1 { v2.b }[5], [x13] +; CHECK-NEXT: ld1 { v4.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #496 +; CHECK-NEXT: add x13, sp, #680 +; CHECK-NEXT: ld1 { v5.b }[2], [x11] +; CHECK-NEXT: ld1 { v17.b }[1], [x13] +; CHECK-NEXT: add x11, sp, #504 ; CHECK-NEXT: ld1 { v1.b }[6], [x9] ; CHECK-NEXT: add x9, sp, #552 -; CHECK-NEXT: add x12, sp, #688 +; CHECK-NEXT: ld1 { v2.b }[6], [x12] ; CHECK-NEXT: ld1 { v16.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #624 -; CHECK-NEXT: ld1 { v4.b }[3], [x13] -; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: add x12, sp, #688 +; CHECK-NEXT: ld1 { v5.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #560 -; CHECK-NEXT: add x8, sp, #136 ; CHECK-NEXT: ld1 { v17.b }[2], [x12] -; CHECK-NEXT: ld1 { v5.b }[2], [x9] -; CHECK-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-NEXT: ld1 { v16.b }[2], [x11] -; CHECK-NEXT: add x8, sp, #512 +; CHECK-NEXT: ld1 { v4.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #512 ; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #568 -; CHECK-NEXT: add x9, sp, #696 +; CHECK-NEXT: ld1 { v16.b }[2], [x11] +; CHECK-NEXT: ld1 { v1.b }[7], [x10] +; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v5.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #568 ; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v17.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #520 -; CHECK-NEXT: ld1 { v16.b }[3], [x8] -; CHECK-NEXT: ld1 { v5.b }[3], [x11] -; CHECK-NEXT: add x8, sp, #640 -; CHECK-NEXT: ld1 { v4.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #576 +; CHECK-NEXT: ld1 { v17.b }[3], [x10] +; CHECK-NEXT: ld1 { v4.b }[3], [x11] +; CHECK-NEXT: add x10, sp, #520 +; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #640 ; CHECK-NEXT: add x11, sp, #704 -; CHECK-NEXT: ldr b18, [sp, #736] +; CHECK-NEXT: ld1 { v5.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #576 ; CHECK-NEXT: mov v0.b[4], w4 ; CHECK-NEXT: ld1 { v17.b }[4], [x11] -; CHECK-NEXT: ld1 { v16.b }[4], [x9] -; CHECK-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-NEXT: add x9, sp, #528 +; CHECK-NEXT: ld1 { v4.b }[4], [x9] ; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; 
CHECK-NEXT: add x8, sp, #648 +; CHECK-NEXT: ld1 { v16.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #528 +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x9, sp, #648 ; CHECK-NEXT: add x11, sp, #584 ; CHECK-NEXT: add x12, sp, #712 -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v16.b }[5], [x11] +; CHECK-NEXT: ld1 { v5.b }[6], [x10] ; CHECK-NEXT: ld1 { v17.b }[5], [x12] -; CHECK-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: ld1 { v16.b }[5], [x11] +; CHECK-NEXT: add x14, sp, #56 ; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: add x10, sp, #536 ; CHECK-NEXT: sshll v18.4s, v18.4h, #0 -; CHECK-NEXT: add x8, sp, #656 +; CHECK-NEXT: ld1 { v3.b }[5], [x14] +; CHECK-NEXT: add x9, sp, #656 ; CHECK-NEXT: add x11, sp, #592 ; CHECK-NEXT: add x12, sp, #720 -; CHECK-NEXT: ld1 { v4.b }[7], [x9] +; CHECK-NEXT: ld1 { v5.b }[7], [x10] ; CHECK-NEXT: ld1 { v16.b }[6], [x11] ; CHECK-NEXT: ld1 { v17.b }[6], [x12] -; CHECK-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-NEXT: ld1 { v4.b }[6], [x9] ; CHECK-NEXT: ldr b6, [sp, #208] -; CHECK-NEXT: add x10, sp, #64 +; CHECK-NEXT: add x8, sp, #64 ; CHECK-NEXT: mov v7.s[0], v18.s[0] ; CHECK-NEXT: mov v0.b[6], w6 -; CHECK-NEXT: ld1 { v3.b }[6], [x10] +; CHECK-NEXT: ld1 { v3.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #664 ; CHECK-NEXT: add x9, sp, #600 ; CHECK-NEXT: add x10, sp, #728 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0 ; CHECK-NEXT: ld1 { v16.b }[7], [x9] ; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-NEXT: ld1 { v4.b }[7], [x8] ; CHECK-NEXT: movi v18.2d, #0000000000000000 ; CHECK-NEXT: mov v0.b[7], w7 ; CHECK-NEXT: add x9, sp, #200 ; CHECK-NEXT: add x10, sp, #72 -; CHECK-NEXT: saddw v7.4s, v7.4s, v4.4h +; CHECK-NEXT: saddw v7.4s, v7.4s, v5.4h ; CHECK-NEXT: sshll v6.4s, v6.4h, #0 ; CHECK-NEXT: sshll v16.8h, v16.8b, #0 ; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 ; CHECK-NEXT: ld1 { v2.b }[7], [x9] ; CHECK-NEXT: ld1 { v3.b }[7], [x10] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: mov v18.s[0], v6.s[0] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: saddl2 v6.4s, v17.8h, v16.8h -; CHECK-NEXT: saddl2 v4.4s, v5.8h, v4.8h +; CHECK-NEXT: saddl2 v5.4s, v4.8h, v5.8h ; CHECK-NEXT: saddl v16.4s, v17.4h, v16.4h -; CHECK-NEXT: saddw v5.4s, v7.4s, v5.4h +; CHECK-NEXT: saddw v4.4s, v7.4s, v4.4h ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 ; CHECK-NEXT: saddl2 v17.4s, v0.8h, v1.8h ; CHECK-NEXT: saddw v0.4s, v18.4s, v0.4h ; CHECK-NEXT: saddl2 v7.4s, v3.8h, v2.8h -; CHECK-NEXT: add v4.4s, v4.4s, v6.4s +; CHECK-NEXT: add v5.4s, v5.4s, v6.4s ; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h -; CHECK-NEXT: add v5.4s, v5.4s, v16.4s +; CHECK-NEXT: add v4.4s, v4.4s, v16.4s ; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: add v6.4s, v17.4s, v7.4s -; CHECK-NEXT: add v1.4s, v5.4s, v4.4s +; CHECK-NEXT: add v1.4s, v4.4s, v5.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v1.4s, v6.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s @@ -2265,380 +2265,380 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldr b3, [sp, #592] +; CHECK-NEXT: ldr b6, [sp, #592] ; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: ldr b6, [sp, #208] +; CHECK-NEXT: ldr b16, [sp, #208] ; CHECK-NEXT: ldr b0, [sp, #336] ; CHECK-NEXT: add x9, sp, #344 -; CHECK-NEXT: ldr b2, [sp, #464] -; CHECK-NEXT: ld1 { v3.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #216 -; CHECK-NEXT: add x10, sp, #624 +; CHECK-NEXT: ldr b1, [sp, #464] ; CHECK-NEXT: ld1 { v6.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #216 +; CHECK-NEXT: add x10, sp, #472 +; CHECK-NEXT: ld1 { v16.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #608 ; CHECK-NEXT: ld1 { v0.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #232 -; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: ldr b7, [sp, #1360] -; CHECK-NEXT: ld1 { v3.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #224 -; CHECK-NEXT: add x11, sp, #648 +; CHECK-NEXT: ld1 { v1.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #240 ; CHECK-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #224 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: ld1 { v16.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #616 +; CHECK-NEXT: ldr b5, [sp, #1360] ; CHECK-NEXT: add x12, sp, #376 -; CHECK-NEXT: mov v1.b[1], w1 -; CHECK-NEXT: ldr b16, [sp, #976] +; CHECK-NEXT: add x11, sp, #656 +; CHECK-NEXT: ldr b7, [sp, #976] +; CHECK-NEXT: ld1 { v6.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #624 +; CHECK-NEXT: mov v2.b[1], w1 +; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #632 ; CHECK-NEXT: add x14, sp, #288 -; CHECK-NEXT: ld1 { v3.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #632 ; CHECK-NEXT: add x15, sp, #408 -; CHECK-NEXT: ld1 { v6.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #472 ; CHECK-NEXT: add x13, sp, #696 -; CHECK-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #240 ; CHECK-NEXT: add x16, sp, #448 -; CHECK-NEXT: ld1 { v3.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #352 -; CHECK-NEXT: mov v1.b[2], w2 -; CHECK-NEXT: ld1 { v6.b }[4], [x9] -; CHECK-NEXT: ld1 { v0.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #1368 -; CHECK-NEXT: ld1 { v7.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #248 -; CHECK-NEXT: add x9, sp, #640 -; CHECK-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #656 -; CHECK-NEXT: movi v5.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v6.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #360 -; CHECK-NEXT: mov v1.b[3], w3 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #256 +; CHECK-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #352 ; CHECK-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v3.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #368 +; CHECK-NEXT: ld1 { v16.b }[4], [x10] +; CHECK-NEXT: ld1 { v0.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #1368 +; CHECK-NEXT: ld1 { v5.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #248 +; CHECK-NEXT: add x10, sp, #640 +; CHECK-NEXT: ld1 { v6.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #648 +; CHECK-NEXT: mov v2.b[2], w2 +; CHECK-NEXT: ld1 { v16.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #360 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v0.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #256 ; CHECK-NEXT: ldr b17, [sp, #720] ; CHECK-NEXT: ld1 { v6.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #984 -; CHECK-NEXT: ld1 { v0.b }[4], [x9] -; CHECK-NEXT: ld1 { v16.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #664 -; CHECK-NEXT: ld1 { v3.b }[7], [x11] -; CHECK-NEXT: add x11, sp, #264 -; CHECK-NEXT: mov v1.b[4], w4 -; CHECK-NEXT: ld1 { v6.b }[7], [x11] -; CHECK-NEXT: add x9, sp, #672 -; CHECK-NEXT: add x11, sp, #680 +; CHECK-NEXT: add 
x10, sp, #368 +; CHECK-NEXT: ld1 { v16.b }[6], [x8] +; CHECK-NEXT: mov v2.b[3], w3 +; CHECK-NEXT: add x8, sp, #984 +; CHECK-NEXT: ld1 { v0.b }[4], [x10] +; CHECK-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #664 +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #264 +; CHECK-NEXT: add x10, sp, #672 +; CHECK-NEXT: ld1 { v16.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #680 ; CHECK-NEXT: ld1 { v0.b }[5], [x12] ; CHECK-NEXT: add x12, sp, #480 -; CHECK-NEXT: ld1 { v2.b }[2], [x12] +; CHECK-NEXT: mov v2.b[4], w4 +; CHECK-NEXT: ld1 { v1.b }[2], [x12] ; CHECK-NEXT: add x12, sp, #272 -; CHECK-NEXT: ld1 { v3.b }[8], [x8] -; CHECK-NEXT: ld1 { v6.b }[8], [x12] +; CHECK-NEXT: ld1 { v6.b }[8], [x11] +; CHECK-NEXT: ld1 { v16.b }[8], [x12] ; CHECK-NEXT: add x12, sp, #384 -; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: add x11, sp, #688 ; CHECK-NEXT: ld1 { v0.b }[6], [x12] ; CHECK-NEXT: add x12, sp, #280 -; CHECK-NEXT: add x8, sp, #688 -; CHECK-NEXT: ld1 { v3.b }[9], [x10] -; CHECK-NEXT: add x10, sp, #1376 -; CHECK-NEXT: ld1 { v7.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #392 -; CHECK-NEXT: ld1 { v6.b }[9], [x12] -; CHECK-NEXT: ld1 { v0.b }[7], [x10] -; CHECK-NEXT: mov v1.b[6], w6 +; CHECK-NEXT: ld1 { v6.b }[9], [x8] +; CHECK-NEXT: add x8, sp, #1376 +; CHECK-NEXT: mov v2.b[5], w5 +; CHECK-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #392 +; CHECK-NEXT: ld1 { v16.b }[9], [x12] +; CHECK-NEXT: ld1 { v0.b }[7], [x8] ; CHECK-NEXT: add x12, sp, #704 -; CHECK-NEXT: ld1 { v3.b }[10], [x9] -; CHECK-NEXT: add x9, sp, #400 -; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: ld1 { v6.b }[10], [x14] +; CHECK-NEXT: add x8, sp, #712 +; CHECK-NEXT: ld1 { v6.b }[10], [x10] +; CHECK-NEXT: add x10, sp, #400 +; CHECK-NEXT: ld1 { v16.b }[10], [x14] ; CHECK-NEXT: add x14, sp, #992 -; CHECK-NEXT: ld1 { v0.b }[8], [x9] -; CHECK-NEXT: ld1 { v16.b }[2], [x14] +; CHECK-NEXT: mov v2.b[6], w6 +; CHECK-NEXT: ld1 { v0.b }[8], [x10] +; CHECK-NEXT: ld1 { v7.b }[2], [x14] ; CHECK-NEXT: add x14, sp, #296 -; CHECK-NEXT: ld1 { v3.b }[11], [x11] -; CHECK-NEXT: add x9, sp, #304 -; CHECK-NEXT: add x11, sp, #312 -; CHECK-NEXT: ld1 { v6.b }[11], [x14] -; CHECK-NEXT: mov v1.b[7], w7 +; CHECK-NEXT: ld1 { v6.b }[11], [x9] +; CHECK-NEXT: add x10, sp, #304 +; CHECK-NEXT: add x9, sp, #312 +; CHECK-NEXT: ld1 { v16.b }[11], [x14] ; CHECK-NEXT: add x14, sp, #320 ; CHECK-NEXT: ld1 { v0.b }[9], [x15] +; CHECK-NEXT: mov v2.b[7], w7 ; CHECK-NEXT: add x15, sp, #328 -; CHECK-NEXT: ld1 { v3.b }[12], [x8] -; CHECK-NEXT: add x8, sp, #416 -; CHECK-NEXT: ld1 { v6.b }[12], [x9] -; CHECK-NEXT: add x9, sp, #1384 -; CHECK-NEXT: ld1 { v0.b }[10], [x8] -; CHECK-NEXT: ld1 { v7.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #424 -; CHECK-NEXT: ld1 { v3.b }[13], [x13] -; CHECK-NEXT: add x8, sp, #432 +; CHECK-NEXT: ld1 { v6.b }[12], [x11] +; CHECK-NEXT: add x11, sp, #416 +; CHECK-NEXT: ld1 { v16.b }[12], [x10] +; CHECK-NEXT: add x10, sp, #1384 +; CHECK-NEXT: ld1 { v0.b }[10], [x11] +; CHECK-NEXT: ld1 { v5.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #424 +; CHECK-NEXT: ld1 { v6.b }[13], [x13] +; CHECK-NEXT: add x11, sp, #432 ; CHECK-NEXT: add x13, sp, #440 -; CHECK-NEXT: ld1 { v6.b }[13], [x11] -; CHECK-NEXT: add x11, sp, #16 -; CHECK-NEXT: ld1 { v0.b }[11], [x9] -; CHECK-NEXT: add x9, sp, #1000 -; CHECK-NEXT: ld1 { v1.b }[8], [x11] -; CHECK-NEXT: ld1 { v16.b }[3], [x9] -; CHECK-NEXT: ld1 { v3.b }[14], [x12] +; CHECK-NEXT: ld1 { v16.b }[13], [x9] +; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ld1 { v0.b }[11], [x10] +; CHECK-NEXT: add x10, sp, #1000 +; CHECK-NEXT: 
ld1 { v2.b }[8], [x9] +; CHECK-NEXT: ld1 { v7.b }[3], [x10] +; CHECK-NEXT: ld1 { v6.b }[14], [x12] ; CHECK-NEXT: add x12, sp, #488 -; CHECK-NEXT: ld1 { v6.b }[14], [x14] +; CHECK-NEXT: ld1 { v16.b }[14], [x14] ; CHECK-NEXT: add x14, sp, #1392 -; CHECK-NEXT: ld1 { v2.b }[3], [x12] -; CHECK-NEXT: ld1 { v7.b }[4], [x14] -; CHECK-NEXT: add x11, sp, #1008 -; CHECK-NEXT: ld1 { v0.b }[12], [x8] -; CHECK-NEXT: ld1 { v16.b }[4], [x11] -; CHECK-NEXT: add x8, sp, #1400 -; CHECK-NEXT: ld1 { v3.b }[15], [x10] -; CHECK-NEXT: add x10, sp, #496 -; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: ld1 { v6.b }[15], [x15] +; CHECK-NEXT: ld1 { v1.b }[3], [x12] +; CHECK-NEXT: ld1 { v5.b }[4], [x14] +; CHECK-NEXT: add x9, sp, #1008 +; CHECK-NEXT: ld1 { v0.b }[12], [x11] +; CHECK-NEXT: ld1 { v7.b }[4], [x9] +; CHECK-NEXT: add x11, sp, #1400 +; CHECK-NEXT: ld1 { v6.b }[15], [x8] +; CHECK-NEXT: add x8, sp, #496 +; CHECK-NEXT: add x10, sp, #24 +; CHECK-NEXT: add x9, sp, #1408 +; CHECK-NEXT: ld1 { v5.b }[5], [x11] +; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #1016 ; CHECK-NEXT: ld1 { v7.b }[5], [x8] -; CHECK-NEXT: ld1 { v2.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #1016 -; CHECK-NEXT: ld1 { v16.b }[5], [x10] ; CHECK-NEXT: ld1 { v0.b }[13], [x13] -; CHECK-NEXT: add x8, sp, #1408 -; CHECK-NEXT: ld1 { v1.b }[9], [x9] -; CHECK-NEXT: add x9, sp, #504 +; CHECK-NEXT: ld1 { v2.b }[9], [x10] +; CHECK-NEXT: add x8, sp, #504 ; CHECK-NEXT: add x10, sp, #512 +; CHECK-NEXT: ld1 { v16.b }[15], [x15] +; CHECK-NEXT: ld1 { v5.b }[6], [x9] +; CHECK-NEXT: ld1 { v1.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #1024 +; CHECK-NEXT: add x9, sp, #32 ; CHECK-NEXT: ld1 { v7.b }[6], [x8] -; CHECK-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #1024 -; CHECK-NEXT: add x8, sp, #32 -; CHECK-NEXT: ld1 { v16.b }[6], [x9] ; CHECK-NEXT: ld1 { v0.b }[14], [x16] -; CHECK-NEXT: ld1 { v1.b }[10], [x8] -; CHECK-NEXT: add x8, sp, #1416 -; CHECK-NEXT: add x9, sp, #456 -; CHECK-NEXT: ld1 { v7.b }[7], [x8] -; CHECK-NEXT: ld1 { v2.b }[6], [x10] +; CHECK-NEXT: ld1 { v2.b }[10], [x9] +; CHECK-NEXT: add x9, sp, #1416 +; CHECK-NEXT: add x8, sp, #456 +; CHECK-NEXT: ld1 { v5.b }[7], [x9] +; CHECK-NEXT: ld1 { v1.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #1032 -; CHECK-NEXT: add x8, sp, #40 -; CHECK-NEXT: ld1 { v16.b }[7], [x10] -; CHECK-NEXT: ld1 { v0.b }[15], [x9] -; CHECK-NEXT: ld1 { v1.b }[11], [x8] -; CHECK-NEXT: add x8, sp, #1424 -; CHECK-NEXT: add x9, sp, #520 +; CHECK-NEXT: add x9, sp, #40 +; CHECK-NEXT: ld1 { v7.b }[7], [x10] +; CHECK-NEXT: ld1 { v0.b }[15], [x8] +; CHECK-NEXT: ld1 { v2.b }[11], [x9] +; CHECK-NEXT: add x9, sp, #1424 +; CHECK-NEXT: add x8, sp, #520 +; CHECK-NEXT: ld1 { v5.b }[8], [x9] +; CHECK-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #1040 +; CHECK-NEXT: add x9, sp, #48 ; CHECK-NEXT: ld1 { v7.b }[8], [x8] -; CHECK-NEXT: ld1 { v2.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #1040 -; CHECK-NEXT: add x8, sp, #48 -; CHECK-NEXT: ld1 { v16.b }[8], [x9] ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: ld1 { v1.b }[12], [x8] -; CHECK-NEXT: add x8, sp, #1432 -; CHECK-NEXT: sdot v5.4s, v6.16b, v3.16b -; CHECK-NEXT: ld1 { v7.b }[9], [x8] -; CHECK-NEXT: ld1 { v2.b }[8], [x10] -; CHECK-NEXT: add x8, sp, #1048 -; CHECK-NEXT: ldr b3, [sp, #80] -; CHECK-NEXT: ld1 { v16.b }[9], [x8] +; CHECK-NEXT: ld1 { v2.b }[12], [x9] +; CHECK-NEXT: add x9, sp, #1432 +; CHECK-NEXT: sdot v4.4s, v16.16b, v6.16b +; CHECK-NEXT: ld1 { v5.b }[9], [x9] +; CHECK-NEXT: ld1 { v1.b }[8], [x10] +; CHECK-NEXT: add x9, sp, #1048 +; CHECK-NEXT: ldr b6, [sp, #80] +; 
CHECK-NEXT: ld1 { v7.b }[9], [x9] +; CHECK-NEXT: add x8, sp, #56 ; CHECK-NEXT: add x10, sp, #88 -; CHECK-NEXT: add x8, sp, #536 +; CHECK-NEXT: add x9, sp, #536 ; CHECK-NEXT: add x11, sp, #1440 -; CHECK-NEXT: add x9, sp, #56 -; CHECK-NEXT: ld1 { v3.b }[1], [x10] -; CHECK-NEXT: ld1 { v2.b }[9], [x8] +; CHECK-NEXT: ld1 { v6.b }[1], [x10] +; CHECK-NEXT: ld1 { v2.b }[13], [x8] +; CHECK-NEXT: ld1 { v1.b }[9], [x9] ; CHECK-NEXT: add x8, sp, #1056 -; CHECK-NEXT: ld1 { v7.b }[10], [x11] -; CHECK-NEXT: ld1 { v16.b }[10], [x8] -; CHECK-NEXT: ld1 { v1.b }[13], [x9] +; CHECK-NEXT: ld1 { v5.b }[10], [x11] ; CHECK-NEXT: add x9, sp, #96 +; CHECK-NEXT: ld1 { v7.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #544 ; CHECK-NEXT: add x10, sp, #1448 -; CHECK-NEXT: ld1 { v3.b }[2], [x9] -; CHECK-NEXT: ld1 { v2.b }[10], [x8] +; CHECK-NEXT: ld1 { v6.b }[2], [x9] +; CHECK-NEXT: ld1 { v1.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #1064 -; CHECK-NEXT: ld1 { v7.b }[11], [x10] -; CHECK-NEXT: ld1 { v16.b }[11], [x8] +; CHECK-NEXT: ld1 { v5.b }[11], [x10] ; CHECK-NEXT: add x10, sp, #104 -; CHECK-NEXT: add x8, sp, #552 ; CHECK-NEXT: add x11, sp, #1456 +; CHECK-NEXT: ld1 { v7.b }[11], [x8] +; CHECK-NEXT: add x8, sp, #552 ; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ld1 { v3.b }[3], [x10] -; CHECK-NEXT: ld1 { v2.b }[11], [x8] +; CHECK-NEXT: ld1 { v6.b }[3], [x10] +; CHECK-NEXT: ld1 { v1.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #1072 -; CHECK-NEXT: ld1 { v7.b }[12], [x11] -; CHECK-NEXT: ld1 { v16.b }[12], [x8] -; CHECK-NEXT: ld1 { v1.b }[14], [x9] +; CHECK-NEXT: ld1 { v5.b }[12], [x11] +; CHECK-NEXT: ld1 { v2.b }[14], [x9] ; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v7.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #560 ; CHECK-NEXT: add x10, sp, #1464 -; CHECK-NEXT: ld1 { v3.b }[4], [x9] -; CHECK-NEXT: ld1 { v2.b }[12], [x8] +; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: ld1 { v1.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #1080 -; CHECK-NEXT: ld1 { v7.b }[13], [x10] -; CHECK-NEXT: ld1 { v16.b }[13], [x8] +; CHECK-NEXT: ld1 { v5.b }[13], [x10] ; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: add x8, sp, #568 ; CHECK-NEXT: add x11, sp, #1472 +; CHECK-NEXT: ld1 { v7.b }[13], [x8] +; CHECK-NEXT: add x8, sp, #568 ; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: ld1 { v3.b }[5], [x10] -; CHECK-NEXT: ld1 { v2.b }[13], [x8] +; CHECK-NEXT: ld1 { v6.b }[5], [x10] +; CHECK-NEXT: ld1 { v1.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #1088 -; CHECK-NEXT: ld1 { v7.b }[14], [x11] -; CHECK-NEXT: ld1 { v16.b }[14], [x8] -; CHECK-NEXT: ld1 { v1.b }[15], [x9] +; CHECK-NEXT: ld1 { v5.b }[14], [x11] +; CHECK-NEXT: ld1 { v2.b }[15], [x9] ; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: ldr b6, [sp, #1104] +; CHECK-NEXT: ld1 { v7.b }[14], [x8] +; CHECK-NEXT: ldr b16, [sp, #1104] ; CHECK-NEXT: add x10, sp, #1480 -; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: ld1 { v6.b }[6], [x9] ; CHECK-NEXT: add x8, sp, #1096 ; CHECK-NEXT: add x9, sp, #1112 -; CHECK-NEXT: ld1 { v7.b }[15], [x10] -; CHECK-NEXT: ld1 { v16.b }[15], [x8] -; CHECK-NEXT: ld1 { v6.b }[1], [x9] -; CHECK-NEXT: add x8, sp, #728 +; CHECK-NEXT: ld1 { v5.b }[15], [x10] +; CHECK-NEXT: ld1 { v16.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #576 +; CHECK-NEXT: ld1 { v7.b }[15], [x8] +; CHECK-NEXT: add x8, sp, #728 ; CHECK-NEXT: add x10, sp, #136 ; CHECK-NEXT: ld1 { v17.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #1120 -; CHECK-NEXT: ld1 { v2.b }[14], [x9] -; CHECK-NEXT: sdot v4.4s, v16.16b, v7.16b -; CHECK-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-NEXT: ld1 { v1.b }[14], [x9] +; CHECK-NEXT: ld1 { v16.b }[2], [x8] ; CHECK-NEXT: 
add x8, sp, #736 -; CHECK-NEXT: ldr b7, [sp, #1232] -; CHECK-NEXT: ldr b16, [sp, #848] -; CHECK-NEXT: ld1 { v3.b }[7], [x10] +; CHECK-NEXT: ld1 { v6.b }[7], [x10] +; CHECK-NEXT: sdot v3.4s, v7.16b, v5.16b +; CHECK-NEXT: ldr b5, [sp, #1232] +; CHECK-NEXT: ldr b7, [sp, #848] ; CHECK-NEXT: ld1 { v17.b }[2], [x8] ; CHECK-NEXT: add x9, sp, #1240 ; CHECK-NEXT: add x10, sp, #856 -; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: ld1 { v16.b }[1], [x10] +; CHECK-NEXT: ld1 { v5.b }[1], [x9] +; CHECK-NEXT: ld1 { v7.b }[1], [x10] ; CHECK-NEXT: add x8, sp, #1128 ; CHECK-NEXT: add x11, sp, #744 -; CHECK-NEXT: ld1 { v6.b }[3], [x8] +; CHECK-NEXT: ld1 { v16.b }[3], [x8] ; CHECK-NEXT: add x10, sp, #1248 ; CHECK-NEXT: ld1 { v17.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #864 ; CHECK-NEXT: add x9, sp, #144 -; CHECK-NEXT: ld1 { v7.b }[2], [x10] -; CHECK-NEXT: ld1 { v16.b }[2], [x11] +; CHECK-NEXT: ld1 { v5.b }[2], [x10] +; CHECK-NEXT: ld1 { v7.b }[2], [x11] ; CHECK-NEXT: add x8, sp, #1136 ; CHECK-NEXT: add x12, sp, #752 -; CHECK-NEXT: ld1 { v3.b }[8], [x9] -; CHECK-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-NEXT: ld1 { v6.b }[8], [x9] +; CHECK-NEXT: ld1 { v16.b }[4], [x8] ; CHECK-NEXT: ld1 { v17.b }[4], [x12] ; CHECK-NEXT: add x9, sp, #1256 ; CHECK-NEXT: add x10, sp, #872 -; CHECK-NEXT: ld1 { v7.b }[3], [x9] -; CHECK-NEXT: ld1 { v16.b }[3], [x10] +; CHECK-NEXT: ld1 { v5.b }[3], [x9] +; CHECK-NEXT: ld1 { v7.b }[3], [x10] ; CHECK-NEXT: add x8, sp, #1144 ; CHECK-NEXT: add x11, sp, #760 -; CHECK-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-NEXT: ld1 { v16.b }[5], [x8] ; CHECK-NEXT: add x10, sp, #1264 ; CHECK-NEXT: ld1 { v17.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #880 ; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v7.b }[4], [x10] -; CHECK-NEXT: ld1 { v16.b }[4], [x11] +; CHECK-NEXT: ld1 { v5.b }[4], [x10] +; CHECK-NEXT: ld1 { v7.b }[4], [x11] ; CHECK-NEXT: add x8, sp, #1152 ; CHECK-NEXT: add x12, sp, #768 -; CHECK-NEXT: ld1 { v3.b }[9], [x9] -; CHECK-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-NEXT: ld1 { v6.b }[9], [x9] +; CHECK-NEXT: ld1 { v16.b }[6], [x8] ; CHECK-NEXT: ld1 { v17.b }[6], [x12] ; CHECK-NEXT: add x9, sp, #1272 ; CHECK-NEXT: add x10, sp, #888 -; CHECK-NEXT: ld1 { v7.b }[5], [x9] -; CHECK-NEXT: ld1 { v16.b }[5], [x10] +; CHECK-NEXT: ld1 { v5.b }[5], [x9] +; CHECK-NEXT: ld1 { v7.b }[5], [x10] ; CHECK-NEXT: add x8, sp, #1160 ; CHECK-NEXT: add x11, sp, #776 -; CHECK-NEXT: ld1 { v6.b }[7], [x8] +; CHECK-NEXT: ld1 { v16.b }[7], [x8] ; CHECK-NEXT: add x10, sp, #1280 ; CHECK-NEXT: ld1 { v17.b }[7], [x11] ; CHECK-NEXT: add x11, sp, #896 ; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-NEXT: ld1 { v16.b }[6], [x11] +; CHECK-NEXT: ld1 { v5.b }[6], [x10] +; CHECK-NEXT: ld1 { v7.b }[6], [x11] ; CHECK-NEXT: add x8, sp, #1168 ; CHECK-NEXT: add x12, sp, #784 -; CHECK-NEXT: ld1 { v3.b }[10], [x9] -; CHECK-NEXT: ld1 { v6.b }[8], [x8] +; CHECK-NEXT: ld1 { v6.b }[10], [x9] +; CHECK-NEXT: ld1 { v16.b }[8], [x8] ; CHECK-NEXT: ld1 { v17.b }[8], [x12] ; CHECK-NEXT: add x9, sp, #1288 ; CHECK-NEXT: add x10, sp, #904 -; CHECK-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-NEXT: ld1 { v16.b }[7], [x10] +; CHECK-NEXT: ld1 { v5.b }[7], [x9] +; CHECK-NEXT: ld1 { v7.b }[7], [x10] ; CHECK-NEXT: add x8, sp, #1176 ; CHECK-NEXT: add x11, sp, #792 -; CHECK-NEXT: ld1 { v6.b }[9], [x8] +; CHECK-NEXT: ld1 { v16.b }[9], [x8] ; CHECK-NEXT: add x10, sp, #1296 ; CHECK-NEXT: ld1 { v17.b }[9], [x11] ; CHECK-NEXT: add x11, sp, #912 ; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: ld1 { v7.b }[8], [x10] -; CHECK-NEXT: ld1 { v16.b }[8], [x11] 
+; CHECK-NEXT: ld1 { v5.b }[8], [x10] +; CHECK-NEXT: ld1 { v7.b }[8], [x11] ; CHECK-NEXT: add x8, sp, #1184 ; CHECK-NEXT: add x12, sp, #800 -; CHECK-NEXT: ld1 { v3.b }[11], [x9] -; CHECK-NEXT: ld1 { v6.b }[10], [x8] +; CHECK-NEXT: ld1 { v6.b }[11], [x9] +; CHECK-NEXT: ld1 { v16.b }[10], [x8] ; CHECK-NEXT: ld1 { v17.b }[10], [x12] ; CHECK-NEXT: add x9, sp, #1304 ; CHECK-NEXT: add x10, sp, #920 -; CHECK-NEXT: ld1 { v7.b }[9], [x9] -; CHECK-NEXT: ld1 { v16.b }[9], [x10] +; CHECK-NEXT: ld1 { v5.b }[9], [x9] +; CHECK-NEXT: ld1 { v7.b }[9], [x10] ; CHECK-NEXT: add x8, sp, #1192 ; CHECK-NEXT: add x11, sp, #808 -; CHECK-NEXT: ld1 { v6.b }[11], [x8] +; CHECK-NEXT: ld1 { v16.b }[11], [x8] ; CHECK-NEXT: add x10, sp, #1312 ; CHECK-NEXT: ld1 { v17.b }[11], [x11] ; CHECK-NEXT: add x11, sp, #928 ; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: ld1 { v7.b }[10], [x10] -; CHECK-NEXT: ld1 { v16.b }[10], [x11] +; CHECK-NEXT: ld1 { v5.b }[10], [x10] +; CHECK-NEXT: ld1 { v7.b }[10], [x11] ; CHECK-NEXT: add x8, sp, #1200 ; CHECK-NEXT: add x12, sp, #816 -; CHECK-NEXT: ld1 { v3.b }[12], [x9] -; CHECK-NEXT: ld1 { v6.b }[12], [x8] +; CHECK-NEXT: ld1 { v6.b }[12], [x9] +; CHECK-NEXT: ld1 { v16.b }[12], [x8] ; CHECK-NEXT: ld1 { v17.b }[12], [x12] ; CHECK-NEXT: add x9, sp, #1320 ; CHECK-NEXT: add x10, sp, #936 -; CHECK-NEXT: ld1 { v7.b }[11], [x9] -; CHECK-NEXT: ld1 { v16.b }[11], [x10] +; CHECK-NEXT: ld1 { v5.b }[11], [x9] +; CHECK-NEXT: ld1 { v7.b }[11], [x10] ; CHECK-NEXT: add x8, sp, #1208 ; CHECK-NEXT: add x11, sp, #824 -; CHECK-NEXT: ld1 { v6.b }[13], [x8] +; CHECK-NEXT: ld1 { v16.b }[13], [x8] ; CHECK-NEXT: add x10, sp, #1328 ; CHECK-NEXT: ld1 { v17.b }[13], [x11] ; CHECK-NEXT: add x11, sp, #944 ; CHECK-NEXT: add x9, sp, #184 -; CHECK-NEXT: ld1 { v7.b }[12], [x10] -; CHECK-NEXT: ld1 { v16.b }[12], [x11] +; CHECK-NEXT: ld1 { v5.b }[12], [x10] +; CHECK-NEXT: ld1 { v7.b }[12], [x11] ; CHECK-NEXT: add x8, sp, #1216 ; CHECK-NEXT: add x12, sp, #832 -; CHECK-NEXT: ld1 { v3.b }[13], [x9] -; CHECK-NEXT: ld1 { v6.b }[14], [x8] +; CHECK-NEXT: ld1 { v6.b }[13], [x9] +; CHECK-NEXT: ld1 { v16.b }[14], [x8] ; CHECK-NEXT: ld1 { v17.b }[14], [x12] ; CHECK-NEXT: add x9, sp, #1336 ; CHECK-NEXT: add x10, sp, #952 -; CHECK-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-NEXT: ld1 { v16.b }[13], [x10] +; CHECK-NEXT: ld1 { v5.b }[13], [x9] +; CHECK-NEXT: ld1 { v7.b }[13], [x10] ; CHECK-NEXT: add x8, sp, #1224 ; CHECK-NEXT: add x11, sp, #840 -; CHECK-NEXT: ld1 { v6.b }[15], [x8] +; CHECK-NEXT: ld1 { v16.b }[15], [x8] ; CHECK-NEXT: add x8, sp, #192 ; CHECK-NEXT: ld1 { v17.b }[15], [x11] ; CHECK-NEXT: add x10, sp, #1344 ; CHECK-NEXT: add x11, sp, #960 -; CHECK-NEXT: ld1 { v3.b }[14], [x8] -; CHECK-NEXT: ld1 { v7.b }[14], [x10] -; CHECK-NEXT: ld1 { v16.b }[14], [x11] +; CHECK-NEXT: ld1 { v6.b }[14], [x8] +; CHECK-NEXT: ld1 { v5.b }[14], [x10] +; CHECK-NEXT: ld1 { v7.b }[14], [x11] ; CHECK-NEXT: add x9, sp, #584 -; CHECK-NEXT: sdot v5.4s, v1.16b, v0.16b +; CHECK-NEXT: sdot v4.4s, v2.16b, v0.16b ; CHECK-NEXT: add x8, sp, #200 -; CHECK-NEXT: sdot v4.4s, v17.16b, v6.16b -; CHECK-NEXT: ld1 { v2.b }[15], [x9] +; CHECK-NEXT: sdot v3.4s, v17.16b, v16.16b +; CHECK-NEXT: ld1 { v1.b }[15], [x9] ; CHECK-NEXT: add x9, sp, #1352 ; CHECK-NEXT: add x10, sp, #968 -; CHECK-NEXT: ld1 { v3.b }[15], [x8] -; CHECK-NEXT: ld1 { v7.b }[15], [x9] -; CHECK-NEXT: ld1 { v16.b }[15], [x10] -; CHECK-NEXT: sdot v5.4s, v3.16b, v2.16b -; CHECK-NEXT: sdot v4.4s, v16.16b, v7.16b -; CHECK-NEXT: add v0.4s, v5.4s, v4.4s +; CHECK-NEXT: ld1 { v6.b }[15], [x8] +; CHECK-NEXT: ld1 { v5.b 
}[15], [x9] +; CHECK-NEXT: ld1 { v7.b }[15], [x10] +; CHECK-NEXT: sdot v4.4s, v6.16b, v1.16b +; CHECK-NEXT: sdot v3.4s, v7.16b, v5.16b +; CHECK-NEXT: add v0.4s, v4.4s, v3.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2662,194 +2662,194 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldr b5, [sp, #208] +; CHECK-NEXT: ldr b7, [sp, #208] ; CHECK-NEXT: add x8, sp, #216 ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldr b4, [sp, #976] ; CHECK-NEXT: add x9, sp, #984 -; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: ld1 { v5.b }[1], [x8] +; CHECK-NEXT: add x10, sp, #328 +; CHECK-NEXT: ld1 { v7.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #224 ; CHECK-NEXT: movi v1.16b, #1 ; CHECK-NEXT: mov v0.b[1], w1 ; CHECK-NEXT: ld1 { v4.b }[1], [x9] ; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: add x11, sp, #992 -; CHECK-NEXT: ldr b6, [sp, #720] -; CHECK-NEXT: ldr b7, [sp, #80] -; CHECK-NEXT: ld1 { v5.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #232 +; CHECK-NEXT: ldr b5, [sp, #720] +; CHECK-NEXT: ldr b6, [sp, #80] ; CHECK-NEXT: add x13, sp, #88 -; CHECK-NEXT: ld1 { v4.b }[2], [x11] -; CHECK-NEXT: ld1 { v7.b }[1], [x13] -; CHECK-NEXT: add x13, sp, #856 -; CHECK-NEXT: mov v0.b[2], w2 +; CHECK-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #232 ; CHECK-NEXT: add x14, sp, #1008 +; CHECK-NEXT: ld1 { v6.b }[1], [x13] +; CHECK-NEXT: add x13, sp, #856 ; CHECK-NEXT: add x15, sp, #872 -; CHECK-NEXT: ld1 { v5.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #240 +; CHECK-NEXT: mov v0.b[2], w2 ; CHECK-NEXT: add x16, sp, #888 -; CHECK-NEXT: add x10, sp, #16 -; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: add x11, sp, #40 +; CHECK-NEXT: add x12, sp, #16 +; CHECK-NEXT: ld1 { v7.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #240 +; CHECK-NEXT: add x11, sp, #24 +; CHECK-NEXT: add x9, sp, #32 ; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-NEXT: ld1 { v7.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #248 ; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #256 ; CHECK-NEXT: mov v0.b[4], w4 -; CHECK-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-NEXT: ld1 { v7.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #264 ; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-NEXT: ld1 { v7.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #272 -; CHECK-NEXT: ld1 { v5.b }[8], [x8] +; CHECK-NEXT: ld1 { v7.b }[8], [x8] ; CHECK-NEXT: add x8, sp, #280 ; CHECK-NEXT: mov v0.b[6], w6 -; CHECK-NEXT: ld1 { v5.b }[9], [x8] +; CHECK-NEXT: ld1 { v7.b }[9], [x8] ; CHECK-NEXT: add x8, sp, #288 ; CHECK-NEXT: mov v0.b[7], w7 -; CHECK-NEXT: ld1 { v5.b }[10], [x8] +; CHECK-NEXT: ld1 { v7.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #296 -; CHECK-NEXT: ld1 { v0.b }[8], [x10] -; CHECK-NEXT: add x10, sp, #128 -; CHECK-NEXT: ld1 { v5.b }[11], [x8] +; CHECK-NEXT: ld1 { v0.b }[8], [x12] +; CHECK-NEXT: add x12, sp, #128 +; CHECK-NEXT: ld1 { v7.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #304 -; CHECK-NEXT: ld1 { v0.b }[9], [x9] -; CHECK-NEXT: add x9, sp, #136 -; CHECK-NEXT: ld1 { v5.b }[12], [x8] +; CHECK-NEXT: ld1 { v0.b }[9], [x11] +; CHECK-NEXT: add x11, sp, #136 +; CHECK-NEXT: ld1 { v7.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-NEXT: ld1 { v0.b }[10], [x9] +; CHECK-NEXT: add x9, 
sp, #144 +; CHECK-NEXT: ld1 { v7.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #320 -; CHECK-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-NEXT: add x8, sp, #32 -; CHECK-NEXT: ld1 { v0.b }[10], [x8] -; CHECK-NEXT: add x8, sp, #144 -; CHECK-NEXT: ld1 { v5.b }[15], [x12] -; CHECK-NEXT: add x12, sp, #728 -; CHECK-NEXT: ld1 { v6.b }[1], [x12] -; CHECK-NEXT: add x12, sp, #1000 -; CHECK-NEXT: ld1 { v0.b }[11], [x11] -; CHECK-NEXT: ld1 { v4.b }[3], [x12] -; CHECK-NEXT: add x12, sp, #736 -; CHECK-NEXT: add x11, sp, #920 -; CHECK-NEXT: sdot v3.4s, v5.16b, v1.16b -; CHECK-NEXT: ldr b5, [sp, #848] -; CHECK-NEXT: ld1 { v6.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #48 -; CHECK-NEXT: ld1 { v5.b }[1], [x13] +; CHECK-NEXT: ld1 { v7.b }[14], [x8] +; CHECK-NEXT: add x8, sp, #992 +; CHECK-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: ld1 { v0.b }[11], [x8] +; CHECK-NEXT: add x8, sp, #152 +; CHECK-NEXT: ld1 { v7.b }[15], [x10] +; CHECK-NEXT: add x10, sp, #728 +; CHECK-NEXT: ld1 { v5.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #1000 +; CHECK-NEXT: ld1 { v4.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #736 +; CHECK-NEXT: sdot v3.4s, v7.16b, v1.16b +; CHECK-NEXT: ldr b7, [sp, #848] +; CHECK-NEXT: ld1 { v5.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #48 +; CHECK-NEXT: ld1 { v7.b }[1], [x13] ; CHECK-NEXT: add x13, sp, #744 ; CHECK-NEXT: ld1 { v4.b }[4], [x14] ; CHECK-NEXT: add x14, sp, #96 -; CHECK-NEXT: ld1 { v0.b }[12], [x12] -; CHECK-NEXT: ld1 { v6.b }[3], [x13] +; CHECK-NEXT: ld1 { v0.b }[12], [x10] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ld1 { v5.b }[3], [x13] ; CHECK-NEXT: add x13, sp, #864 -; CHECK-NEXT: ld1 { v7.b }[2], [x14] +; CHECK-NEXT: ld1 { v6.b }[2], [x14] ; CHECK-NEXT: add x14, sp, #1016 -; CHECK-NEXT: ld1 { v5.b }[2], [x13] +; CHECK-NEXT: ld1 { v7.b }[2], [x13] ; CHECK-NEXT: add x13, sp, #752 ; CHECK-NEXT: ld1 { v4.b }[5], [x14] ; CHECK-NEXT: add x14, sp, #104 -; CHECK-NEXT: ld1 { v6.b }[4], [x13] +; CHECK-NEXT: ld1 { v5.b }[4], [x13] ; CHECK-NEXT: add x13, sp, #1024 -; CHECK-NEXT: ld1 { v7.b }[3], [x14] -; CHECK-NEXT: ld1 { v5.b }[3], [x15] +; CHECK-NEXT: ld1 { v6.b }[3], [x14] +; CHECK-NEXT: ld1 { v7.b }[3], [x15] ; CHECK-NEXT: add x15, sp, #760 ; CHECK-NEXT: add x14, sp, #112 ; CHECK-NEXT: ld1 { v4.b }[6], [x13] ; CHECK-NEXT: add x13, sp, #880 -; CHECK-NEXT: ld1 { v6.b }[5], [x15] +; CHECK-NEXT: ld1 { v5.b }[5], [x15] ; CHECK-NEXT: add x15, sp, #1032 -; CHECK-NEXT: ld1 { v7.b }[4], [x14] -; CHECK-NEXT: ld1 { v5.b }[4], [x13] +; CHECK-NEXT: ld1 { v6.b }[4], [x14] +; CHECK-NEXT: ld1 { v7.b }[4], [x13] ; CHECK-NEXT: add x14, sp, #768 ; CHECK-NEXT: add x13, sp, #120 ; CHECK-NEXT: ld1 { v4.b }[7], [x15] ; CHECK-NEXT: add x15, sp, #1040 -; CHECK-NEXT: ld1 { v6.b }[6], [x14] -; CHECK-NEXT: ld1 { v7.b }[5], [x13] +; CHECK-NEXT: ld1 { v5.b }[6], [x14] +; CHECK-NEXT: ld1 { v6.b }[5], [x13] ; CHECK-NEXT: add x13, sp, #776 -; CHECK-NEXT: ld1 { v5.b }[5], [x16] +; CHECK-NEXT: ld1 { v7.b }[5], [x16] ; CHECK-NEXT: add x14, sp, #1048 ; CHECK-NEXT: ld1 { v4.b }[8], [x15] ; CHECK-NEXT: add x15, sp, #896 -; CHECK-NEXT: ld1 { v6.b }[7], [x13] -; CHECK-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #784 -; CHECK-NEXT: ld1 { v5.b }[6], [x15] +; CHECK-NEXT: ld1 { v5.b }[7], [x13] +; CHECK-NEXT: ld1 { v6.b }[6], [x12] +; CHECK-NEXT: add x12, sp, #784 +; CHECK-NEXT: ld1 { v7.b }[6], [x15] ; CHECK-NEXT: add x13, sp, #1056 ; CHECK-NEXT: ld1 { v4.b }[9], [x14] ; CHECK-NEXT: add x14, sp, #904 -; CHECK-NEXT: ld1 { v6.b }[8], [x10] -; CHECK-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #792 
-; CHECK-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-NEXT: add x10, sp, #1064 +; CHECK-NEXT: ld1 { v5.b }[8], [x12] +; CHECK-NEXT: ld1 { v6.b }[7], [x11] +; CHECK-NEXT: add x11, sp, #792 +; CHECK-NEXT: ld1 { v7.b }[7], [x14] +; CHECK-NEXT: add x12, sp, #1064 ; CHECK-NEXT: ld1 { v4.b }[10], [x13] ; CHECK-NEXT: add x13, sp, #912 -; CHECK-NEXT: ld1 { v6.b }[9], [x9] -; CHECK-NEXT: ld1 { v7.b }[8], [x8] +; CHECK-NEXT: ld1 { v5.b }[9], [x11] +; CHECK-NEXT: ld1 { v6.b }[8], [x9] ; CHECK-NEXT: add x9, sp, #800 -; CHECK-NEXT: ld1 { v5.b }[8], [x13] -; CHECK-NEXT: add x8, sp, #152 -; CHECK-NEXT: ld1 { v4.b }[11], [x10] -; CHECK-NEXT: add x10, sp, #1072 -; CHECK-NEXT: ld1 { v6.b }[10], [x9] -; CHECK-NEXT: ld1 { v7.b }[9], [x8] +; CHECK-NEXT: ld1 { v7.b }[8], [x13] +; CHECK-NEXT: add x11, sp, #1072 +; CHECK-NEXT: ld1 { v4.b }[11], [x12] +; CHECK-NEXT: add x12, sp, #920 +; CHECK-NEXT: ld1 { v5.b }[10], [x9] +; CHECK-NEXT: ld1 { v6.b }[9], [x8] ; CHECK-NEXT: add x9, sp, #808 -; CHECK-NEXT: ld1 { v5.b }[9], [x11] +; CHECK-NEXT: ld1 { v7.b }[9], [x12] ; CHECK-NEXT: add x8, sp, #56 -; CHECK-NEXT: ld1 { v4.b }[12], [x10] -; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ld1 { v4.b }[12], [x11] ; CHECK-NEXT: ld1 { v0.b }[13], [x8] -; CHECK-NEXT: ld1 { v6.b }[11], [x9] +; CHECK-NEXT: add x8, sp, #816 +; CHECK-NEXT: ld1 { v5.b }[11], [x9] ; CHECK-NEXT: add x9, sp, #928 -; CHECK-NEXT: ld1 { v7.b }[10], [x10] +; CHECK-NEXT: ld1 { v6.b }[10], [x10] ; CHECK-NEXT: add x10, sp, #1080 -; CHECK-NEXT: ld1 { v5.b }[10], [x9] -; CHECK-NEXT: add x8, sp, #816 -; CHECK-NEXT: ld1 { v4.b }[13], [x10] +; CHECK-NEXT: ld1 { v7.b }[10], [x9] ; CHECK-NEXT: add x9, sp, #168 +; CHECK-NEXT: ld1 { v4.b }[13], [x10] ; CHECK-NEXT: add x10, sp, #176 -; CHECK-NEXT: ld1 { v6.b }[12], [x8] +; CHECK-NEXT: ld1 { v5.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #936 -; CHECK-NEXT: ld1 { v7.b }[11], [x9] +; CHECK-NEXT: ld1 { v6.b }[11], [x9] ; CHECK-NEXT: add x9, sp, #1088 -; CHECK-NEXT: ld1 { v5.b }[11], [x8] +; CHECK-NEXT: ld1 { v7.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #64 ; CHECK-NEXT: ld1 { v4.b }[14], [x9] ; CHECK-NEXT: add x9, sp, #824 ; CHECK-NEXT: ld1 { v0.b }[14], [x8] -; CHECK-NEXT: ld1 { v6.b }[13], [x9] +; CHECK-NEXT: ld1 { v5.b }[13], [x9] ; CHECK-NEXT: add x9, sp, #944 -; CHECK-NEXT: ld1 { v7.b }[12], [x10] +; CHECK-NEXT: ld1 { v6.b }[12], [x10] ; CHECK-NEXT: add x10, sp, #1096 -; CHECK-NEXT: ld1 { v5.b }[12], [x9] +; CHECK-NEXT: ld1 { v7.b }[12], [x9] ; CHECK-NEXT: add x8, sp, #832 ; CHECK-NEXT: ld1 { v4.b }[15], [x10] ; CHECK-NEXT: add x9, sp, #184 ; CHECK-NEXT: add x10, sp, #72 -; CHECK-NEXT: ld1 { v6.b }[14], [x8] +; CHECK-NEXT: ld1 { v5.b }[14], [x8] ; CHECK-NEXT: add x8, sp, #952 -; CHECK-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-NEXT: ld1 { v6.b }[13], [x9] +; CHECK-NEXT: ld1 { v7.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #840 ; CHECK-NEXT: ld1 { v0.b }[15], [x10] ; CHECK-NEXT: sdot v2.4s, v4.16b, v1.16b ; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: ld1 { v6.b }[15], [x8] +; CHECK-NEXT: ld1 { v5.b }[15], [x8] ; CHECK-NEXT: add x8, sp, #960 -; CHECK-NEXT: ld1 { v7.b }[14], [x9] -; CHECK-NEXT: ld1 { v5.b }[14], [x8] +; CHECK-NEXT: ld1 { v6.b }[14], [x9] +; CHECK-NEXT: ld1 { v7.b }[14], [x8] ; CHECK-NEXT: sdot v3.4s, v0.16b, v1.16b ; CHECK-NEXT: add x8, sp, #200 ; CHECK-NEXT: add x9, sp, #968 -; CHECK-NEXT: sdot v2.4s, v6.16b, v1.16b -; CHECK-NEXT: ld1 { v7.b }[15], [x8] -; CHECK-NEXT: ld1 { v5.b }[15], [x9] -; CHECK-NEXT: sdot v3.4s, v7.16b, v1.16b ; CHECK-NEXT: sdot v2.4s, v5.16b, v1.16b +; CHECK-NEXT: 
ld1 { v6.b }[15], [x8] +; CHECK-NEXT: ld1 { v7.b }[15], [x9] +; CHECK-NEXT: sdot v3.4s, v6.16b, v1.16b +; CHECK-NEXT: sdot v2.4s, v7.16b, v1.16b ; CHECK-NEXT: add v0.4s, v3.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 913205f3275367..44887dee91a489 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -459,56 +459,56 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-NEXT: mov v0.b[1], w1 ; CHECK-NEXT: ld1 { v3.b }[1], [x11] ; CHECK-NEXT: ld1 { v1.b }[1], [x9] -; CHECK-NEXT: add x11, sp, #16 +; CHECK-NEXT: add x12, sp, #16 ; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: add x13, sp, #184 +; CHECK-NEXT: add x11, sp, #120 ; CHECK-NEXT: ld1 { v2.b }[2], [x10] -; CHECK-NEXT: add x12, sp, #120 ; CHECK-NEXT: add x14, sp, #32 -; CHECK-NEXT: ld1 { v3.b }[2], [x11] -; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: ldr b5, [sp, #64] +; CHECK-NEXT: ld1 { v3.b }[2], [x12] +; CHECK-NEXT: add x12, sp, #184 +; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: mov v0.b[2], w2 ; CHECK-NEXT: ldr b4, [sp, #224] -; CHECK-NEXT: add x11, sp, #128 -; CHECK-NEXT: ld1 { v2.b }[3], [x13] -; CHECK-NEXT: add x13, sp, #24 +; CHECK-NEXT: add x9, sp, #128 +; CHECK-NEXT: ld1 { v2.b }[3], [x12] +; CHECK-NEXT: add x12, sp, #24 ; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v3.b }[3], [x13] -; CHECK-NEXT: ld1 { v1.b }[3], [x12] -; CHECK-NEXT: add x12, sp, #192 -; CHECK-NEXT: add x13, sp, #200 +; CHECK-NEXT: ld1 { v3.b }[3], [x12] +; CHECK-NEXT: ld1 { v1.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #192 +; CHECK-NEXT: add x12, sp, #200 ; CHECK-NEXT: add x15, sp, #80 -; CHECK-NEXT: add x9, sp, #144 +; CHECK-NEXT: add x13, sp, #144 ; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v2.b }[4], [x12] -; CHECK-NEXT: add x12, sp, #232 +; CHECK-NEXT: ld1 { v2.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #232 ; CHECK-NEXT: ld1 { v3.b }[4], [x14] ; CHECK-NEXT: add x14, sp, #72 -; CHECK-NEXT: ld1 { v4.b }[1], [x12] +; CHECK-NEXT: ld1 { v4.b }[1], [x11] ; CHECK-NEXT: ld1 { v5.b }[1], [x14] ; CHECK-NEXT: add x14, sp, #40 -; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: ld1 { v2.b }[5], [x13] -; CHECK-NEXT: add x12, sp, #208 -; CHECK-NEXT: add x13, sp, #48 +; CHECK-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-NEXT: ld1 { v2.b }[5], [x12] +; CHECK-NEXT: add x11, sp, #208 +; CHECK-NEXT: add x12, sp, #48 ; CHECK-NEXT: mov v0.b[4], w4 ; CHECK-NEXT: ld1 { v3.b }[5], [x14] ; CHECK-NEXT: add x14, sp, #240 ; CHECK-NEXT: ld1 { v4.b }[2], [x14] ; CHECK-NEXT: ld1 { v5.b }[2], [x15] ; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: ld1 { v2.b }[6], [x12] -; CHECK-NEXT: add x11, sp, #216 +; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: add x9, sp, #216 ; CHECK-NEXT: add x10, sp, #56 -; CHECK-NEXT: ld1 { v3.b }[6], [x13] -; CHECK-NEXT: add x12, sp, #248 -; CHECK-NEXT: add x13, sp, #88 +; CHECK-NEXT: ld1 { v3.b }[6], [x12] +; CHECK-NEXT: add x11, sp, #248 +; CHECK-NEXT: add x12, sp, #88 ; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: ld1 { v4.b }[3], [x12] -; CHECK-NEXT: ld1 { v5.b }[3], [x13] -; CHECK-NEXT: ld1 { v1.b }[6], [x9] -; CHECK-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-NEXT: ld1 { v4.b }[3], [x11] +; CHECK-NEXT: ld1 { v5.b }[3], [x12] +; CHECK-NEXT: ld1 { v1.b }[6], [x13] +; CHECK-NEXT: ld1 { v2.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #152 ; CHECK-NEXT: ld1 { v3.b }[7], [x10] ; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b @@ -545,48 +545,48 @@ define <16 x i32> @i12(<16 x 
i12> %s0, <16 x i12> %s1) { ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w23, -48 -; CHECK-NEXT: ldr w13, [sp, #112] -; CHECK-NEXT: ldr w14, [sp, #144] +; CHECK-NEXT: ldr w10, [sp, #112] +; CHECK-NEXT: ldr w13, [sp, #144] ; CHECK-NEXT: fmov s2, w4 -; CHECK-NEXT: ldr w17, [sp, #176] -; CHECK-NEXT: ldr w19, [sp, #208] +; CHECK-NEXT: ldr w15, [sp, #176] +; CHECK-NEXT: ldr w17, [sp, #208] ; CHECK-NEXT: fmov s3, w0 -; CHECK-NEXT: ldr w20, [sp, #80] -; CHECK-NEXT: ldr w21, [sp, #48] -; CHECK-NEXT: fmov s5, w13 -; CHECK-NEXT: fmov s4, w19 -; CHECK-NEXT: fmov s6, w17 -; CHECK-NEXT: fmov s7, w14 -; CHECK-NEXT: fmov s0, w20 -; CHECK-NEXT: fmov s1, w21 -; CHECK-NEXT: ldr w10, [sp, #120] -; CHECK-NEXT: ldr w11, [sp, #152] -; CHECK-NEXT: ldr w12, [sp, #184] -; CHECK-NEXT: ldr w15, [sp, #216] -; CHECK-NEXT: ldr w22, [sp, #88] -; CHECK-NEXT: ldr w23, [sp, #56] +; CHECK-NEXT: ldr w18, [sp, #80] +; CHECK-NEXT: ldr w20, [sp, #48] +; CHECK-NEXT: fmov s5, w10 +; CHECK-NEXT: fmov s4, w17 +; CHECK-NEXT: fmov s6, w15 +; CHECK-NEXT: fmov s7, w13 +; CHECK-NEXT: fmov s0, w18 +; CHECK-NEXT: fmov s1, w20 +; CHECK-NEXT: ldr w9, [sp, #120] +; CHECK-NEXT: ldr w12, [sp, #152] +; CHECK-NEXT: ldr w14, [sp, #184] +; CHECK-NEXT: ldr w16, [sp, #216] +; CHECK-NEXT: ldr w21, [sp, #88] +; CHECK-NEXT: ldr w22, [sp, #56] ; CHECK-NEXT: mov v2.h[1], w5 ; CHECK-NEXT: mov v3.h[1], w1 -; CHECK-NEXT: mov v5.h[1], w10 -; CHECK-NEXT: mov v4.h[1], w15 -; CHECK-NEXT: mov v0.h[1], w22 -; CHECK-NEXT: mov v1.h[1], w23 -; CHECK-NEXT: mov v6.h[1], w12 -; CHECK-NEXT: mov v7.h[1], w11 +; CHECK-NEXT: mov v5.h[1], w9 +; CHECK-NEXT: mov v4.h[1], w16 +; CHECK-NEXT: mov v0.h[1], w21 +; CHECK-NEXT: mov v1.h[1], w22 +; CHECK-NEXT: mov v6.h[1], w14 +; CHECK-NEXT: mov v7.h[1], w12 ; CHECK-NEXT: ldr w8, [sp, #128] -; CHECK-NEXT: ldr w9, [sp, #160] -; CHECK-NEXT: ldr w16, [sp, #64] -; CHECK-NEXT: ldr w18, [sp, #96] -; CHECK-NEXT: ldr w10, [sp, #192] -; CHECK-NEXT: ldr w11, [sp, #224] +; CHECK-NEXT: ldr w11, [sp, #160] +; CHECK-NEXT: ldr w19, [sp, #64] +; CHECK-NEXT: ldr w23, [sp, #96] +; CHECK-NEXT: ldr w9, [sp, #192] +; CHECK-NEXT: ldr w10, [sp, #224] ; CHECK-NEXT: mov v2.h[2], w6 ; CHECK-NEXT: mov v3.h[2], w2 -; CHECK-NEXT: mov v0.h[2], w18 -; CHECK-NEXT: mov v1.h[2], w16 +; CHECK-NEXT: mov v0.h[2], w23 +; CHECK-NEXT: mov v1.h[2], w19 ; CHECK-NEXT: mov v5.h[2], w8 -; CHECK-NEXT: mov v4.h[2], w11 -; CHECK-NEXT: mov v6.h[2], w10 -; CHECK-NEXT: mov v7.h[2], w9 +; CHECK-NEXT: mov v4.h[2], w10 +; CHECK-NEXT: mov v6.h[2], w9 +; CHECK-NEXT: mov v7.h[2], w11 ; CHECK-NEXT: ldr w12, [sp, #72] ; CHECK-NEXT: ldr w13, [sp, #104] ; CHECK-NEXT: ldr w8, [sp, #136] diff --git a/llvm/test/CodeGen/AArch64/pow.ll b/llvm/test/CodeGen/AArch64/pow.ll index 5141b21c7976a9..572ea086e16751 100644 --- a/llvm/test/CodeGen/AArch64/pow.ll +++ b/llvm/test/CodeGen/AArch64/pow.ll @@ -110,17 +110,17 @@ define <2 x double> @pow_v2f64_one_fourth_not_enough_fmf(<2 x double> %x) nounwi ; CHECK-LABEL: pow_v2f64_one_fourth_not_enough_fmf: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: fmov d1, #0.25000000 ; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: bl pow ; CHECK-NEXT: fmov d1, #0.25000000 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldr 
q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl pow -; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], v1.d[0] diff --git a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll index 5b501762418ef5..c5b041d4b63213 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll @@ -35,18 +35,18 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: adrp x14, __DefaultRuneLocale@GOTPAGE ; CHECK-NEXT: ldrb w12, [x0, #4] ; CHECK-NEXT: ldrb w13, [x1, #4] -; CHECK-NEXT: ldr x9, [x0, #16] -; CHECK-NEXT: ldr x10, [x1, #16] +; CHECK-NEXT: ldr x10, [x0, #16] +; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: mov x11, xzr ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: ldr x14, [x14, __DefaultRuneLocale@GOTPAGEOFF] -; CHECK-NEXT: ldrsb x8, [x9, x11] +; CHECK-NEXT: ldrsb x8, [x10, x11] ; CHECK-NEXT: tbz x8, #63, LBB0_3 ; CHECK-NEXT: LBB0_2: ; %cond.false.i.i -; CHECK-NEXT: stp x9, x0, [sp, #32] ; 16-byte Folded Spill +; CHECK-NEXT: stp x10, x0, [sp, #32] ; 16-byte Folded Spill ; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: mov w1, #32768 ; =0x8000 -; CHECK-NEXT: str x10, [sp, #8] ; 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill ; CHECK-NEXT: str x11, [sp, #24] ; 8-byte Folded Spill ; CHECK-NEXT: str w12, [sp, #4] ; 4-byte Folded Spill ; CHECK-NEXT: str w13, [sp, #20] ; 4-byte Folded Spill @@ -56,10 +56,10 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: Lloh3: ; CHECK-NEXT: ldr x14, [x14, __DefaultRuneLocale@GOTPAGEOFF] -; CHECK-NEXT: ldp x11, x9, [sp, #24] ; 16-byte Folded Reload +; CHECK-NEXT: ldp x11, x10, [sp, #24] ; 16-byte Folded Reload ; CHECK-NEXT: ldr w13, [sp, #20] ; 4-byte Folded Reload ; CHECK-NEXT: ldr w12, [sp, #4] ; 4-byte Folded Reload -; CHECK-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload +; CHECK-NEXT: ldr x9, [sp, #8] ; 8-byte Folded Reload ; CHECK-NEXT: ldr x0, [sp, #40] ; 8-byte Folded Reload ; CHECK-NEXT: cbz w8, LBB0_4 ; CHECK-NEXT: b LBB0_6 @@ -69,7 +69,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: and w8, w8, #0x8000 ; CHECK-NEXT: cbnz w8, LBB0_6 ; CHECK-NEXT: LBB0_4: ; %lor.rhs -; CHECK-NEXT: ldrsb x8, [x10, x11] +; CHECK-NEXT: ldrsb x8, [x9, x11] ; CHECK-NEXT: tbnz x8, #63, LBB0_8 ; CHECK-NEXT: ; %bb.5: ; %cond.true.i.i217 ; CHECK-NEXT: add x8, x14, x8, lsl #2 @@ -77,20 +77,20 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: and w8, w8, #0x8000 ; CHECK-NEXT: cbz w8, LBB0_9 ; CHECK-NEXT: LBB0_6: ; %while.body -; CHECK-NEXT: ldrb w8, [x9, x11] -; CHECK-NEXT: ldrb w15, [x10, x11] +; CHECK-NEXT: ldrb w8, [x10, x11] +; CHECK-NEXT: ldrb w15, [x9, x11] ; CHECK-NEXT: cmp w8, w15 ; CHECK-NEXT: b.ne LBB0_42 ; CHECK-NEXT: ; %bb.7: ; %if.end17 ; CHECK-NEXT: add x11, x11, #1 -; CHECK-NEXT: ldrsb x8, [x9, x11] +; CHECK-NEXT: ldrsb x8, [x10, x11] ; CHECK-NEXT: tbz x8, #63, LBB0_3 ; CHECK-NEXT: b LBB0_2 ; CHECK-NEXT: LBB0_8: ; %cond.false.i.i219 -; CHECK-NEXT: stp x9, x0, [sp, #32] ; 16-byte Folded Spill +; CHECK-NEXT: stp x10, x0, [sp, #32] ; 16-byte Folded Spill ; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: mov w1, #32768 ; =0x8000 -; CHECK-NEXT: str x10, 
[sp, #8] ; 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill ; CHECK-NEXT: str x11, [sp, #24] ; 8-byte Folded Spill ; CHECK-NEXT: str w12, [sp, #4] ; 4-byte Folded Spill ; CHECK-NEXT: str w13, [sp, #20] ; 4-byte Folded Spill @@ -100,10 +100,10 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr x14, [x14, __DefaultRuneLocale@GOTPAGEOFF] -; CHECK-NEXT: ldp x11, x9, [sp, #24] ; 16-byte Folded Reload +; CHECK-NEXT: ldp x11, x10, [sp, #24] ; 16-byte Folded Reload ; CHECK-NEXT: ldr w13, [sp, #20] ; 4-byte Folded Reload ; CHECK-NEXT: ldr w12, [sp, #4] ; 4-byte Folded Reload -; CHECK-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload +; CHECK-NEXT: ldr x9, [sp, #8] ; 8-byte Folded Reload ; CHECK-NEXT: ldr x0, [sp, #40] ; 8-byte Folded Reload ; CHECK-NEXT: cbnz w8, LBB0_6 ; CHECK-NEXT: LBB0_9: ; %while.end @@ -111,16 +111,16 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: cbnz w8, LBB0_24 ; CHECK-NEXT: ; %bb.10: ; %if.then23 ; CHECK-NEXT: ldr x12, [x0, #16] -; CHECK-NEXT: ldrb w8, [x9, x11] +; CHECK-NEXT: ldrb w8, [x10, x11] ; CHECK-NEXT: ldrb w13, [x12] ; CHECK-NEXT: cmp w13, #83 ; CHECK-NEXT: b.eq LBB0_19 ; CHECK-NEXT: LBB0_11: ; %while.cond59.preheader ; CHECK-NEXT: cbz w8, LBB0_23 ; CHECK-NEXT: LBB0_12: ; %land.rhs.preheader -; CHECK-NEXT: add x12, x9, x11 -; CHECK-NEXT: add x9, x10, x11 -; CHECK-NEXT: add x10, x12, #1 +; CHECK-NEXT: add x10, x10, x11 +; CHECK-NEXT: add x9, x9, x11 +; CHECK-NEXT: add x10, x10, #1 ; CHECK-NEXT: LBB0_13: ; %land.rhs ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb w11, [x9], #1 @@ -154,11 +154,11 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: cmp w8, #112 ; CHECK-NEXT: b.ne LBB0_12 ; CHECK-NEXT: ; %bb.21: ; %land.lhs.true35 -; CHECK-NEXT: ldrb w13, [x10, x11] +; CHECK-NEXT: ldrb w13, [x9, x11] ; CHECK-NEXT: cmp w13, #112 ; CHECK-NEXT: b.ne LBB0_12 ; CHECK-NEXT: ; %bb.22: ; %land.lhs.true43 -; CHECK-NEXT: sub x12, x9, x12 +; CHECK-NEXT: sub x12, x10, x12 ; CHECK-NEXT: add x12, x12, x11 ; CHECK-NEXT: cmp x12, #1 ; CHECK-NEXT: b.ne LBB0_44 @@ -172,7 +172,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: cmp w13, #2 ; CHECK-NEXT: b.ne LBB0_33 ; CHECK-NEXT: ; %bb.26: ; %while.cond95.preheader -; CHECK-NEXT: ldrb w12, [x9, x11] +; CHECK-NEXT: ldrb w12, [x10, x11] ; CHECK-NEXT: cbz w12, LBB0_23 ; CHECK-NEXT: ; %bb.27: ; %land.rhs99.preheader ; CHECK-NEXT: mov x8, xzr @@ -180,14 +180,14 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: b LBB0_29 ; CHECK-NEXT: LBB0_28: ; %if.then117 ; CHECK-NEXT: ; in Loop: Header=BB0_29 Depth=1 -; CHECK-NEXT: add x12, x9, x8 +; CHECK-NEXT: add x12, x10, x8 ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: add x12, x12, x11 ; CHECK-NEXT: ldrb w12, [x12, #1] ; CHECK-NEXT: cbz w12, LBB0_43 ; CHECK-NEXT: LBB0_29: ; %land.rhs99 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x13, x10, x8 +; CHECK-NEXT: add x13, x9, x8 ; CHECK-NEXT: ldrb w13, [x13, x11] ; CHECK-NEXT: cbz w13, LBB0_23 ; CHECK-NEXT: ; %bb.30: ; %while.body104 @@ -211,27 +211,27 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: cmp w12, #2 ; CHECK-NEXT: b.ne LBB0_43 ; CHECK-NEXT: ; %bb.35: ; %while.cond130.preheader -; CHECK-NEXT: ldrb w8, [x9, x11] -; CHECK-NEXT: cbz w8, LBB0_23 +; 
CHECK-NEXT: ldrb w12, [x10, x11] +; CHECK-NEXT: cbz w12, LBB0_23 ; CHECK-NEXT: ; %bb.36: ; %land.rhs134.preheader -; CHECK-NEXT: mov x12, xzr +; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov w0, #1 ; =0x1 ; CHECK-NEXT: b LBB0_38 ; CHECK-NEXT: LBB0_37: ; %if.then152 ; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1 -; CHECK-NEXT: add x8, x9, x12 -; CHECK-NEXT: add x12, x12, #1 -; CHECK-NEXT: add x8, x8, x11 -; CHECK-NEXT: ldrb w8, [x8, #1] -; CHECK-NEXT: cbz w8, LBB0_43 +; CHECK-NEXT: add x12, x10, x8 +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: add x12, x12, x11 +; CHECK-NEXT: ldrb w12, [x12, #1] +; CHECK-NEXT: cbz w12, LBB0_43 ; CHECK-NEXT: LBB0_38: ; %land.rhs134 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x13, x10, x12 +; CHECK-NEXT: add x13, x9, x8 ; CHECK-NEXT: ldrb w13, [x13, x11] ; CHECK-NEXT: cbz w13, LBB0_23 ; CHECK-NEXT: ; %bb.39: ; %while.body139 ; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1 -; CHECK-NEXT: cmp w8, w13 +; CHECK-NEXT: cmp w12, w13 ; CHECK-NEXT: b.eq LBB0_37 ; CHECK-NEXT: ; %bb.40: ; %while.body139 ; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1 @@ -239,7 +239,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: b.eq LBB0_37 ; CHECK-NEXT: ; %bb.41: ; %while.body139 ; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1 -; CHECK-NEXT: cmp w8, #94 +; CHECK-NEXT: cmp w12, #94 ; CHECK-NEXT: b.eq LBB0_37 ; CHECK-NEXT: LBB0_42: ; CHECK-NEXT: mov w0, wzr @@ -251,7 +251,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: cmp x12, #2 ; CHECK-NEXT: b.ne LBB0_11 ; CHECK-NEXT: ; %bb.45: ; %land.lhs.true52 -; CHECK-NEXT: add x12, x9, x11 +; CHECK-NEXT: add x12, x10, x11 ; CHECK-NEXT: mov w0, #1 ; =0x1 ; CHECK-NEXT: ldurb w12, [x12, #-1] ; CHECK-NEXT: cmp w12, #73 diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index 419f25c22eb724..4281a52e4cd0a2 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -8,14 +8,14 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-LABEL: run_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #192 -; CHECK-NEXT: .cfi_def_cfa_offset 192 -; CHECK-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #160] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #176 +; CHECK-NEXT: .cfi_def_cfa_offset 176 +; CHECK-NEXT: stp d15, d14, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #128] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w21, -24 @@ -29,16 +29,13 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: // implicit-def: $q6 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: 
adrp x10, B+48 ; CHECK-NEXT: add x10, x10, :lo12:B+48 ; CHECK-NEXT: adrp x11, A ; CHECK-NEXT: add x11, x11, :lo12:A -; CHECK-NEXT: // kill: killed $q6 ; CHECK-NEXT: // implicit-def: $q6 -; CHECK-NEXT: // implicit-def: $q0 ; CHECK-NEXT: // implicit-def: $q2 ; CHECK-NEXT: // implicit-def: $q3 ; CHECK-NEXT: // implicit-def: $q4 @@ -46,7 +43,7 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: // implicit-def: $q7 ; CHECK-NEXT: // implicit-def: $q16 ; CHECK-NEXT: // implicit-def: $q17 -; CHECK-NEXT: // implicit-def: $q10 +; CHECK-NEXT: // implicit-def: $q18 ; CHECK-NEXT: // implicit-def: $q19 ; CHECK-NEXT: // implicit-def: $q20 ; CHECK-NEXT: // implicit-def: $q21 @@ -54,134 +51,135 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: // implicit-def: $q23 ; CHECK-NEXT: // implicit-def: $q24 ; CHECK-NEXT: // implicit-def: $q25 -; CHECK-NEXT: // implicit-def: $q27 ; CHECK-NEXT: // implicit-def: $q26 +; CHECK-NEXT: // implicit-def: $q27 ; CHECK-NEXT: // implicit-def: $q28 ; CHECK-NEXT: // implicit-def: $q30 -; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: // implicit-def: $q15 ; CHECK-NEXT: // implicit-def: $q29 ; CHECK-NEXT: // implicit-def: $q31 +; CHECK-NEXT: // implicit-def: $q8 +; CHECK-NEXT: // implicit-def: $q9 +; CHECK-NEXT: // implicit-def: $q10 +; CHECK-NEXT: // implicit-def: $q11 ; CHECK-NEXT: // implicit-def: $q12 ; CHECK-NEXT: // implicit-def: $q13 -; CHECK-NEXT: // implicit-def: $q11 -; CHECK-NEXT: // kill: killed $q6 -; CHECK-NEXT: // implicit-def: $q6 -; CHECK-NEXT: // kill: killed $q6 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q14, [x8] ; CHECK-NEXT: mov x12, xzr -; CHECK-NEXT: str q18, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: ldr x14, [x12] +; CHECK-NEXT: ldr q14, [x8] +; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill ; CHECK-NEXT: ldr q15, [x12] +; CHECK-NEXT: ldr x13, [x12] ; CHECK-NEXT: add x7, x11, x8 -; CHECK-NEXT: fmov x15, d14 -; CHECK-NEXT: mov x16, v14.d[1] -; CHECK-NEXT: ldr q18, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov x18, d15 -; CHECK-NEXT: mov x13, v15.d[1] +; CHECK-NEXT: fmov x12, d14 +; CHECK-NEXT: mov x14, v14.d[1] ; CHECK-NEXT: ldr x5, [x8] +; CHECK-NEXT: fmov x17, d15 ; CHECK-NEXT: ldr q14, [x10], #64 +; CHECK-NEXT: mov x16, v15.d[1] ; CHECK-NEXT: ldr x7, [x7, #128] -; CHECK-NEXT: mul x17, x15, x14 -; CHECK-NEXT: mov v6.16b, v0.16b -; CHECK-NEXT: mov v9.16b, v27.16b -; CHECK-NEXT: mov x12, v14.d[1] +; CHECK-NEXT: stp q30, q28, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: mul x15, x12, x13 +; CHECK-NEXT: mov x0, v14.d[1] ; CHECK-NEXT: fmov x4, d14 -; CHECK-NEXT: mov v27.16b, v23.16b -; CHECK-NEXT: mul x1, x16, x14 -; CHECK-NEXT: mov v23.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v7.16b +; CHECK-NEXT: mov v30.16b, v27.16b +; CHECK-NEXT: mov v27.16b, v24.16b +; CHECK-NEXT: mov v24.16b, v21.16b +; CHECK-NEXT: mul x18, x17, x13 +; CHECK-NEXT: mov v21.16b, v18.16b +; CHECK-NEXT: mov v18.16b, v7.16b ; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: stp q26, q31, [sp] // 32-byte Folded Spill -; CHECK-NEXT: mov v31.16b, v22.16b -; CHECK-NEXT: mul x0, x18, x14 -; CHECK-NEXT: mov v26.16b, v10.16b -; CHECK-NEXT: mov v22.16b, v5.16b -; CHECK-NEXT: fmov d15, x17 -; CHECK-NEXT: mov v5.16b, v1.16b -; CHECK-NEXT: mov v8.16b, v20.16b -; CHECK-NEXT: mul x2, x13, x14 -; CHECK-NEXT: mov v20.16b, v16.16b +; CHECK-NEXT: mov v28.16b, v25.16b +; CHECK-NEXT: mov v25.16b, v22.16b +; CHECK-NEXT: mul x19, x12, x5 +; CHECK-NEXT: 
mov v22.16b, v19.16b +; CHECK-NEXT: mov v19.16b, v16.16b +; CHECK-NEXT: fmov d15, x15 ; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: mov v10.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v17.16b +; CHECK-NEXT: mov v29.16b, v26.16b +; CHECK-NEXT: mul x12, x12, x7 +; CHECK-NEXT: mov v26.16b, v23.16b +; CHECK-NEXT: mov v23.16b, v20.16b +; CHECK-NEXT: fmov d14, x18 +; CHECK-NEXT: mov v20.16b, v17.16b ; CHECK-NEXT: mov v17.16b, v4.16b -; CHECK-NEXT: mov v15.d[1], x1 -; CHECK-NEXT: mul x3, x12, x14 +; CHECK-NEXT: mul x1, x14, x13 +; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov v5.16b, v1.16b +; CHECK-NEXT: fmov d1, x19 ; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: fmov d14, x0 -; CHECK-NEXT: cmp x8, #64 ; CHECK-NEXT: add x9, x9, #1 -; CHECK-NEXT: mul x14, x4, x14 -; CHECK-NEXT: add v18.2d, v18.2d, v15.2d -; CHECK-NEXT: mul x19, x15, x5 +; CHECK-NEXT: mul x2, x16, x13 +; CHECK-NEXT: cmp x8, #64 +; CHECK-NEXT: fmov d2, x12 +; CHECK-NEXT: mul x3, x0, x13 +; CHECK-NEXT: mov v15.d[1], x1 +; CHECK-NEXT: mul x13, x4, x13 ; CHECK-NEXT: mov v14.d[1], x2 -; CHECK-NEXT: mul x15, x15, x7 -; CHECK-NEXT: fmov d0, x14 -; CHECK-NEXT: str q18, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: ldp q18, q15, [sp, #32] // 32-byte Folded Reload -; CHECK-NEXT: mul x6, x16, x5 -; CHECK-NEXT: fmov d1, x19 +; CHECK-NEXT: mul x21, x17, x7 +; CHECK-NEXT: add v12.2d, v12.2d, v15.2d +; CHECK-NEXT: ldr q15, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mul x6, x14, x5 +; CHECK-NEXT: fmov d0, x13 +; CHECK-NEXT: add v13.2d, v13.2d, v14.2d +; CHECK-NEXT: add v11.2d, v11.2d, v14.2d +; CHECK-NEXT: mul x14, x14, x7 +; CHECK-NEXT: fmov d3, x21 +; CHECK-NEXT: mul x18, x4, x7 ; CHECK-NEXT: mov v0.d[1], x3 -; CHECK-NEXT: mul x16, x16, x7 -; CHECK-NEXT: fmov d2, x15 -; CHECK-NEXT: add v15.2d, v15.2d, v14.2d -; CHECK-NEXT: mul x21, x18, x7 ; CHECK-NEXT: mov v1.d[1], x6 -; CHECK-NEXT: mul x0, x4, x7 -; CHECK-NEXT: str q15, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: add v15.2d, v11.2d, v14.2d -; CHECK-NEXT: mov v2.d[1], x16 -; CHECK-NEXT: ldr q11, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: mul x20, x13, x7 -; CHECK-NEXT: fmov d3, x21 -; CHECK-NEXT: add v11.2d, v11.2d, v0.2d -; CHECK-NEXT: add v12.2d, v12.2d, v1.2d -; CHECK-NEXT: mul x22, x12, x7 -; CHECK-NEXT: fmov d4, x0 -; CHECK-NEXT: add v18.2d, v18.2d, v2.2d -; CHECK-NEXT: mov v2.16b, v7.16b -; CHECK-NEXT: mul x14, x18, x5 -; CHECK-NEXT: mov v7.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v23.16b +; CHECK-NEXT: mul x20, x16, x7 +; CHECK-NEXT: mov v2.d[1], x14 +; CHECK-NEXT: mul x22, x0, x7 +; CHECK-NEXT: add v10.2d, v10.2d, v0.2d +; CHECK-NEXT: fmov d4, x18 +; CHECK-NEXT: add v8.2d, v8.2d, v1.2d +; CHECK-NEXT: mul x13, x17, x5 ; CHECK-NEXT: mov v3.d[1], x20 -; CHECK-NEXT: mov v23.16b, v27.16b -; CHECK-NEXT: mov v27.16b, v9.16b -; CHECK-NEXT: mul x15, x4, x5 -; CHECK-NEXT: add v27.2d, v9.2d, v1.2d -; CHECK-NEXT: str q11, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: add v15.2d, v15.2d, v2.2d +; CHECK-NEXT: mov v2.16b, v7.16b +; CHECK-NEXT: mul x14, x4, x5 +; CHECK-NEXT: mov v7.16b, v18.16b +; CHECK-NEXT: mov v18.16b, v21.16b ; CHECK-NEXT: mov v4.d[1], x22 -; CHECK-NEXT: add v19.2d, v19.2d, v1.2d +; CHECK-NEXT: mov v21.16b, v24.16b +; CHECK-NEXT: mov v24.16b, v27.16b +; CHECK-NEXT: mul x12, x16, x5 +; CHECK-NEXT: mov v27.16b, v30.16b +; CHECK-NEXT: ldr q30, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: fmov d14, x13 ; CHECK-NEXT: add v7.2d, v7.2d, v1.2d -; CHECK-NEXT: mul x13, x13, x5 -; CHECK-NEXT: add v23.2d, v23.2d, v1.2d -; CHECK-NEXT: add 
v1.2d, v5.2d, v1.2d -; CHECK-NEXT: fmov d14, x14 +; CHECK-NEXT: mul x13, x0, x5 ; CHECK-NEXT: add v30.2d, v30.2d, v3.2d ; CHECK-NEXT: mov v3.16b, v16.16b -; CHECK-NEXT: mul x12, x12, x5 -; CHECK-NEXT: mov v16.16b, v20.16b -; CHECK-NEXT: mov v5.16b, v22.16b -; CHECK-NEXT: fmov d0, x15 +; CHECK-NEXT: mov v16.16b, v19.16b +; CHECK-NEXT: mov v19.16b, v22.16b +; CHECK-NEXT: mov v22.16b, v25.16b +; CHECK-NEXT: mov v25.16b, v28.16b +; CHECK-NEXT: ldr q28, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: mov v14.d[1], x12 ; CHECK-NEXT: add v28.2d, v28.2d, v4.2d ; CHECK-NEXT: mov v4.16b, v17.16b -; CHECK-NEXT: mov v17.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v10.16b -; CHECK-NEXT: mov v10.16b, v26.16b -; CHECK-NEXT: mov v14.d[1], x13 -; CHECK-NEXT: mov v22.16b, v31.16b -; CHECK-NEXT: mov v20.16b, v8.16b -; CHECK-NEXT: ldp q26, q31, [sp] // 32-byte Folded Reload -; CHECK-NEXT: mov v11.16b, v15.16b -; CHECK-NEXT: mov v0.d[1], x12 -; CHECK-NEXT: add v13.2d, v13.2d, v14.2d +; CHECK-NEXT: mov v17.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v23.16b +; CHECK-NEXT: mov v23.16b, v26.16b +; CHECK-NEXT: mov v26.16b, v29.16b +; CHECK-NEXT: mov v0.d[1], x13 +; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add v19.2d, v19.2d, v1.2d +; CHECK-NEXT: add v9.2d, v9.2d, v14.2d ; CHECK-NEXT: add v31.2d, v31.2d, v14.2d -; CHECK-NEXT: add v26.2d, v26.2d, v14.2d +; CHECK-NEXT: add v27.2d, v27.2d, v14.2d +; CHECK-NEXT: add v26.2d, v26.2d, v1.2d +; CHECK-NEXT: add v23.2d, v23.2d, v1.2d +; CHECK-NEXT: add v1.2d, v5.2d, v1.2d +; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add v24.2d, v24.2d, v14.2d ; CHECK-NEXT: add v22.2d, v22.2d, v14.2d -; CHECK-NEXT: add v20.2d, v8.2d, v14.2d -; CHECK-NEXT: add v10.2d, v10.2d, v14.2d +; CHECK-NEXT: add v20.2d, v20.2d, v14.2d +; CHECK-NEXT: add v18.2d, v18.2d, v14.2d ; CHECK-NEXT: add v16.2d, v16.2d, v14.2d ; CHECK-NEXT: add v5.2d, v5.2d, v14.2d ; CHECK-NEXT: add v3.2d, v3.2d, v14.2d @@ -191,38 +189,34 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: add v21.2d, v21.2d, v0.2d ; CHECK-NEXT: add v17.2d, v17.2d, v0.2d ; CHECK-NEXT: add v4.2d, v4.2d, v0.2d -; CHECK-NEXT: add v0.2d, v6.2d, v0.2d +; CHECK-NEXT: add v6.2d, v6.2d, v0.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: ldr q6, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: stp q12, q31, [x8, #80] -; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload -; CHECK-NEXT: str q6, [x8] -; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: str q29, [x8, #112] -; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload -; CHECK-NEXT: stp q6, q11, [x8, #16] -; CHECK-NEXT: ldr q6, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: stp q18, q30, [x8, #144] -; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload -; CHECK-NEXT: stp q6, q13, [x8, #48] -; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: stp q28, q26, [x8, #176] -; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: stp q19, q10, [x8, #336] -; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload -; CHECK-NEXT: str q27, [x8, #208] +; CHECK-NEXT: stp q13, q12, [x8] +; CHECK-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: stp q11, q10, [x8, #32] +; CHECK-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload +; CHECK-NEXT: stp q9, q8, [x8, #64] +; CHECK-NEXT: ldp 
d9, d8, [sp, #128] // 16-byte Folded Reload +; CHECK-NEXT: stp q15, q30, [x8, #144] +; CHECK-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload +; CHECK-NEXT: stp q31, q29, [x8, #96] +; CHECK-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: stp q28, q27, [x8, #176] +; CHECK-NEXT: ldp d15, d14, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: str q26, [x8, #208] ; CHECK-NEXT: stp q25, q24, [x8, #240] ; CHECK-NEXT: stp q23, q22, [x8, #272] ; CHECK-NEXT: stp q21, q20, [x8, #304] +; CHECK-NEXT: stp q19, q18, [x8, #336] ; CHECK-NEXT: stp q17, q16, [x8, #368] ; CHECK-NEXT: stp q7, q5, [x8, #400] ; CHECK-NEXT: stp q4, q3, [x8, #432] ; CHECK-NEXT: stp q1, q2, [x8, #464] -; CHECK-NEXT: str q0, [x8, #496] -; CHECK-NEXT: add sp, sp, #192 +; CHECK-NEXT: str q6, [x8, #496] +; CHECK-NEXT: add sp, sp, #176 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w19 ; CHECK-NEXT: .cfi_restore w20 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index 70987df1c9c04e..5f7a22ed055c89 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -333,21 +333,21 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: punpklo p2.h, p1.b +; VBITS_GE_256-NEXT: punpklo p2.h, p0.b +; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z4.d] +; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d] +; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: ret ; @@ -711,21 +711,21 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: punpklo p2.h, p1.b ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: punpklo p2.h, p0.b +; 
VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z4.d] +; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d] +; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index a5303c901b80f3..6e29f7cbabcc80 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -33,75 +33,75 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: umov w8, v0.b[8] ; CHECK-NEXT: umov w9, v0.b[9] ; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v2.16b, v0.16b ; CHECK-NEXT: umov w11, v0.b[15] -; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: umov w8, v0.b[10] -; CHECK-NEXT: mov v1.b[1], w10 +; CHECK-NEXT: mov v2.b[1], w10 ; CHECK-NEXT: umov w10, v0.b[11] -; CHECK-NEXT: mov v2.b[1], w9 +; CHECK-NEXT: mov v1.b[1], w9 ; CHECK-NEXT: umov w9, v0.b[2] -; CHECK-NEXT: mov v2.b[2], w8 +; CHECK-NEXT: mov v1.b[2], w8 ; CHECK-NEXT: umov w8, v0.b[3] -; CHECK-NEXT: mov v1.b[2], w9 +; CHECK-NEXT: mov v2.b[2], w9 ; CHECK-NEXT: umov w9, v0.b[12] -; CHECK-NEXT: mov v2.b[3], w10 +; CHECK-NEXT: mov v1.b[3], w10 ; CHECK-NEXT: umov w10, v0.b[4] -; CHECK-NEXT: mov v1.b[3], w8 +; CHECK-NEXT: mov v2.b[3], w8 ; CHECK-NEXT: umov w8, v0.b[13] -; CHECK-NEXT: mov v2.b[4], w9 +; CHECK-NEXT: mov v1.b[4], w9 ; CHECK-NEXT: umov w9, v0.b[5] -; CHECK-NEXT: mov v1.b[4], w10 +; CHECK-NEXT: mov v2.b[4], w10 ; CHECK-NEXT: umov w10, v0.b[14] -; CHECK-NEXT: mov v2.b[5], w8 +; CHECK-NEXT: mov v1.b[5], w8 ; CHECK-NEXT: umov w8, v0.b[6] -; CHECK-NEXT: mov v1.b[5], w9 +; CHECK-NEXT: mov v2.b[5], w9 ; CHECK-NEXT: umov w9, v0.b[7] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 -; CHECK-NEXT: mov v2.b[6], w10 -; CHECK-NEXT: mov v1.b[6], w8 +; CHECK-NEXT: mov v1.b[6], w10 +; CHECK-NEXT: mov v2.b[6], w8 ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: mov x10, #8 // =0x8 ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NEXT: mov v2.b[7], w11 -; CHECK-NEXT: mov v1.b[7], w9 +; CHECK-NEXT: mov v1.b[7], w11 +; CHECK-NEXT: mov v2.b[7], w9 ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: mov x9, #24 // =0x18 -; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: lsl z3.s, z3.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: lsl z2.s, z2.s, #31 ; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: lsl z2.s, z2.s, #31 ; CHECK-NEXT: and z0.s, z0.s, #0x1 ; CHECK-NEXT: and z3.s, z3.s, #0x1 -; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl 
#2] -; CHECK-NEXT: and z2.s, z2.s, #0x1 ; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: and z2.s, z2.s, #0x1 ; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 -; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, #0 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 +; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0 ; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] ; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 ; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll index 1bace71db0c118..486f59d7900e95 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll @@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) { ; CHECK-NEXT: b.lt .LBB70_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, #1 // =0x1 -; CHECK-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NEXT: whilelo p1.s, xzr, x9 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: cntw x10 ; CHECK-NEXT: .LBB70_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2, x8, lsl #2] -; CHECK-NEXT: mad z1.s, p1/m, z2.s, z0.s -; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p1/z, [x2, x8, lsl #2] +; CHECK-NEXT: mad z1.s, p0/m, z2.s, z0.s +; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] ; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p0.s, x8, x9 +; CHECK-NEXT: whilelo p1.s, x8, x9 ; CHECK-NEXT: b.mi .LBB70_2 ; CHECK-NEXT: .LBB70_3: // %for.cond.cleanup ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index d6adf9cf0ad672..756e25f8e3368d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -215,44 +215,44 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #128 ; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: mov z3.h, z1.h[3] -; CHECK-NEXT: mov z4.h, z1.h[2] -; CHECK-NEXT: fcvtzu x8, h1 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: fcvtzu x10, h0 -; CHECK-NEXT: fcvtzu x9, h2 -; CHECK-NEXT: fcvtzu x11, h3 -; CHECK-NEXT: fcvtzu x12, h4 -; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: mov z4.h, z1.h[3] -; CHECK-NEXT: fcvtzu x13, h1 -; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: mov z3.h, z0.h[1] -; CHECK-NEXT: stp x8, x9, [sp, #32] -; CHECK-NEXT: fcvtzu x8, h2 -; CHECK-NEXT: fcvtzu x9, h4 -; CHECK-NEXT: stp x12, x11, [sp, #48] -; CHECK-NEXT: fcvtzu x11, h1 -; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: ldp q0, q4, [x0] +; 
CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z3.h, z0.h[2] +; CHECK-NEXT: fcvtzu x8, h0 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: mov z5.h, z4.h[3] +; CHECK-NEXT: fcvtzu x10, h4 +; CHECK-NEXT: fcvtzu x9, h1 +; CHECK-NEXT: fcvtzu x11, h2 ; CHECK-NEXT: fcvtzu x12, h3 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: fcvtzu x13, h0 +; CHECK-NEXT: mov z0.h, z0.h[2] +; CHECK-NEXT: mov z2.h, z4.h[1] +; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: fcvtzu x8, h1 +; CHECK-NEXT: fcvtzu x9, h3 +; CHECK-NEXT: stp x12, x11, [sp, #48] +; CHECK-NEXT: fcvtzu x11, h0 +; CHECK-NEXT: mov z1.h, z4.h[2] +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: fcvtzu x12, h2 ; CHECK-NEXT: stp x13, x8, [sp] ; CHECK-NEXT: fcvtzu x8, h5 ; CHECK-NEXT: stp x11, x9, [sp, #16] -; CHECK-NEXT: fcvtzu x9, h2 -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: fcvtzu x11, h0 -; CHECK-NEXT: mov z0.h, z0.h[2] +; CHECK-NEXT: fcvtzu x9, h1 +; CHECK-NEXT: mov z0.h, z4.h[1] +; CHECK-NEXT: mov z1.h, z4.h[3] +; CHECK-NEXT: mov z2.h, z4.h[2] +; CHECK-NEXT: fcvtzu x11, h4 ; CHECK-NEXT: stp x10, x12, [sp, #96] ; CHECK-NEXT: ldp q3, q4, [sp] -; CHECK-NEXT: fcvtzu x10, h1 -; CHECK-NEXT: fcvtzu x12, h2 +; CHECK-NEXT: fcvtzu x10, h0 +; CHECK-NEXT: fcvtzu x12, h1 ; CHECK-NEXT: stp x9, x8, [sp, #112] -; CHECK-NEXT: fcvtzu x8, h0 +; CHECK-NEXT: fcvtzu x8, h2 ; CHECK-NEXT: ldp q0, q1, [sp, #32] ; CHECK-NEXT: ldp q6, q7, [sp, #96] ; CHECK-NEXT: stp x11, x10, [sp, #64] @@ -965,44 +965,44 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #128 ; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: mov z3.h, z1.h[3] -; CHECK-NEXT: mov z4.h, z1.h[2] -; CHECK-NEXT: fcvtzs x8, h1 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: fcvtzs x10, h0 -; CHECK-NEXT: fcvtzs x9, h2 -; CHECK-NEXT: fcvtzs x11, h3 -; CHECK-NEXT: fcvtzs x12, h4 -; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: mov z4.h, z1.h[3] -; CHECK-NEXT: fcvtzs x13, h1 -; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: mov z3.h, z0.h[1] -; CHECK-NEXT: stp x8, x9, [sp, #32] -; CHECK-NEXT: fcvtzs x8, h2 -; CHECK-NEXT: fcvtzs x9, h4 -; CHECK-NEXT: stp x12, x11, [sp, #48] -; CHECK-NEXT: fcvtzs x11, h1 -; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: ldp q0, q4, [x0] +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z3.h, z0.h[2] +; CHECK-NEXT: fcvtzs x8, h0 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: mov z5.h, z4.h[3] +; CHECK-NEXT: fcvtzs x10, h4 +; CHECK-NEXT: fcvtzs x9, h1 +; CHECK-NEXT: fcvtzs x11, h2 ; CHECK-NEXT: fcvtzs x12, h3 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: fcvtzs x13, h0 +; CHECK-NEXT: mov z0.h, z0.h[2] +; CHECK-NEXT: mov z2.h, z4.h[1] +; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: fcvtzs x8, h1 +; CHECK-NEXT: fcvtzs x9, h3 +; CHECK-NEXT: stp x12, x11, [sp, #48] +; CHECK-NEXT: fcvtzs x11, h0 +; CHECK-NEXT: mov z1.h, z4.h[2] +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: fcvtzs x12, h2 ; CHECK-NEXT: stp x13, x8, [sp] ; CHECK-NEXT: fcvtzs x8, h5 ; CHECK-NEXT: stp x11, x9, [sp, #16] -; CHECK-NEXT: fcvtzs x9, h2 -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: fcvtzs x11, h0 -; CHECK-NEXT: mov z0.h, z0.h[2] +; CHECK-NEXT: fcvtzs x9, h1 +; CHECK-NEXT: mov z0.h, z4.h[1] +; CHECK-NEXT: mov z1.h, z4.h[3] 
+; CHECK-NEXT: mov z2.h, z4.h[2] +; CHECK-NEXT: fcvtzs x11, h4 ; CHECK-NEXT: stp x10, x12, [sp, #96] ; CHECK-NEXT: ldp q3, q4, [sp] -; CHECK-NEXT: fcvtzs x10, h1 -; CHECK-NEXT: fcvtzs x12, h2 +; CHECK-NEXT: fcvtzs x10, h0 +; CHECK-NEXT: fcvtzs x12, h1 ; CHECK-NEXT: stp x9, x8, [sp, #112] -; CHECK-NEXT: fcvtzs x8, h0 +; CHECK-NEXT: fcvtzs x8, h2 ; CHECK-NEXT: ldp q0, q1, [sp, #32] ; CHECK-NEXT: ldp q6, q7, [sp, #96] ; CHECK-NEXT: stp x11, x10, [sp, #64] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index 5e3ce0ddebe2e0..0de7e050eabca3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -105,55 +105,55 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: sdiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q6, q2, [x0] +; CHECK-NEXT: ldp q6, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q7, q3, [x1] -; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: ldp q7, q1, [x1] +; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: mov z16.d, z6.d -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: ext z1.b, z1.b, z2.b, #8 +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: ext z16.b, z16.b, z6.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z6.h, z6.b -; CHECK-NEXT: ext z0.b, z0.b, z3.b, #8 ; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z16.h, z16.b -; CHECK-NEXT: sunpklo z4.h, z0.b -; CHECK-NEXT: sunpklo z5.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z18.s, z16.h -; CHECK-NEXT: sunpklo z0.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z4.s, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sunpklo z16.s, z16.h -; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s -; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z4.s -; CHECK-NEXT: sunpklo z4.h, z2.b -; CHECK-NEXT: sunpklo z2.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z3.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: mov z5.d, z7.d ; CHECK-NEXT: ext z5.b, z5.b, z7.b, #8 ; CHECK-NEXT: sunpklo z7.h, z7.b -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z17.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; 
CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: sunpklo z18.s, z6.h ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: sunpklo z6.s, z6.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z16.s ; CHECK-NEXT: sunpklo z16.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 @@ -165,18 +165,18 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: uzp1 z2.b, z7.b, z7.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b -; CHECK-NEXT: stp q3, q2, [x0] +; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: stp q2, q3, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -472,55 +472,55 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: udiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q6, q2, [x0] +; CHECK-NEXT: ldp q6, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q7, q3, [x1] -; CHECK-NEXT: mov z1.d, z2.d +; CHECK-NEXT: ldp q7, q1, [x1] +; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: mov z16.d, z6.d -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: ext z1.b, z1.b, z2.b, #8 +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ext z16.b, z16.b, z6.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z6.h, z6.b -; CHECK-NEXT: ext z0.b, z0.b, z3.b, #8 ; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z16.h, z16.b -; CHECK-NEXT: uunpklo z4.h, z0.b -; CHECK-NEXT: uunpklo z5.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z18.s, z16.h -; CHECK-NEXT: uunpklo z0.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpklo z16.s, z16.h -; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z5.s -; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z4.s -; CHECK-NEXT: uunpklo z4.h, z2.b -; CHECK-NEXT: uunpklo z2.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z3.s, z1.h +; 
CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: mov z5.d, z7.d ; CHECK-NEXT: ext z5.b, z5.b, z7.b, #8 ; CHECK-NEXT: uunpklo z7.h, z7.b -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z17.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: uunpklo z18.s, z6.h ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: uunpklo z6.s, z6.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z16.s ; CHECK-NEXT: uunpklo z16.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 @@ -532,18 +532,18 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: uzp1 z2.b, z7.b, z7.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b -; CHECK-NEXT: stp q3, q2, [x0] +; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: stp q2, q3, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index eb95a410209b45..68075b196893b3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -109,84 +109,81 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: srem_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: ldp q16, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q17, q1, [x1] ; CHECK-NEXT: ptrue p1.b, vl16 -; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: sunpklo z7.h, z1.b -; CHECK-NEXT: sunpklo z16.h, z0.b -; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: mov z18.d, z16.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z6.s, z7.h -; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 -; CHECK-NEXT: sunpklo z17.s, z16.h -; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 -; CHECK-NEXT: sunpklo z4.h, z2.b +; CHECK-NEXT: ext z18.b, z18.b, z16.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sunpklo z16.s, z16.h -; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z17.s -; CHECK-NEXT: sunpklo z2.s, z4.h +; CHECK-NEXT: sunpklo z18.h, z18.b +; CHECK-NEXT: sunpklo z2.h, z2.b ; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z4.b, 
z4.b, z4.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: movprfx z5, z3 -; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z4.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ldr q4, [x1] -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: mov z18.d, z3.d -; CHECK-NEXT: mov z17.d, z4.d -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8 -; CHECK-NEXT: ext z17.b, z17.b, z4.b, #8 -; CHECK-NEXT: sunpklo z18.h, z18.b -; CHECK-NEXT: sunpklo z17.h, z17.b ; CHECK-NEXT: sunpklo z20.s, z18.h +; CHECK-NEXT: sunpklo z4.s, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 -; CHECK-NEXT: sunpklo z19.s, z17.h -; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 -; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z18.s, z18.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.h, z0.b +; CHECK-NEXT: sunpklo z7.s, z5.h +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z3.h, z1.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: sunpklo z6.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: mov z7.d, z17.d +; CHECK-NEXT: ext z7.b, z7.b, z17.b, #8 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: sunpklo z17.s, z17.h +; CHECK-NEXT: sunpklo z7.h, z7.b +; CHECK-NEXT: sunpklo z19.s, z7.h +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: sunpklo z20.h, z3.b -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: sunpklo z20.h, z16.b +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: sunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 ; CHECK-NEXT: sunpklo z20.s, z20.h -; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: sunpklo z18.h, z4.b -; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: sunpklo z18.h, z17.b +; CHECK-NEXT: uzp1 z5.h, z19.h, z19.h ; CHECK-NEXT: sunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: sunpklo z18.s, z18.h ; CHECK-NEXT: sdivr z21.s, p0/m, z21.s, z22.s -; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b +; CHECK-NEXT: splice z5.h, p0, z5.h, z7.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h +; CHECK-NEXT: splice z6.h, p0, z6.h, z3.h +; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z5.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b -; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b -; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b -; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b +; CHECK-NEXT: uzp1 z3.b, z19.b, 
z19.b +; CHECK-NEXT: splice z5.b, p0, z5.b, z4.b +; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b +; CHECK-NEXT: movprfx z2, z16 +; CHECK-NEXT: mls z2.b, p1/m, z3.b, z17.b +; CHECK-NEXT: mls z0.b, p1/m, z5.b, z1.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -498,84 +495,81 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: urem_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: ldp q16, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q17, q1, [x1] ; CHECK-NEXT: ptrue p1.b, vl16 -; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: uunpklo z7.h, z1.b -; CHECK-NEXT: uunpklo z16.h, z0.b -; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: mov z18.d, z16.d +; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z6.s, z7.h -; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 -; CHECK-NEXT: uunpklo z17.s, z16.h -; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 -; CHECK-NEXT: uunpklo z4.h, z2.b +; CHECK-NEXT: ext z18.b, z18.b, z16.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: uunpklo z16.s, z16.h -; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z17.s -; CHECK-NEXT: uunpklo z2.s, z4.h +; CHECK-NEXT: uunpklo z18.h, z18.b +; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: movprfx z5, z3 -; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z4.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ldr q4, [x1] -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: mov z18.d, z3.d -; CHECK-NEXT: mov z17.d, z4.d -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8 -; CHECK-NEXT: ext z17.b, z17.b, z4.b, #8 -; CHECK-NEXT: uunpklo z18.h, z18.b -; CHECK-NEXT: uunpklo z17.h, z17.b ; CHECK-NEXT: uunpklo z20.s, z18.h +; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 -; CHECK-NEXT: uunpklo z19.s, z17.h -; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 -; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z18.s, z18.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.h, z0.b +; CHECK-NEXT: uunpklo z7.s, z5.h +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z3.h, z1.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uunpklo z6.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: mov z7.d, z17.d +; CHECK-NEXT: ext z7.b, z7.b, z17.b, #8 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: uunpklo z17.s, z17.h +; CHECK-NEXT: uunpklo z7.h, z7.b +; CHECK-NEXT: uunpklo z19.s, z7.h +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: uunpklo z20.h, z3.b -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: uunpklo z20.h, z16.b +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: uunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, 
#8 ; CHECK-NEXT: uunpklo z20.s, z20.h -; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: uunpklo z18.h, z4.b -; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: uunpklo z18.h, z17.b +; CHECK-NEXT: uzp1 z5.h, z19.h, z19.h ; CHECK-NEXT: uunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: uunpklo z18.s, z18.h ; CHECK-NEXT: udivr z21.s, p0/m, z21.s, z22.s -; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b +; CHECK-NEXT: splice z5.h, p0, z5.h, z7.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h +; CHECK-NEXT: splice z6.h, p0, z6.h, z3.h +; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z5.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b -; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b -; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b -; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b +; CHECK-NEXT: uzp1 z3.b, z19.b, z19.b +; CHECK-NEXT: splice z5.b, p0, z5.b, z4.b +; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b +; CHECK-NEXT: movprfx z2, z16 +; CHECK-NEXT: mls z2.b, p1/m, z3.b, z17.b +; CHECK-NEXT: mls z0.b, p1/m, z5.b, z1.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index 1f036fa08ef155..289707488fc386 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -80,119 +80,119 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: ldp q0, q4, [x0] -; CHECK-NEXT: ldp q2, q6, [x0, #32] -; CHECK-NEXT: mov z16.h, z3.h[7] -; CHECK-NEXT: mov z18.h, z3.h[6] -; CHECK-NEXT: mov z17.h, z4.h[7] -; CHECK-NEXT: ldp q5, q7, [x1, #32] -; CHECK-NEXT: mov z19.h, z4.h[6] +; CHECK-NEXT: ldp q2, q4, [x1] +; CHECK-NEXT: ldp q0, q5, [x0] +; CHECK-NEXT: ldp q1, q6, [x0, #32] +; CHECK-NEXT: mov z16.h, z4.h[7] +; CHECK-NEXT: mov z18.h, z4.h[6] +; CHECK-NEXT: mov z17.h, z5.h[7] +; CHECK-NEXT: ldp q3, q7, [x1, #32] +; CHECK-NEXT: mov z19.h, z5.h[6] ; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z3.h[5] +; CHECK-NEXT: mov z16.h, z4.h[5] ; CHECK-NEXT: fmov w9, s17 -; CHECK-NEXT: mov z17.h, z4.h[5] -; CHECK-NEXT: mov z20.h, z7.h[6] +; CHECK-NEXT: mov z17.h, z5.h[5] ; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z3.h[4] +; CHECK-NEXT: mov z18.h, z4.h[4] ; CHECK-NEXT: strh w9, [sp, #28] ; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: mov z19.h, z6.h[7] -; CHECK-NEXT: zip1 z3.h, z4.h, z3.h +; CHECK-NEXT: mov z19.h, z7.h[6] +; CHECK-NEXT: zip1 z4.h, z5.h, z4.h ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z4.h[4] +; CHECK-NEXT: mov z16.h, 
z5.h[4] ; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: zip1 z4.h, z6.h, z7.h +; CHECK-NEXT: zip1 z5.h, z6.h, z7.h ; CHECK-NEXT: strh w8, [sp, #22] ; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z1.h[7] -; CHECK-NEXT: add z3.h, z3.h, z4.h +; CHECK-NEXT: mov z17.h, z2.h[7] ; CHECK-NEXT: strh w8, [sp, #20] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z0.h[7] ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z1.h[6] +; CHECK-NEXT: mov z16.h, z2.h[6] ; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: fmov w8, s17 ; CHECK-NEXT: mov z17.h, z0.h[6] ; CHECK-NEXT: strh w8, [sp, #62] ; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z1.h[5] +; CHECK-NEXT: mov z18.h, z2.h[5] ; CHECK-NEXT: strh w8, [sp, #60] ; CHECK-NEXT: fmov w8, s16 ; CHECK-NEXT: mov z16.h, z0.h[5] ; CHECK-NEXT: strh w8, [sp, #58] ; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z1.h[4] +; CHECK-NEXT: mov z17.h, z2.h[4] ; CHECK-NEXT: strh w8, [sp, #56] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z0.h[4] -; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: zip1 z1.h, z2.h, z5.h +; CHECK-NEXT: zip1 z0.h, z0.h, z2.h ; CHECK-NEXT: strh w8, [sp, #54] ; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: ldr q16, [sp, #16] -; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: mov z16.h, z7.h[7] ; CHECK-NEXT: strh w8, [sp, #52] ; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: mov z17.h, z6.h[7] ; CHECK-NEXT: strh w8, [sp, #50] ; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z7.h[7] +; CHECK-NEXT: ldr q18, [sp, #16] ; CHECK-NEXT: strh w8, [sp, #48] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z6.h[6] -; CHECK-NEXT: ldr q17, [sp, #48] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z6.h[6] +; CHECK-NEXT: ldr q20, [sp, #48] ; CHECK-NEXT: strh w8, [sp, #46] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z7.h[5] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: mov z17.h, z7.h[5] ; CHECK-NEXT: strh w8, [sp, #44] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z6.h[5] +; CHECK-NEXT: fmov w8, s19 +; CHECK-NEXT: mov z19.h, z6.h[5] ; CHECK-NEXT: strh w8, [sp, #42] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z7.h[4] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z7.h[4] ; CHECK-NEXT: strh w8, [sp, #40] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z6.h[4] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: mov z17.h, z6.h[4] ; CHECK-NEXT: strh w8, [sp, #38] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z5.h[7] +; CHECK-NEXT: fmov w8, s19 +; CHECK-NEXT: mov z19.h, z3.h[7] ; CHECK-NEXT: strh w8, [sp, #36] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z2.h[7] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z1.h[7] ; CHECK-NEXT: strh w8, [sp, #34] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z5.h[6] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: mov z17.h, z3.h[6] ; CHECK-NEXT: strh w8, [sp, #32] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z2.h[6] +; CHECK-NEXT: fmov w8, s19 +; CHECK-NEXT: mov z19.h, z1.h[6] +; CHECK-NEXT: ldr q2, [sp, #32] ; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z5.h[5] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z3.h[5] +; CHECK-NEXT: add z2.h, z18.h, z2.h ; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z2.h[5] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: mov z17.h, z1.h[5] ; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z5.h[4] -; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: fmov w8, s19 +; 
CHECK-NEXT: mov z19.h, z3.h[4] +; CHECK-NEXT: fmov w9, s17 ; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z2.h[4] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z1.h[4] +; CHECK-NEXT: zip1 z1.h, z1.h, z3.h ; CHECK-NEXT: strh w9, [sp, #4] -; CHECK-NEXT: ldr q2, [sp, #32] +; CHECK-NEXT: add z3.h, z4.h, z5.h ; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: add z2.h, z16.h, z2.h +; CHECK-NEXT: fmov w8, s19 +; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: fmov w8, s16 ; CHECK-NEXT: strh w8, [sp] ; CHECK-NEXT: ldr q4, [sp] ; CHECK-NEXT: stp q3, q2, [x0, #32] -; CHECK-NEXT: add z1.h, z17.h, z4.h +; CHECK-NEXT: add z1.h, z20.h, z4.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret @@ -956,22 +956,20 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: mov z4.h, z3.h[6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z6.h, z3.h[2] -; CHECK-NEXT: mov z5.h, z3.h[4] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z7.h, z2.h[6] -; CHECK-NEXT: mov z17.h, z2.h[7] -; CHECK-NEXT: mov z16.h, z3.h[1] +; CHECK-NEXT: mov z4.h, z2.h[6] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z6.h, z2.h[2] +; CHECK-NEXT: mov z5.h, z2.h[4] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z7.h, z3.h[6] ; CHECK-NEXT: strh w8, [sp, #40] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z2.h[4] +; CHECK-NEXT: mov z4.h, z3.h[4] ; CHECK-NEXT: strh w9, [sp, #32] ; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z2.h[2] +; CHECK-NEXT: mov z5.h, z3.h[2] ; CHECK-NEXT: strh w8, [sp, #46] ; CHECK-NEXT: fmov w8, s6 ; CHECK-NEXT: mov z6.h, z1.h[2] @@ -982,13 +980,12 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: fmov w8, s4 ; CHECK-NEXT: mov z4.h, z1.h[6] ; CHECK-NEXT: strh w9, [sp, #38] -; CHECK-NEXT: fmov w9, s16 ; CHECK-NEXT: strh w8, [sp, #36] ; CHECK-NEXT: fmov w8, s5 ; CHECK-NEXT: mov z5.h, z1.h[4] -; CHECK-NEXT: strh w9, [sp, #56] ; CHECK-NEXT: strh w8, [sp, #34] ; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ldr q16, [sp, #32] ; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: strh w8, [sp] @@ -999,63 +996,66 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: mov z5.h, z0.h[2] ; CHECK-NEXT: strh w8, [sp, #12] ; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z3.h[7] +; CHECK-NEXT: mov z6.h, z2.h[7] ; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.h, z3.h[5] +; CHECK-NEXT: mov z7.h, z3.h[7] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z4.h, z2.h[5] ; CHECK-NEXT: strh w8, [sp, #4] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z3.h[3] -; CHECK-NEXT: ldr q3, [sp, #32] +; CHECK-NEXT: mov z5.h, z2.h[3] +; CHECK-NEXT: mov z2.h, z2.h[1] ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z2.h[5] -; CHECK-NEXT: ldr q4, [sp] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.h, z3.h[1] +; CHECK-NEXT: ldr q6, [sp] ; CHECK-NEXT: strh w8, [sp, #62] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.h, z1.h[7] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z4.h, z3.h[5] +; CHECK-NEXT: strh w9, [sp, #56] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.h, z0.h[7] ; CHECK-NEXT: strh w8, [sp, #60] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z2.h[3] -; CHECK-NEXT: 
mov z2.h, z2.h[1] +; CHECK-NEXT: mov z5.h, z3.h[3] +; CHECK-NEXT: mov z3.h, z1.h[7] +; CHECK-NEXT: strh w9, [sp, #48] ; CHECK-NEXT: strh w8, [sp, #58] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.h, z0.h[7] +; CHECK-NEXT: fmov w8, s7 ; CHECK-NEXT: strh w8, [sp, #54] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z1.h[5] -; CHECK-NEXT: strh w9, [sp, #48] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z4.h, z1.h[5] ; CHECK-NEXT: strh w8, [sp, #52] ; CHECK-NEXT: fmov w8, s5 ; CHECK-NEXT: mov z5.h, z1.h[3] ; CHECK-NEXT: mov z1.h, z1.h[1] ; CHECK-NEXT: strh w8, [sp, #50] -; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z3.h, z0.h[5] ; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z0.h[5] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z4.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: fmov w8, s4 ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: add z0.h, z3.h, z0.h +; CHECK-NEXT: add z0.h, z16.h, z0.h ; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: add z1.h, z4.h, z1.h +; CHECK-NEXT: add z1.h, z6.h, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret @@ -1133,45 +1133,45 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov z2.h, z1.h[6] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z4.h, z1.h[2] -; CHECK-NEXT: mov z6.h, z0.h[4] -; CHECK-NEXT: mov z3.h, z1.h[4] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z5.h, z0.h[6] +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: mov z6.h, z1.h[4] +; CHECK-NEXT: mov z3.h, z0.h[4] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z5.h, z1.h[6] ; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: mov z2.h, z1.h[2] ; CHECK-NEXT: strh w9, [sp] ; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.h, z1.h[7] +; CHECK-NEXT: mov z3.h, z0.h[7] ; CHECK-NEXT: strh w8, [sp, #14] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[5] +; CHECK-NEXT: mov z4.h, z0.h[5] ; CHECK-NEXT: strh w9, [sp, #12] ; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z1.h[3] -; CHECK-NEXT: mov z1.h, z1.h[1] +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s6 ; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z0.h, z1.h[1] ; CHECK-NEXT: strh w8, [sp, #4] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[7] +; CHECK-NEXT: mov z2.h, z1.h[7] ; CHECK-NEXT: strh w9, [sp, #24] ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z0.h[5] +; CHECK-NEXT: mov z4.h, z1.h[5] ; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z0.h[3] -; 
CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: mov z5.h, z1.h[3] ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #22] diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll index cd06f8dbfad84c..bd4a5c93a1ab87 100644 --- a/llvm/test/CodeGen/AArch64/swifterror.ll +++ b/llvm/test/CodeGen/AArch64/swifterror.ll @@ -1618,7 +1618,7 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_ ; CHECK-APPLE-LABEL: params_and_return_in_reg: ; CHECK-APPLE: ; %bb.0: ; CHECK-APPLE-NEXT: sub sp, sp, #128 -; CHECK-APPLE-NEXT: stp x20, x28, [sp, #24] ; 16-byte Folded Spill +; CHECK-APPLE-NEXT: stp x21, x28, [sp, #24] ; 16-byte Folded Spill ; CHECK-APPLE-NEXT: stp x27, x26, [sp, #48] ; 16-byte Folded Spill ; CHECK-APPLE-NEXT: stp x25, x24, [sp, #64] ; 16-byte Folded Spill ; CHECK-APPLE-NEXT: stp x23, x22, [sp, #80] ; 16-byte Folded Spill @@ -1637,8 +1637,8 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_ ; CHECK-APPLE-NEXT: .cfi_offset w26, -72 ; CHECK-APPLE-NEXT: .cfi_offset w27, -80 ; CHECK-APPLE-NEXT: .cfi_offset w28, -96 -; CHECK-APPLE-NEXT: mov x23, x21 -; CHECK-APPLE-NEXT: str x7, [sp, #16] ; 8-byte Folded Spill +; CHECK-APPLE-NEXT: str x20, [sp, #8] ; 8-byte Folded Spill +; CHECK-APPLE-NEXT: mov x23, x7 ; CHECK-APPLE-NEXT: mov x24, x6 ; CHECK-APPLE-NEXT: mov x25, x5 ; CHECK-APPLE-NEXT: mov x26, x4 @@ -1657,7 +1657,7 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_ ; CHECK-APPLE-NEXT: mov x20, xzr ; CHECK-APPLE-NEXT: mov x21, xzr ; CHECK-APPLE-NEXT: bl _params_in_reg2 -; CHECK-APPLE-NEXT: str x21, [sp, #8] ; 8-byte Folded Spill +; CHECK-APPLE-NEXT: str x21, [sp, #16] ; 8-byte Folded Spill ; CHECK-APPLE-NEXT: mov x0, x22 ; CHECK-APPLE-NEXT: mov x1, x19 ; CHECK-APPLE-NEXT: mov x2, x28 @@ -1665,17 +1665,18 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_ ; CHECK-APPLE-NEXT: mov x4, x26 ; CHECK-APPLE-NEXT: mov x5, x25 ; CHECK-APPLE-NEXT: mov x6, x24 -; CHECK-APPLE-NEXT: ldp x7, x20, [sp, #16] ; 16-byte Folded Reload -; CHECK-APPLE-NEXT: mov x21, x23 +; CHECK-APPLE-NEXT: mov x7, x23 +; CHECK-APPLE-NEXT: ldr x20, [sp, #8] ; 8-byte Folded Reload +; CHECK-APPLE-NEXT: ldr x21, [sp, #24] ; 8-byte Folded Reload ; CHECK-APPLE-NEXT: bl _params_and_return_in_reg2 ; CHECK-APPLE-NEXT: mov x19, x0 ; CHECK-APPLE-NEXT: mov x22, x1 -; CHECK-APPLE-NEXT: mov x24, x2 -; CHECK-APPLE-NEXT: mov x25, x3 -; CHECK-APPLE-NEXT: mov x26, x4 -; CHECK-APPLE-NEXT: mov x27, x5 -; CHECK-APPLE-NEXT: mov x28, x6 -; CHECK-APPLE-NEXT: mov x23, x7 +; CHECK-APPLE-NEXT: mov x23, x2 +; CHECK-APPLE-NEXT: mov x24, x3 +; CHECK-APPLE-NEXT: mov x25, x4 +; CHECK-APPLE-NEXT: mov x26, x5 +; CHECK-APPLE-NEXT: mov x27, x6 +; CHECK-APPLE-NEXT: mov x28, x7 ; CHECK-APPLE-NEXT: str x21, [sp, #24] ; 8-byte Folded Spill ; CHECK-APPLE-NEXT: mov w0, #1 ; =0x1 ; CHECK-APPLE-NEXT: mov w1, #2 ; =0x2 @@ -1686,16 +1687,16 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_ ; CHECK-APPLE-NEXT: mov w6, #7 ; =0x7 ; CHECK-APPLE-NEXT: mov w7, #8 ; =0x8 ; CHECK-APPLE-NEXT: mov x20, xzr -; CHECK-APPLE-NEXT: ldr x21, [sp, #8] ; 8-byte Folded Reload +; CHECK-APPLE-NEXT: ldr x21, [sp, #16] ; 8-byte Folded Reload ; CHECK-APPLE-NEXT: bl _params_in_reg2 ; CHECK-APPLE-NEXT: mov x0, x19 ; CHECK-APPLE-NEXT: mov x1, x22 -; CHECK-APPLE-NEXT: mov x2, x24 -; CHECK-APPLE-NEXT: mov x3, x25 -; CHECK-APPLE-NEXT: mov x4, x26 -; CHECK-APPLE-NEXT: mov x5, x27 
-; CHECK-APPLE-NEXT: mov x6, x28 -; CHECK-APPLE-NEXT: mov x7, x23 +; CHECK-APPLE-NEXT: mov x2, x23 +; CHECK-APPLE-NEXT: mov x3, x24 +; CHECK-APPLE-NEXT: mov x4, x25 +; CHECK-APPLE-NEXT: mov x5, x26 +; CHECK-APPLE-NEXT: mov x6, x27 +; CHECK-APPLE-NEXT: mov x7, x28 ; CHECK-APPLE-NEXT: ldp x21, x28, [sp, #24] ; 16-byte Folded Reload ; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #112] ; 16-byte Folded Reload ; CHECK-APPLE-NEXT: ldp x20, x19, [sp, #96] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/vec-libcalls.ll b/llvm/test/CodeGen/AArch64/vec-libcalls.ll index e1b4967ed0fb93..37ad21a778e8ee 100644 --- a/llvm/test/CodeGen/AArch64/vec-libcalls.ll +++ b/llvm/test/CodeGen/AArch64/vec-libcalls.ll @@ -52,15 +52,15 @@ define <2 x float> @sin_v2f32(<2 x float> %x) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #48 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov s0, v0.s[1] ; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: bl sinf -; CHECK-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl sinf -; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.s[1], v1.s[0] diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll index c7134508883b11..4765e7fd2d1f95 100644 --- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll @@ -405,60 +405,60 @@ define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 { ; NOFP16-LABEL: test_copysign_v8f16_v8f32: ; NOFP16: ; %bb.0: ; NOFP16-NEXT: fcvtn v1.4h, v1.4s -; NOFP16-NEXT: mov h4, v0[1] +; NOFP16-NEXT: mov h3, v0[1] ; NOFP16-NEXT: fcvt s6, h0 -; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 +; NOFP16-NEXT: mvni.4s v5, #128, lsl #24 ; NOFP16-NEXT: mov h7, v0[2] ; NOFP16-NEXT: fcvtn v2.4h, v2.4s -; NOFP16-NEXT: mov h5, v1[1] +; NOFP16-NEXT: mov h4, v1[1] ; NOFP16-NEXT: fcvt s16, h1 -; NOFP16-NEXT: fcvt s4, h4 +; NOFP16-NEXT: fcvt s3, h3 ; NOFP16-NEXT: mov h17, v1[2] ; NOFP16-NEXT: mov h1, v1[3] ; NOFP16-NEXT: fcvt s7, h7 -; NOFP16-NEXT: fcvt s5, h5 -; NOFP16-NEXT: bif.16b v6, v16, v3 +; NOFP16-NEXT: fcvt s4, h4 +; NOFP16-NEXT: bif.16b v6, v16, v5 ; NOFP16-NEXT: mov h16, v0[3] ; NOFP16-NEXT: fcvt s17, h17 ; NOFP16-NEXT: fcvt s18, h1 -; NOFP16-NEXT: bif.16b v4, v5, v3 +; NOFP16-NEXT: bif.16b v3, v4, v5 ; NOFP16-NEXT: fcvt h1, s6 -; NOFP16-NEXT: mov.16b v6, v3 -; NOFP16-NEXT: mov h5, v0[4] +; NOFP16-NEXT: mov.16b v6, v5 +; NOFP16-NEXT: mov h4, v0[4] ; NOFP16-NEXT: fcvt s16, h16 ; NOFP16-NEXT: bsl.16b v6, v7, v17 ; NOFP16-NEXT: mov h7, v0[5] ; NOFP16-NEXT: mov h17, v2[1] -; NOFP16-NEXT: fcvt h4, s4 -; NOFP16-NEXT: fcvt s5, h5 -; NOFP16-NEXT: bif.16b v16, v18, v3 +; NOFP16-NEXT: fcvt h3, s3 +; NOFP16-NEXT: fcvt s4, h4 +; NOFP16-NEXT: bif.16b v16, v18, v5 ; NOFP16-NEXT: fcvt h6, s6 ; NOFP16-NEXT: fcvt s7, h7 ; NOFP16-NEXT: fcvt s17, h17 -; NOFP16-NEXT: mov.h v1[1], v4[0] -; NOFP16-NEXT: fcvt s4, h2 -; NOFP16-NEXT: bif.16b v7, v17, v3 -; NOFP16-NEXT: bit.16b v4, v5, v3 -; NOFP16-NEXT: fcvt h5, 
s16 +; NOFP16-NEXT: mov.h v1[1], v3[0] +; NOFP16-NEXT: fcvt s3, h2 +; NOFP16-NEXT: bif.16b v7, v17, v5 +; NOFP16-NEXT: bit.16b v3, v4, v5 +; NOFP16-NEXT: fcvt h4, s16 ; NOFP16-NEXT: mov.h v1[2], v6[0] ; NOFP16-NEXT: mov h6, v0[6] ; NOFP16-NEXT: mov h16, v2[2] ; NOFP16-NEXT: mov h0, v0[7] ; NOFP16-NEXT: mov h2, v2[3] -; NOFP16-NEXT: mov.h v1[3], v5[0] -; NOFP16-NEXT: fcvt h4, s4 -; NOFP16-NEXT: fcvt s5, h6 +; NOFP16-NEXT: mov.h v1[3], v4[0] +; NOFP16-NEXT: fcvt h3, s3 +; NOFP16-NEXT: fcvt s4, h6 ; NOFP16-NEXT: fcvt s6, h16 ; NOFP16-NEXT: fcvt s0, h0 ; NOFP16-NEXT: fcvt s2, h2 -; NOFP16-NEXT: mov.h v1[4], v4[0] -; NOFP16-NEXT: fcvt h4, s7 -; NOFP16-NEXT: bif.16b v5, v6, v3 -; NOFP16-NEXT: bif.16b v0, v2, v3 -; NOFP16-NEXT: mov.h v1[5], v4[0] -; NOFP16-NEXT: fcvt h4, s5 +; NOFP16-NEXT: mov.h v1[4], v3[0] +; NOFP16-NEXT: fcvt h3, s7 +; NOFP16-NEXT: bif.16b v4, v6, v5 +; NOFP16-NEXT: bif.16b v0, v2, v5 +; NOFP16-NEXT: mov.h v1[5], v3[0] +; NOFP16-NEXT: fcvt h3, s4 ; NOFP16-NEXT: fcvt h0, s0 -; NOFP16-NEXT: mov.h v1[6], v4[0] +; NOFP16-NEXT: mov.h v1[6], v3[0] ; NOFP16-NEXT: mov.h v1[7], v0[0] ; NOFP16-NEXT: mov.16b v0, v1 ; NOFP16-NEXT: ret diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll index 220dc70165e87c..674009dd29ec15 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -81,21 +81,21 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { ; ; GCN-LABEL: atomic_add_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: buffer_atomic_add v1, v2, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB1_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 @@ -179,21 +179,21 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { ; ; GCN-LABEL: atomic_sub_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB3_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: buffer_atomic_sub v1, v2, 
s[0:3], 0 idxen glc ; GCN-NEXT: .LBB3_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 @@ -281,22 +281,22 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { ; ; GCN-LABEL: atomic_xor_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB5_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: s_and_b32 s4, s4, 1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB5_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 @@ -383,21 +383,21 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) { ; ; GCN-LABEL: atomic_ptr_add_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB7_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: buffer_atomic_add v1, v2, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB7_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 @@ -485,21 +485,21 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) { ; ; GCN-LABEL: atomic_ptr_sub_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB9_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: buffer_atomic_sub v1, v2, s[0:3], 0 idxen glc ; 
GCN-NEXT: .LBB9_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 @@ -591,22 +591,22 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) { ; ; GCN-LABEL: atomic_ptr_xor_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB11_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: s_and_b32 s4, s4, 1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB11_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index 33a4d3c5494f7c..ccb0b3726f5723 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -11,40 +11,54 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, 
s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v6, v2 ; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:64 ; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96 -; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:176 +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:192 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill @@ -64,8 +78,6 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 ; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 -; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264 @@ -78,14 +90,14 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292 ; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296 ; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:332 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336 ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344 @@ -94,30 +106,51 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360 ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v33, off, 
s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:432 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v31, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -135,7 +168,9 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: v_and_b32_e32 v0, 63, v6 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GCN-NEXT: v_add_u32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v16, v20 @@ -155,22 +190,18 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, 
off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -186,40 +217,54 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v6, v2 ; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:64 ; 
GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96 -; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:176 +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:192 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill @@ -239,8 +284,6 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 ; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 -; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264 @@ -253,14 +296,14 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292 ; 
GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296 ; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:332 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336 ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344 @@ -269,30 +312,51 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360 ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:384 +; GCN-NEXT: 
buffer_store_dword v33, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:432 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -310,7 +374,9 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: v_bfe_u32 v0, v6, 1, 6 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GCN-NEXT: v_add_u32_e32 v0, v1, v0 ; GCN-NEXT: v_and_b32_e32 v1, 1, v6 ; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 4, v1 @@ -332,24 +398,20 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(16) +; GCN-NEXT: s_waitcnt vmcnt(12) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -365,40 +427,54 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v6, v2 ; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:64 ; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96 -; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:176 +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:192 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, 
s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill @@ -430,14 +506,14 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292 ; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296 ; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:332 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336 ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344 @@ -446,30 +522,51 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360 ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368 -; 
GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:432 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:608 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -510,22 +607,18 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index 786d65f7dcc40d..9c6e714c4329a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -158,102 +158,102 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_add_u32_e32 v16, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, v8, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v9, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v16, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v11, v7, vcc ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc +; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v17, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v15, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[3:4] ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v12, v8, v10, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v9, v11, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, v10, v6, vcc +; GFX8-NEXT: 
v_cndmask_b32_e32 v19, v11, v7, vcc ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc @@ -281,58 +281,58 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 offset:16 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v12, v8, v10, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v9, v11, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v16, v10, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v17, v11, v7, vcc ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc -; GFX7-NEXT: 
v_cndmask_b32_e32 v1, v6, v9, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v15, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index faab70cac39cbf..7db15d9fcbcd5f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -13,28 +13,28 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0x100 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x0 ; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x80 ; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NEXT: v_mov_b32_e32 v4, s40 -; GCN-NEXT: v_mov_b32_e32 v5, s41 -; GCN-NEXT: v_mov_b32_e32 v6, s42 -; GCN-NEXT: v_mov_b32_e32 v7, s43 -; GCN-NEXT: v_mov_b32_e32 v8, s44 -; GCN-NEXT: v_mov_b32_e32 v9, s45 -; GCN-NEXT: v_mov_b32_e32 v10, s46 -; GCN-NEXT: v_mov_b32_e32 v11, s47 -; GCN-NEXT: v_mov_b32_e32 v12, s48 -; GCN-NEXT: v_mov_b32_e32 v13, s49 -; GCN-NEXT: v_mov_b32_e32 v14, s50 -; GCN-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: v_mov_b32_e32 v7, s11 +; 
GCN-NEXT: v_mov_b32_e32 v8, s12 +; GCN-NEXT: v_mov_b32_e32 v9, s13 +; GCN-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NEXT: v_mov_b32_e32 v12, s16 +; GCN-NEXT: v_mov_b32_e32 v13, s17 +; GCN-NEXT: v_mov_b32_e32 v14, s18 +; GCN-NEXT: v_mov_b32_e32 v15, s19 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:260 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:264 @@ -83,71 +83,71 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:376 ; GCN-NEXT: v_mov_b32_e32 v0, s67 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:380 -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:384 -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s37 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:388 -; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s38 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:392 -; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s39 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:396 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s40 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:400 -; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s41 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:404 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s42 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:408 -; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s43 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:412 -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:416 -; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s45 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:420 -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s46 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:424 -; GCN-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NEXT: v_mov_b32_e32 v0, s47 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:428 -; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v0, s48 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:432 -; GCN-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NEXT: v_mov_b32_e32 v0, s49 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:436 -; GCN-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:440 -; GCN-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NEXT: v_mov_b32_e32 v0, s51 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:444 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:448 -; GCN-NEXT: v_mov_b32_e32 v0, s37 +; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:452 -; GCN-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:456 -; GCN-NEXT: v_mov_b32_e32 v0, s39 +; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:460 -; GCN-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:464 -; 
GCN-NEXT: v_mov_b32_e32 v0, s41 +; GCN-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:468 -; GCN-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:472 -; GCN-NEXT: v_mov_b32_e32 v0, s43 +; GCN-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:476 -; GCN-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:480 -; GCN-NEXT: v_mov_b32_e32 v0, s45 +; GCN-NEXT: v_mov_b32_e32 v0, s13 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:484 -; GCN-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:488 -; GCN-NEXT: v_mov_b32_e32 v0, s47 +; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:492 -; GCN-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:496 -; GCN-NEXT: v_mov_b32_e32 v0, s49 +; GCN-NEXT: v_mov_b32_e32 v0, s17 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:500 -; GCN-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NEXT: s_and_b32 s4, s25, 63 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:504 -; GCN-NEXT: v_mov_b32_e32 v0, s51 +; GCN-NEXT: v_mov_b32_e32 v0, s19 ; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:508 ; GCN-NEXT: v_add_u32_e32 v0, s4, v16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 61439021a88757..f6c6f29d227062 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -5822,34 +5822,34 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v3, v0 :: v_dual_cndmask_b32 v17, v4, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v19, v3, v0 :: v_dual_cndmask_b32 v18, v4, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 4, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v6, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 6, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v0 :: v_dual_cndmask_b32 v6, v8, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v7, v0 :: v_dual_cndmask_b32 v5, v8, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX11-NEXT: v_readfirstlane_b32 s0, v18 -; GFX11-NEXT: v_readfirstlane_b32 s1, v17 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v0, s6 +; GFX11-NEXT: v_readfirstlane_b32 s0, v19 +; GFX11-NEXT: v_readfirstlane_b32 s1, v18 ; GFX11-NEXT: v_readfirstlane_b32 s2, v3 -; GFX11-NEXT: v_readfirstlane_b32 s3, v4 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v9, v0 :: v_dual_cndmask_b32 v8, v10, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v9, v0 :: v_dual_cndmask_b32 v7, v10, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v11, v0, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, v1, s6 -; GFX11-NEXT: v_readfirstlane_b32 s4, v5 -; GFX11-NEXT: v_readfirstlane_b32 s5, v6 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v13, v0 :: v_dual_cndmask_b32 v12, v14, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v15, v0, s9 +; 
GFX11-NEXT: v_cndmask_b32_e64 v12, v15, v0, s9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v17 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v13, v0 :: v_dual_cndmask_b32 v11, v14, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v16, v1, s9 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_readfirstlane_b32 s8, v10 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 ; GFX11-NEXT: v_readfirstlane_b32 s9, v9 -; GFX11-NEXT: v_readfirstlane_b32 s10, v11 -; GFX11-NEXT: v_readfirstlane_b32 s11, v12 -; GFX11-NEXT: v_readfirstlane_b32 s12, v14 +; GFX11-NEXT: v_readfirstlane_b32 s10, v10 +; GFX11-NEXT: v_readfirstlane_b32 s11, v11 +; GFX11-NEXT: v_readfirstlane_b32 s12, v12 ; GFX11-NEXT: v_readfirstlane_b32 s13, v13 ; GFX11-NEXT: ; return to shader part epilog entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index a5482bd5b79a96..3a82e04880a72c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -1432,20 +1432,20 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] ; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_W32-NEXT: s_mov_b32 s2, 0 -; GFX10_W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10_W32-NEXT: s_mov_b32 s3, 0 +; GFX10_W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2 ; GFX10_W32-NEXT: ; %bb.1: ; %bb ; GFX10_W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10_W32-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10_W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10_W32-NEXT: s_cselect_b32 s3, 1, 0 ; GFX10_W32-NEXT: .LBB13_2: ; %exit -; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX10_W32-NEXT: s_and_b32 s2, 1, s3 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 @@ -1490,21 +1490,21 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX11_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W32-NEXT: global_load_b96 v[1:3], v1, s[2:3] -; GFX11_W32-NEXT: s_mov_b32 s2, 0 -; GFX11_W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11_W32-NEXT: s_mov_b32 s3, 0 +; GFX11_W32-NEXT: s_mov_b32 s2, exec_lo ; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2 ; GFX11_W32-NEXT: ; %bb.1: ; %bb ; GFX11_W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x50 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11_W32-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11_W32-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11_W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11_W32-NEXT: s_cselect_b32 s3, 1, 0 ; GFX11_W32-NEXT: .LBB13_2: ; %exit -; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11_W32-NEXT: s_or_b32 
exec_lo, exec_lo, s2 ; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W32-NEXT: s_and_b32 s2, 1, s3 ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX11_W32-NEXT: s_waitcnt vmcnt(0) ; GFX11_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index d62454c8bd0b60..0e3f6f5a21444e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -188,30 +188,33 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: +; GFX1013-NEXT: v_mov_b32_e32 v16, v13 +; GFX1013-NEXT: v_mov_b32_e32 v17, v14 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_readfirstlane_b32 s5, v12 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v13 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v14 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v16 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v17 ; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1013-NEXT: image_bvh_intersect_ray v[15:18], v[0:10], s[4:7] +; GFX1013-NEXT: image_bvh_intersect_ray v[12:15], v[0:10], s[4:7] ; GFX1013-NEXT: ; implicit-def: $vgpr11 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX1013-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB6_1 ; GFX1013-NEXT: ; %bb.2: ; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, v15 -; GFX1013-NEXT: v_mov_b32_e32 v1, v16 -; GFX1013-NEXT: v_mov_b32_e32 v2, v17 -; GFX1013-NEXT: v_mov_b32_e32 v3, v18 +; GFX1013-NEXT: v_mov_b32_e32 v0, v12 +; GFX1013-NEXT: v_mov_b32_e32 v1, v13 +; GFX1013-NEXT: v_mov_b32_e32 v2, v14 +; GFX1013-NEXT: v_mov_b32_e32 v3, v15 ; GFX1013-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr: @@ -419,30 +422,33 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: +; GFX1013-NEXT: v_mov_b32_e32 v17, v14 +; GFX1013-NEXT: v_mov_b32_e32 v18, v15 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_readfirstlane_b32 s5, v13 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v14 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v15 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v17 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v18 ; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7] +; 
GFX1013-NEXT: image_bvh64_intersect_ray v[13:16], v[0:11], s[4:7] ; GFX1013-NEXT: ; implicit-def: $vgpr12 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX1013-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB8_1 ; GFX1013-NEXT: ; %bb.2: ; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, v16 -; GFX1013-NEXT: v_mov_b32_e32 v1, v17 -; GFX1013-NEXT: v_mov_b32_e32 v2, v18 -; GFX1013-NEXT: v_mov_b32_e32 v3, v19 +; GFX1013-NEXT: v_mov_b32_e32 v0, v13 +; GFX1013-NEXT: v_mov_b32_e32 v1, v14 +; GFX1013-NEXT: v_mov_b32_e32 v2, v15 +; GFX1013-NEXT: v_mov_b32_e32 v3, v16 ; GFX1013-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: image_bvh64_intersect_ray_vgpr_descr: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index eb3f74be71de01..0255a77aa0ffd1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -838,165 +838,165 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: v_readfirstlane_b32 s21, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_readfirstlane_b32 s23, v1 +; GFX7-NEXT: v_readfirstlane_b32 s19, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX7-NEXT: v_mul_hi_u32 v3, v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s11 ; GFX7-NEXT: s_mul_i32 s18, s16, s10 +; GFX7-NEXT: v_readfirstlane_b32 s24, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: v_readfirstlane_b32 s22, v3 +; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX7-NEXT: s_mul_i32 s20, s1, s9 -; GFX7-NEXT: v_readfirstlane_b32 s19, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4 ; GFX7-NEXT: s_add_u32 s18, s20, s18 +; GFX7-NEXT: v_readfirstlane_b32 s25, v3 +; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10 ; GFX7-NEXT: s_addc_u32 s19, s21, s19 ; GFX7-NEXT: s_mul_i32 s21, s2, s8 -; GFX7-NEXT: v_readfirstlane_b32 s23, v1 -; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX7-NEXT: s_cselect_b32 s20, 1, 0 -; GFX7-NEXT: v_readfirstlane_b32 s22, v3 ; GFX7-NEXT: s_add_u32 s18, s21, s18 -; GFX7-NEXT: s_addc_u32 s19, s22, s19 -; GFX7-NEXT: s_mul_i32 s22, s16, s9 -; GFX7-NEXT: s_cselect_b32 s21, 1, 0 -; GFX7-NEXT: s_add_u32 s17, s22, s17 -; GFX7-NEXT: s_addc_u32 s22, s23, s18 -; GFX7-NEXT: v_readfirstlane_b32 s23, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s12 -; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 -; GFX7-NEXT: s_mul_i32 s18, s1, s8 -; GFX7-NEXT: s_cselect_b32 s25, 1, 0 -; GFX7-NEXT: s_add_u32 s18, s18, s17 -; GFX7-NEXT: s_addc_u32 s17, s23, s22 -; GFX7-NEXT: v_mov_b32_e32 v4, s11 -; GFX7-NEXT: v_readfirstlane_b32 s23, v3 -; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10 -; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4 -; GFX7-NEXT: s_mul_i32 s22, s16, s12 -; GFX7-NEXT: s_mul_i32 s24, s1, s11 ; GFX7-NEXT: v_readfirstlane_b32 s28, v3 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_addc_u32 s19, s22, s19 +; GFX7-NEXT: s_mul_i32 s22, s16, s9 ; GFX7-NEXT: v_readfirstlane_b32 s27, v5 ; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9 -; GFX7-NEXT: s_cselect_b32 s26, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s27, s23 +; GFX7-NEXT: s_cselect_b32 s21, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s22, s17 +; GFX7-NEXT: s_addc_u32 s18, s23, s18 +; 
GFX7-NEXT: s_mul_i32 s23, s1, s8 +; GFX7-NEXT: s_cselect_b32 s22, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s23, s17 +; GFX7-NEXT: s_addc_u32 s18, s24, s18 +; GFX7-NEXT: s_mul_i32 s24, s16, s12 +; GFX7-NEXT: s_mul_i32 s26, s1, s11 ; GFX7-NEXT: v_readfirstlane_b32 s29, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: s_cselect_b32 s23, 1, 0 +; GFX7-NEXT: s_add_u32 s24, s26, s24 ; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8 +; GFX7-NEXT: s_addc_u32 s25, s27, s25 ; GFX7-NEXT: s_mul_i32 s27, s2, s10 -; GFX7-NEXT: s_cselect_b32 s22, 1, 0 +; GFX7-NEXT: s_cselect_b32 s26, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s27, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX7-NEXT: s_addc_u32 s27, s28, s23 +; GFX7-NEXT: s_addc_u32 s25, s28, s25 ; GFX7-NEXT: s_mul_i32 s28, s3, s9 -; GFX7-NEXT: s_cselect_b32 s23, 1, 0 -; GFX7-NEXT: s_add_u32 s28, s28, s24 +; GFX7-NEXT: s_cselect_b32 s27, 1, 0 +; GFX7-NEXT: s_add_u32 s24, s28, s24 ; GFX7-NEXT: v_readfirstlane_b32 s30, v6 ; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX7-NEXT: s_addc_u32 s27, s29, s27 +; GFX7-NEXT: s_addc_u32 s25, s29, s25 ; GFX7-NEXT: s_mul_i32 s29, s4, s8 -; GFX7-NEXT: s_cselect_b32 s24, 1, 0 -; GFX7-NEXT: s_add_u32 s28, s29, s28 +; GFX7-NEXT: s_cselect_b32 s28, 1, 0 +; GFX7-NEXT: s_add_u32 s24, s29, s24 ; GFX7-NEXT: v_readfirstlane_b32 s33, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX7-NEXT: s_addc_u32 s27, s30, s27 +; GFX7-NEXT: s_addc_u32 s25, s30, s25 ; GFX7-NEXT: s_mul_i32 s30, s16, s11 ; GFX7-NEXT: s_cselect_b32 s29, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s31, v6 ; GFX7-NEXT: s_add_u32 s19, s30, s19 -; GFX7-NEXT: s_addc_u32 s28, s31, s28 +; GFX7-NEXT: s_addc_u32 s24, s31, s24 ; GFX7-NEXT: s_mul_i32 s31, s1, s10 ; GFX7-NEXT: s_cselect_b32 s30, 1, 0 ; GFX7-NEXT: s_add_u32 s19, s31, s19 ; GFX7-NEXT: v_readfirstlane_b32 s34, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s8 -; GFX7-NEXT: s_addc_u32 s28, s33, s28 +; GFX7-NEXT: s_addc_u32 s24, s33, s24 ; GFX7-NEXT: s_mul_i32 s33, s2, s9 ; GFX7-NEXT: s_cselect_b32 s31, 1, 0 ; GFX7-NEXT: s_add_u32 s19, s33, s19 -; GFX7-NEXT: s_addc_u32 s28, s34, s28 +; GFX7-NEXT: s_addc_u32 s24, s34, s24 ; GFX7-NEXT: s_mul_i32 s34, s3, s8 ; GFX7-NEXT: s_cselect_b32 s33, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: s_add_u32 s19, s34, s19 ; GFX7-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-NEXT: s_addc_u32 s28, s35, s28 +; GFX7-NEXT: s_addc_u32 s24, s35, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_cselect_b32 s34, 1, 0 -; GFX7-NEXT: s_cmp_lg_u32 s26, 0 -; GFX7-NEXT: s_addc_u32 s19, s25, s19 +; GFX7-NEXT: s_cmp_lg_u32 s23, 0 +; GFX7-NEXT: s_addc_u32 s19, s22, s19 ; GFX7-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-NEXT: s_cselect_b32 s25, 1, 0 +; GFX7-NEXT: s_cselect_b32 s22, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 ; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX7-NEXT: s_addc_u32 s20, s20, 0 -; GFX7-NEXT: v_readfirstlane_b32 s26, v0 +; GFX7-NEXT: v_readfirstlane_b32 s23, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX7-NEXT: s_cmp_lg_u32 s25, 0 -; GFX7-NEXT: s_addc_u32 s20, s20, s28 -; GFX7-NEXT: s_mul_i32 s25, s16, s14 -; GFX7-NEXT: s_mul_i32 s28, s1, s13 +; GFX7-NEXT: s_cmp_lg_u32 s22, 0 +; GFX7-NEXT: s_addc_u32 s20, s20, s24 +; GFX7-NEXT: s_mul_i32 s22, s16, s14 +; GFX7-NEXT: s_mul_i32 s24, s1, s13 ; GFX7-NEXT: s_cselect_b32 s21, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_add_u32 s22, s24, s22 +; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11 -; GFX7-NEXT: s_mul_i32 s28, s2, s12 -; 
GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_mul_i32 s24, s2, s12 +; GFX7-NEXT: s_add_u32 s22, s24, s22 +; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10 -; GFX7-NEXT: s_mul_i32 s28, s3, s11 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_mul_i32 s24, s3, s11 +; GFX7-NEXT: s_add_u32 s22, s24, s22 +; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9 -; GFX7-NEXT: s_mul_i32 s28, s4, s10 -; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_mul_i32 s24, s4, s10 +; GFX7-NEXT: s_add_u32 s22, s24, s22 ; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8 -; GFX7-NEXT: s_mul_i32 s28, s5, s9 -; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_mul_i32 s24, s5, s9 +; GFX7-NEXT: s_add_u32 s22, s24, s22 ; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX7-NEXT: v_readfirstlane_b32 s36, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 -; GFX7-NEXT: s_mul_i32 s28, s6, s8 +; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_mul_i32 s24, s6, s8 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 -; GFX7-NEXT: s_mul_i32 s28, s16, s13 +; GFX7-NEXT: s_add_u32 s22, s24, s22 +; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_mul_i32 s24, s16, s13 ; GFX7-NEXT: v_readfirstlane_b32 s35, v2 -; GFX7-NEXT: s_add_u32 s27, s28, s27 +; GFX7-NEXT: s_add_u32 s24, s24, s25 ; GFX7-NEXT: v_readfirstlane_b32 s37, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX7-NEXT: s_addc_u32 s25, s35, s25 +; GFX7-NEXT: s_addc_u32 s22, s35, s22 ; GFX7-NEXT: s_mul_i32 s35, s1, s12 -; GFX7-NEXT: s_cselect_b32 s28, 1, 0 -; GFX7-NEXT: s_add_u32 s27, s35, s27 -; GFX7-NEXT: s_addc_u32 s25, s36, s25 +; GFX7-NEXT: s_cselect_b32 s25, 1, 0 +; GFX7-NEXT: s_add_u32 s24, s35, s24 +; GFX7-NEXT: s_addc_u32 s22, s36, s22 ; GFX7-NEXT: s_mul_i32 s36, s2, s11 ; GFX7-NEXT: s_cselect_b32 s35, 1, 0 -; GFX7-NEXT: s_add_u32 s27, s36, s27 +; GFX7-NEXT: s_add_u32 s24, s36, s24 ; GFX7-NEXT: v_readfirstlane_b32 s38, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX7-NEXT: s_addc_u32 s25, s37, s25 +; GFX7-NEXT: s_addc_u32 s22, s37, s22 ; GFX7-NEXT: s_mul_i32 s37, s3, s10 ; GFX7-NEXT: s_cselect_b32 s36, 1, 0 -; GFX7-NEXT: s_add_u32 s27, s37, s27 +; GFX7-NEXT: s_add_u32 s24, s37, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX7-NEXT: s_addc_u32 s25, s38, s25 +; GFX7-NEXT: s_addc_u32 s22, s38, s22 ; GFX7-NEXT: s_mul_i32 s38, s4, s9 ; GFX7-NEXT: s_cselect_b32 s37, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s39, v1 -; GFX7-NEXT: s_add_u32 s27, s38, s27 -; GFX7-NEXT: s_addc_u32 s25, s39, s25 +; GFX7-NEXT: s_add_u32 s24, s38, s24 +; GFX7-NEXT: s_addc_u32 s22, s39, s22 ; GFX7-NEXT: s_mul_i32 s39, s5, s8 ; GFX7-NEXT: s_cselect_b32 s38, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s40, v0 -; GFX7-NEXT: s_add_u32 s27, s39, s27 -; GFX7-NEXT: s_addc_u32 s25, s40, s25 +; GFX7-NEXT: s_add_u32 s24, s39, s24 +; GFX7-NEXT: s_addc_u32 s22, s40, s22 ; GFX7-NEXT: s_cselect_b32 s39, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s31, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 @@ -1005,18 +1005,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cmp_lg_u32 s34, 0 ; 
GFX7-NEXT: s_addc_u32 s30, s30, 0 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 -; GFX7-NEXT: s_addc_u32 s21, s30, s27 -; GFX7-NEXT: s_cselect_b32 s27, 1, 0 -; GFX7-NEXT: s_cmp_lg_u32 s23, 0 -; GFX7-NEXT: s_addc_u32 s22, s22, 0 -; GFX7-NEXT: s_cmp_lg_u32 s24, 0 -; GFX7-NEXT: s_addc_u32 s22, s22, 0 -; GFX7-NEXT: s_cmp_lg_u32 s29, 0 -; GFX7-NEXT: s_addc_u32 s22, s22, 0 +; GFX7-NEXT: s_addc_u32 s21, s30, s24 +; GFX7-NEXT: s_cselect_b32 s24, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s27, 0 -; GFX7-NEXT: s_addc_u32 s22, s22, s25 +; GFX7-NEXT: s_addc_u32 s26, s26, 0 +; GFX7-NEXT: s_cmp_lg_u32 s28, 0 +; GFX7-NEXT: s_addc_u32 s26, s26, 0 +; GFX7-NEXT: s_cmp_lg_u32 s29, 0 +; GFX7-NEXT: s_addc_u32 s26, s26, 0 +; GFX7-NEXT: s_cmp_lg_u32 s24, 0 +; GFX7-NEXT: s_addc_u32 s22, s26, s22 ; GFX7-NEXT: s_mul_i32 s16, s16, s15 -; GFX7-NEXT: s_addc_u32 s15, s26, s16 +; GFX7-NEXT: s_addc_u32 s15, s23, s16 ; GFX7-NEXT: s_mul_i32 s1, s1, s14 ; GFX7-NEXT: s_cmp_lg_u32 s39, 0 ; GFX7-NEXT: s_addc_u32 s1, s15, s1 @@ -1033,13 +1033,13 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cmp_lg_u32 s35, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_mul_i32 s6, s6, s9 -; GFX7-NEXT: s_cmp_lg_u32 s28, 0 +; GFX7-NEXT: s_cmp_lg_u32 s25, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s6 ; GFX7-NEXT: s_mul_i32 s7, s7, s8 ; GFX7-NEXT: s_mul_i32 s0, s0, s8 ; GFX7-NEXT: s_add_u32 s7, s7, s1 -; GFX7-NEXT: s_mov_b32 s1, s18 -; GFX7-NEXT: s_mov_b32 s2, s17 +; GFX7-NEXT: s_mov_b32 s1, s17 +; GFX7-NEXT: s_mov_b32 s2, s18 ; GFX7-NEXT: s_mov_b32 s3, s19 ; GFX7-NEXT: s_mov_b32 s4, s20 ; GFX7-NEXT: s_mov_b32 s5, s21 @@ -1059,165 +1059,165 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: v_readfirstlane_b32 s21, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_readfirstlane_b32 s23, v1 +; GFX8-NEXT: v_readfirstlane_b32 s19, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s11 ; GFX8-NEXT: s_mul_i32 s18, s16, s10 +; GFX8-NEXT: v_readfirstlane_b32 s24, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_readfirstlane_b32 s22, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX8-NEXT: s_mul_i32 s20, s1, s9 -; GFX8-NEXT: v_readfirstlane_b32 s19, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4 ; GFX8-NEXT: s_add_u32 s18, s20, s18 +; GFX8-NEXT: v_readfirstlane_b32 s25, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10 ; GFX8-NEXT: s_addc_u32 s19, s21, s19 ; GFX8-NEXT: s_mul_i32 s21, s2, s8 -; GFX8-NEXT: v_readfirstlane_b32 s23, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: v_readfirstlane_b32 s22, v3 ; GFX8-NEXT: s_add_u32 s18, s21, s18 -; GFX8-NEXT: s_addc_u32 s19, s22, s19 -; GFX8-NEXT: s_mul_i32 s22, s16, s9 -; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_add_u32 s17, s22, s17 -; GFX8-NEXT: s_addc_u32 s22, s23, s18 -; GFX8-NEXT: v_readfirstlane_b32 s23, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s12 -; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 -; GFX8-NEXT: s_mul_i32 s18, s1, s8 -; GFX8-NEXT: s_cselect_b32 s25, 1, 0 -; GFX8-NEXT: s_add_u32 s18, s18, s17 -; GFX8-NEXT: s_addc_u32 s17, s23, s22 -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: v_readfirstlane_b32 s23, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10 -; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4 -; GFX8-NEXT: s_mul_i32 s22, s16, s12 -; GFX8-NEXT: s_mul_i32 s24, s1, s11 ; GFX8-NEXT: v_readfirstlane_b32 s28, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; 
GFX8-NEXT: s_addc_u32 s19, s22, s19 +; GFX8-NEXT: s_mul_i32 s22, s16, s9 ; GFX8-NEXT: v_readfirstlane_b32 s27, v5 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9 -; GFX8-NEXT: s_cselect_b32 s26, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s24, s22 -; GFX8-NEXT: s_addc_u32 s23, s27, s23 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 +; GFX8-NEXT: s_add_u32 s17, s22, s17 +; GFX8-NEXT: s_addc_u32 s18, s23, s18 +; GFX8-NEXT: s_mul_i32 s23, s1, s8 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_add_u32 s17, s23, s17 +; GFX8-NEXT: s_addc_u32 s18, s24, s18 +; GFX8-NEXT: s_mul_i32 s24, s16, s12 +; GFX8-NEXT: s_mul_i32 s26, s1, s11 ; GFX8-NEXT: v_readfirstlane_b32 s29, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: s_cselect_b32 s23, 1, 0 +; GFX8-NEXT: s_add_u32 s24, s26, s24 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8 +; GFX8-NEXT: s_addc_u32 s25, s27, s25 ; GFX8-NEXT: s_mul_i32 s27, s2, s10 -; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s27, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX8-NEXT: s_addc_u32 s27, s28, s23 +; GFX8-NEXT: s_addc_u32 s25, s28, s25 ; GFX8-NEXT: s_mul_i32 s28, s3, s9 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_add_u32 s28, s28, s24 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_add_u32 s24, s28, s24 ; GFX8-NEXT: v_readfirstlane_b32 s30, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX8-NEXT: s_addc_u32 s27, s29, s27 +; GFX8-NEXT: s_addc_u32 s25, s29, s25 ; GFX8-NEXT: s_mul_i32 s29, s4, s8 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 -; GFX8-NEXT: s_add_u32 s28, s29, s28 +; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_add_u32 s24, s29, s24 ; GFX8-NEXT: v_readfirstlane_b32 s33, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX8-NEXT: s_addc_u32 s27, s30, s27 +; GFX8-NEXT: s_addc_u32 s25, s30, s25 ; GFX8-NEXT: s_mul_i32 s30, s16, s11 ; GFX8-NEXT: s_cselect_b32 s29, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s31, v6 ; GFX8-NEXT: s_add_u32 s19, s30, s19 -; GFX8-NEXT: s_addc_u32 s28, s31, s28 +; GFX8-NEXT: s_addc_u32 s24, s31, s24 ; GFX8-NEXT: s_mul_i32 s31, s1, s10 ; GFX8-NEXT: s_cselect_b32 s30, 1, 0 ; GFX8-NEXT: s_add_u32 s19, s31, s19 ; GFX8-NEXT: v_readfirstlane_b32 s34, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s8 -; GFX8-NEXT: s_addc_u32 s28, s33, s28 +; GFX8-NEXT: s_addc_u32 s24, s33, s24 ; GFX8-NEXT: s_mul_i32 s33, s2, s9 ; GFX8-NEXT: s_cselect_b32 s31, 1, 0 ; GFX8-NEXT: s_add_u32 s19, s33, s19 -; GFX8-NEXT: s_addc_u32 s28, s34, s28 +; GFX8-NEXT: s_addc_u32 s24, s34, s24 ; GFX8-NEXT: s_mul_i32 s34, s3, s8 ; GFX8-NEXT: s_cselect_b32 s33, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: s_add_u32 s19, s34, s19 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: s_addc_u32 s28, s35, s28 +; GFX8-NEXT: s_addc_u32 s24, s35, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_cselect_b32 s34, 1, 0 -; GFX8-NEXT: s_cmp_lg_u32 s26, 0 -; GFX8-NEXT: s_addc_u32 s19, s25, s19 +; GFX8-NEXT: s_cmp_lg_u32 s23, 0 +; GFX8-NEXT: s_addc_u32 s19, s22, s19 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 -; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 ; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX8-NEXT: s_addc_u32 s20, s20, 0 -; GFX8-NEXT: v_readfirstlane_b32 s26, v0 +; GFX8-NEXT: v_readfirstlane_b32 s23, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX8-NEXT: s_cmp_lg_u32 s25, 0 -; GFX8-NEXT: s_addc_u32 s20, s20, s28 -; GFX8-NEXT: s_mul_i32 s25, s16, s14 -; GFX8-NEXT: s_mul_i32 s28, s1, s13 +; GFX8-NEXT: s_cmp_lg_u32 s22, 0 +; GFX8-NEXT: s_addc_u32 s20, s20, s24 +; GFX8-NEXT: s_mul_i32 s22, s16, s14 +; 
GFX8-NEXT: s_mul_i32 s24, s1, s13 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_add_u32 s22, s24, s22 +; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11 -; GFX8-NEXT: s_mul_i32 s28, s2, s12 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_mul_i32 s24, s2, s12 +; GFX8-NEXT: s_add_u32 s22, s24, s22 +; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10 -; GFX8-NEXT: s_mul_i32 s28, s3, s11 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_mul_i32 s24, s3, s11 +; GFX8-NEXT: s_add_u32 s22, s24, s22 +; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9 -; GFX8-NEXT: s_mul_i32 s28, s4, s10 -; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_mul_i32 s24, s4, s10 +; GFX8-NEXT: s_add_u32 s22, s24, s22 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8 -; GFX8-NEXT: s_mul_i32 s28, s5, s9 -; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_mul_i32 s24, s5, s9 +; GFX8-NEXT: s_add_u32 s22, s24, s22 ; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX8-NEXT: v_readfirstlane_b32 s36, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 -; GFX8-NEXT: s_mul_i32 s28, s6, s8 +; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_mul_i32 s24, s6, s8 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 -; GFX8-NEXT: s_mul_i32 s28, s16, s13 +; GFX8-NEXT: s_add_u32 s22, s24, s22 +; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_mul_i32 s24, s16, s13 ; GFX8-NEXT: v_readfirstlane_b32 s35, v2 -; GFX8-NEXT: s_add_u32 s27, s28, s27 +; GFX8-NEXT: s_add_u32 s24, s24, s25 ; GFX8-NEXT: v_readfirstlane_b32 s37, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX8-NEXT: s_addc_u32 s25, s35, s25 +; GFX8-NEXT: s_addc_u32 s22, s35, s22 ; GFX8-NEXT: s_mul_i32 s35, s1, s12 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_add_u32 s27, s35, s27 -; GFX8-NEXT: s_addc_u32 s25, s36, s25 +; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_add_u32 s24, s35, s24 +; GFX8-NEXT: s_addc_u32 s22, s36, s22 ; GFX8-NEXT: s_mul_i32 s36, s2, s11 ; GFX8-NEXT: s_cselect_b32 s35, 1, 0 -; GFX8-NEXT: s_add_u32 s27, s36, s27 +; GFX8-NEXT: s_add_u32 s24, s36, s24 ; GFX8-NEXT: v_readfirstlane_b32 s38, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX8-NEXT: s_addc_u32 s25, s37, s25 +; GFX8-NEXT: s_addc_u32 s22, s37, s22 ; GFX8-NEXT: s_mul_i32 s37, s3, s10 ; GFX8-NEXT: s_cselect_b32 s36, 1, 0 -; GFX8-NEXT: s_add_u32 s27, s37, s27 +; GFX8-NEXT: s_add_u32 s24, s37, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX8-NEXT: s_addc_u32 s25, s38, s25 +; GFX8-NEXT: s_addc_u32 s22, s38, s22 ; GFX8-NEXT: s_mul_i32 s38, s4, s9 ; GFX8-NEXT: s_cselect_b32 s37, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s39, v1 -; GFX8-NEXT: s_add_u32 s27, s38, s27 -; GFX8-NEXT: s_addc_u32 s25, s39, s25 +; GFX8-NEXT: s_add_u32 s24, s38, s24 +; GFX8-NEXT: s_addc_u32 s22, s39, s22 ; GFX8-NEXT: s_mul_i32 s39, s5, s8 ; GFX8-NEXT: s_cselect_b32 s38, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s40, v0 -; 
GFX8-NEXT: s_add_u32 s27, s39, s27 -; GFX8-NEXT: s_addc_u32 s25, s40, s25 +; GFX8-NEXT: s_add_u32 s24, s39, s24 +; GFX8-NEXT: s_addc_u32 s22, s40, s22 ; GFX8-NEXT: s_cselect_b32 s39, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s31, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 @@ -1226,18 +1226,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cmp_lg_u32 s34, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_addc_u32 s21, s30, s27 -; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_addc_u32 s22, s22, 0 -; GFX8-NEXT: s_cmp_lg_u32 s24, 0 -; GFX8-NEXT: s_addc_u32 s22, s22, 0 -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 -; GFX8-NEXT: s_addc_u32 s22, s22, 0 +; GFX8-NEXT: s_addc_u32 s21, s30, s24 +; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 -; GFX8-NEXT: s_addc_u32 s22, s22, s25 +; GFX8-NEXT: s_addc_u32 s26, s26, 0 +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_addc_u32 s26, s26, 0 +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_addc_u32 s26, s26, 0 +; GFX8-NEXT: s_cmp_lg_u32 s24, 0 +; GFX8-NEXT: s_addc_u32 s22, s26, s22 ; GFX8-NEXT: s_mul_i32 s16, s16, s15 -; GFX8-NEXT: s_addc_u32 s15, s26, s16 +; GFX8-NEXT: s_addc_u32 s15, s23, s16 ; GFX8-NEXT: s_mul_i32 s1, s1, s14 ; GFX8-NEXT: s_cmp_lg_u32 s39, 0 ; GFX8-NEXT: s_addc_u32 s1, s15, s1 @@ -1254,13 +1254,13 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cmp_lg_u32 s35, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_mul_i32 s6, s6, s9 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cmp_lg_u32 s25, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s6 ; GFX8-NEXT: s_mul_i32 s7, s7, s8 ; GFX8-NEXT: s_mul_i32 s0, s0, s8 ; GFX8-NEXT: s_add_u32 s7, s7, s1 -; GFX8-NEXT: s_mov_b32 s1, s18 -; GFX8-NEXT: s_mov_b32 s2, s17 +; GFX8-NEXT: s_mov_b32 s1, s17 +; GFX8-NEXT: s_mov_b32 s2, s18 ; GFX8-NEXT: s_mov_b32 s3, s19 ; GFX8-NEXT: s_mov_b32 s4, s20 ; GFX8-NEXT: s_mov_b32 s5, s21 @@ -1269,10 +1269,9 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; ; GFX9-LABEL: s_mul_i256: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s16, s0 -; GFX9-NEXT: s_mul_i32 s18, s16, s10 +; GFX9-NEXT: s_mul_i32 s18, s0, s10 ; GFX9-NEXT: s_mul_i32 s20, s1, s9 -; GFX9-NEXT: s_mul_hi_u32 s19, s16, s10 +; GFX9-NEXT: s_mul_hi_u32 s19, s0, s10 ; GFX9-NEXT: s_mul_hi_u32 s21, s1, s9 ; GFX9-NEXT: s_add_u32 s18, s20, s18 ; GFX9-NEXT: s_addc_u32 s19, s21, s19 @@ -1280,11 +1279,11 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s22, s2, s8 ; GFX9-NEXT: s_add_u32 s18, s21, s18 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s8 +; GFX9-NEXT: s_mul_hi_u32 s17, s0, s8 ; GFX9-NEXT: s_addc_u32 s19, s22, s19 -; GFX9-NEXT: s_mul_i32 s22, s16, s9 +; GFX9-NEXT: s_mul_i32 s22, s0, s9 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9 +; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9 ; GFX9-NEXT: s_add_u32 s17, s22, s17 ; GFX9-NEXT: s_addc_u32 s18, s23, s18 ; GFX9-NEXT: s_mul_i32 s23, s1, s8 @@ -1292,10 +1291,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8 ; GFX9-NEXT: s_add_u32 s17, s23, s17 ; GFX9-NEXT: s_addc_u32 s18, s24, s18 -; GFX9-NEXT: s_mul_i32 s24, s16, s12 +; GFX9-NEXT: s_mul_i32 s24, s0, s12 ; GFX9-NEXT: s_mul_i32 s26, s1, s11 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12 +; GFX9-NEXT: s_mul_hi_u32 s25, s0, s12 ; GFX9-NEXT: 
s_mul_hi_u32 s27, s1, s11 ; GFX9-NEXT: s_add_u32 s24, s26, s24 ; GFX9-NEXT: s_addc_u32 s25, s27, s25 @@ -1314,9 +1313,9 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_hi_u32 s30, s4, s8 ; GFX9-NEXT: s_add_u32 s24, s29, s24 ; GFX9-NEXT: s_addc_u32 s25, s30, s25 -; GFX9-NEXT: s_mul_i32 s30, s16, s11 +; GFX9-NEXT: s_mul_i32 s30, s0, s11 ; GFX9-NEXT: s_cselect_b32 s29, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s31, s16, s11 +; GFX9-NEXT: s_mul_hi_u32 s31, s0, s11 ; GFX9-NEXT: s_add_u32 s19, s30, s19 ; GFX9-NEXT: s_addc_u32 s24, s31, s24 ; GFX9-NEXT: s_mul_i32 s31, s1, s10 @@ -1342,10 +1341,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_addc_u32 s20, s20, 0 ; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_addc_u32 s20, s20, s24 -; GFX9-NEXT: s_mul_i32 s22, s16, s14 +; GFX9-NEXT: s_mul_i32 s22, s0, s14 ; GFX9-NEXT: s_mul_i32 s24, s1, s13 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s23, s16, s14 +; GFX9-NEXT: s_mul_hi_u32 s23, s0, s14 ; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 @@ -1369,8 +1368,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_hi_u32 s35, s6, s8 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 -; GFX9-NEXT: s_mul_i32 s24, s16, s13 -; GFX9-NEXT: s_mul_hi_u32 s35, s16, s13 +; GFX9-NEXT: s_mul_i32 s24, s0, s13 +; GFX9-NEXT: s_mul_hi_u32 s35, s0, s13 ; GFX9-NEXT: s_add_u32 s24, s24, s25 ; GFX9-NEXT: s_addc_u32 s22, s35, s22 ; GFX9-NEXT: s_mul_i32 s35, s1, s12 @@ -1415,30 +1414,31 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_cmp_lg_u32 s29, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_mul_i32 s16, s0, s8 ; GFX9-NEXT: s_addc_u32 s22, s26, s22 -; GFX9-NEXT: s_mul_i32 s16, s16, s15 -; GFX9-NEXT: s_addc_u32 s15, s23, s16 +; GFX9-NEXT: s_mul_i32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s0, s23, s0 ; GFX9-NEXT: s_mul_i32 s1, s1, s14 ; GFX9-NEXT: s_cmp_lg_u32 s39, 0 -; GFX9-NEXT: s_addc_u32 s1, s15, s1 +; GFX9-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-NEXT: s_mul_i32 s2, s2, s13 ; GFX9-NEXT: s_cmp_lg_u32 s38, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, s2 +; GFX9-NEXT: s_addc_u32 s0, s0, s2 ; GFX9-NEXT: s_mul_i32 s3, s3, s12 ; GFX9-NEXT: s_cmp_lg_u32 s37, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: s_addc_u32 s0, s0, s3 ; GFX9-NEXT: s_mul_i32 s4, s4, s11 ; GFX9-NEXT: s_cmp_lg_u32 s36, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, s4 +; GFX9-NEXT: s_addc_u32 s0, s0, s4 ; GFX9-NEXT: s_mul_i32 s5, s5, s10 ; GFX9-NEXT: s_cmp_lg_u32 s35, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_addc_u32 s0, s0, s5 ; GFX9-NEXT: s_mul_i32 s6, s6, s9 ; GFX9-NEXT: s_cmp_lg_u32 s25, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, s6 +; GFX9-NEXT: s_addc_u32 s0, s0, s6 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 -; GFX9-NEXT: s_mul_i32 s0, s0, s8 -; GFX9-NEXT: s_add_u32 s7, s7, s1 +; GFX9-NEXT: s_add_u32 s7, s7, s0 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s3, s19 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 4248f7b6a15831..ccfb88284b8843 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -26,133 +26,131 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: v_ashrrev_i32_e32 
v0, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v2, v1, v0 -; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v8, v6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v6, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v7, v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v7 +; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 +; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v7, vcc +; CHECK-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; CHECK-NEXT: v_trunc_f32_e32 v3, v2 +; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v11, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v12, v8, v1 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v8, v[2:3] +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v11, v1 +; CHECK-NEXT: v_mul_lo_u32 v13, v8, v2 +; CHECK-NEXT: v_mul_lo_u32 v14, v11, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mul_hi_u32 v12, v8, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v1 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v2, vcc +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v11, v[2:3] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 +; CHECK-NEXT: v_mul_hi_u32 v12, v8, v1 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v8, v[2:3] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 
v5, v3, v9 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v1 +; CHECK-NEXT: v_mul_lo_u32 v10, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v1, v11, v1 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v12, v11, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v10, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v10, v5, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v11, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 -; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 -; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 +; CHECK-NEXT: v_mul_lo_u32 v10, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v1, v3 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v10, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v11, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[2:3] +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, v[2:3] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v4, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 +; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v6 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v10 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc ; CHECK-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -381,264 +379,262 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v4, v8 +; GISEL-NEXT: v_xor_b32_e32 v9, v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v12 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v9 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v12 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v10, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v10 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v10 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v16, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v4 +; GISEL-NEXT: v_mul_hi_u32 v17, v13, v4 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; 
GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v17, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v17, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v5, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v16, v[5:6] +; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[10:11] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v0, v14 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v10 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v14 +; GISEL-NEXT: v_mul_hi_u32 v1, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v1 
-; GISEL-NEXT: v_mul_hi_u32 v13, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v13, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v14, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v11, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v11, v12, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v6 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v13, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v10, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v13, v[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v15, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v9 
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v4, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, v10, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v1, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v4 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v1, v4 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v13 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v12 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v18, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 ; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v6 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v6, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v15, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v18, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v21, v19, v[10:11] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v1, vcc +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v20, v18, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v15 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v21, v19, v[9:10] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v19, v10 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v19, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; 
GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v18, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v19, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v10, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_xor_b32_e32 v1, v4, v13 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v2, v14 -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v14 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v18, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v13, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v14, v8 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v20, v10, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v21, v9, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v2, v10, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 ; GISEL-NEXT: v_add_i32_e32 v0, 
vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; GISEL-NEXT: v_xor_b32_e32 v9, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v15, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 +; GISEL-NEXT: v_mul_hi_u32 v8, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v8, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v12, v13 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v12, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v13 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v5, v10, v[7:8] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, 
v9, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -662,128 +658,128 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v5, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc -; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v5, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v5, v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v5 +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc +; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 +; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; CGP-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 
+; CGP-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; CGP-NEXT: v_trunc_f32_e32 v3, v2 +; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v1 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v3 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v15, v[2:3] +; CGP-NEXT: v_mul_hi_u32 v16, v12, v1 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v12, v[2:3] +; CGP-NEXT: v_mul_lo_u32 v3, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v15, v1 +; CGP-NEXT: v_mul_lo_u32 v17, v12, v2 +; CGP-NEXT: v_mul_lo_u32 v18, v15, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v12, v2 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v17, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v18, v1 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v2, v15, v2 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v1 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v15, v[2:3] ; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v12, v1 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v12, v[2:3] +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v13 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 +; CGP-NEXT: v_xor_b32_e32 v11, v3, v13 +; CGP-NEXT: v_mul_lo_u32 v3, v15, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v12, v2 +; CGP-NEXT: v_mul_hi_u32 v1, v15, v1 ; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v16, v15, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v14, v12, v2 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: 
v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v2, v15, v2 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v3, v10, v1 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v2 +; CGP-NEXT: v_mul_hi_u32 v14, v11, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v10, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v11, v2 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v1, v3 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v14, 0 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v12, v[2:3] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v14, v[2:3] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v10, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 +; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CGP-NEXT: v_cndmask_b32_e64 v3, v10, 
v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v4, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 ; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc @@ -835,128 +831,128 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: .LBB2_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v3, v2 -; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v7, v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v7 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v6 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v7, vcc +; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CGP-NEXT: v_trunc_f32_e32 v5, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v13, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; 
CGP-NEXT: v_mul_lo_u32 v15, v10, v4 +; CGP-NEXT: v_mul_lo_u32 v16, v13, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v4 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v15, v5 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v3 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v13, v[4:5] ; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[4:5] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v11 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 +; CGP-NEXT: v_xor_b32_e32 v9, v5, v11 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 ; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v13, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v13, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v8, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v3, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, 0 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v10, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v12, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v8, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v7, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7 +; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 ; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 
vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 ; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc @@ -1131,173 +1127,173 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 ; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_trunc_f32_e32 v6, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v9, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v6, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5] +; 
GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v4 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[6:7] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v8 +; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v10, v6 ; GISEL-NEXT: v_xor_b32_e32 v12, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v1, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v9, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0x1000 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, 
v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v5, v11, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] ; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v12, v7 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v7, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, 0x1000 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x1000 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v10, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v8, s[4:5] +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v6 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; 
GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[6:7] +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 ; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v0 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v10, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v8, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v8 +; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 ; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v9, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -1305,7 +1301,7 @@ define <2 x i64> 
@v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 @@ -1316,13 +1312,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v9, v10, v4 +; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1344,7 +1340,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc @@ -1370,10 +1366,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: @@ -1381,8 +1377,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s7, 0xf000 -; CGP-NEXT: s_movk_i32 s6, 0x1000 +; CGP-NEXT: s_movk_i32 s6, 0xf000 +; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1393,7 +1389,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -1417,11 +1413,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, 
v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -1468,11 +1464,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v1, v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v10, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v11, v6, vcc @@ -1498,10 +1494,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v15, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v11 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc @@ -1531,11 +1527,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] ; CGP-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -1587,11 +1583,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v10, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -1761,173 +1757,173 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 ; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; 
GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_trunc_f32_e32 v6, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v9, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v6, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v4 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[6:7] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v8 +; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v10, v6 ; GISEL-NEXT: v_xor_b32_e32 
v12, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v1, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v9, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0x12d8fb -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v5, v11, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] ; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v12, v7 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v7, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v10, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v8, s[4:5] +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v6 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[6:7] +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, 
v15, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 ; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v0 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v10, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v8, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v8 +; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 ; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v9, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -1935,7 +1931,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 @@ -1946,13 +1942,13 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v9, v10, v4 +; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 ; GISEL-NEXT: 
v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1974,7 +1970,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc @@ -2000,10 +1996,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -2011,8 +2007,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s7, 0xffed2705 -; CGP-NEXT: s_mov_b32 s6, 0x12d8fb +; CGP-NEXT: s_mov_b32 s6, 0xffed2705 +; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -2023,7 +2019,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -2047,11 +2043,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -2098,11 +2094,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v1, v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v10, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb 
; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v11, v6, vcc @@ -2128,10 +2124,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v15, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v11 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc @@ -2161,11 +2157,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] ; CGP-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -2217,11 +2213,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v10, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -2279,131 +2275,131 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v6, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v2, v1, v0 -; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc -; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CHECK-NEXT: v_trunc_f32_e32 v7, v6 -; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v7, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v8, v2, v0 +; 
CHECK-NEXT: v_cvt_f32_u32_e32 v1, v7 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v8 +; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 +; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc +; CHECK-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; CHECK-NEXT: v_trunc_f32_e32 v5, v2 +; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[2:3] +; CHECK-NEXT: v_mul_lo_u32 v2, v12, v1 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v12, v1 +; CHECK-NEXT: v_mul_lo_u32 v13, v9, v5 +; CHECK-NEXT: v_mul_lo_u32 v14, v12, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CHECK-NEXT: v_mul_hi_u32 v5, v12, v5 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v1 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v2, vcc +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[2:3] +; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v10 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v10, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v2, v10 +; CHECK-NEXT: v_mul_lo_u32 v2, v12, v1 +; CHECK-NEXT: v_mul_lo_u32 v6, v9, v5 +; 
CHECK-NEXT: v_xor_b32_e32 v11, v3, v10 +; CHECK-NEXT: v_mul_hi_u32 v3, v9, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v12, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v12, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v5 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v5, v12, v5 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v1 +; CHECK-NEXT: v_mul_lo_u32 v5, v4, v2 +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v11, v1 +; CHECK-NEXT: v_mul_hi_u32 v9, v11, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 -; CHECK-NEXT: 
v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v1, v3 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v6, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v5, v[2:3] +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, v[2:3] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v11, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v11, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v8 +; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v8 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, v4, v9, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -2449,264 +2445,264 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4 -; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 +; GISEL-NEXT: v_xor_b32_e32 v12, v5, v4 +; GISEL-NEXT: v_xor_b32_e32 v8, v7, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v12 ; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; 
GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 -; GISEL-NEXT: v_trunc_f32_e32 v11, v9 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v12 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v14, v13, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v7, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v13, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v7 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[7:8] -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v14, v13, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v7, v[5:6] +; GISEL-NEXT: 
v_ashrrev_i32_e32 v14, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v13, v[10:11] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v0, v14 +; GISEL-NEXT: v_mul_lo_u32 v0, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v13, v10 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v14 +; GISEL-NEXT: v_mul_hi_u32 v1, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v7, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v8, v15, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v8, v16, v[1:2] -; GISEL-NEXT: v_lshl_b64 v[11:12], s[4:5], v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v15, v[9:10] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, 
v15, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v11, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v12, v13, v[1:2] +; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v12 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v9, v10, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v12, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v9 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v11 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v6 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v12 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v12, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v12 +; GISEL-NEXT: v_trunc_f32_e32 v18, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 ; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v12 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v22, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v21, v19, v[11:12] -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v18, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v1, vcc -; 
GISEL-NEXT: v_mul_lo_u32 v1, v22, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v19, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v22, v0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v6, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v18, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v19, v[8:9] +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v19, v8 +; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v22, v11 +; GISEL-NEXT: v_mul_lo_u32 v16, v18, v8 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v8, v18, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v22, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v12, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v11, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v2, v13 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v16, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v14, v4 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v12, v[8:9] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, 
v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v8 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v2 +; GISEL-NEXT: v_xor_b32_e32 v8, v11, v13 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v11, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 +; GISEL-NEXT: v_mul_hi_u32 v12, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v2, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v13 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v11, v[3:4] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v13, v6 +; GISEL-NEXT: v_xor_b32_e32 v4, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 @@ -2733,131 +2729,131 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc -; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v10, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v15, v3 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_mul_hi_u32 
v11, v12, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v15, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v10, v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v10, vcc +; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 +; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; CGP-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; CGP-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; CGP-NEXT: v_trunc_f32_e32 v3, v2 +; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v11, v1 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v14, v[2:3] +; CGP-NEXT: v_mul_hi_u32 v15, v11, v1 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v11, v[2:3] +; CGP-NEXT: v_mul_lo_u32 v3, v14, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v11, v2 +; CGP-NEXT: v_mul_lo_u32 v17, v14, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v10 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v17, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v11, v2 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v15, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v1 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v14, v[2:3] +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v11, v1 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v11, v[2:3] +; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v12 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v9, v3, v12 +; CGP-NEXT: v_mul_lo_u32 v3, v14, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v11, v2 +; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v14, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, 
v14, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v3, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v2 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v8, v1 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; CGP-NEXT: v_mul_hi_u32 v10, v15, v10 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v13 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v12, v10 -; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v15, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v10 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v15, v10 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v14, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_mul_hi_u32 v9, v11, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v3, v8 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v10, v[8:9] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v8, vcc -; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v14, v8 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v8, vcc, v8, v1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v8, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v2 +; CGP-NEXT: v_add_i32_e32 
v1, vcc, v13, v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v1, v3 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v11, v[2:3] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, v[2:3] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v8, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v10 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_cndmask_b32_e64 v4, v9, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v10 +; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CGP-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v13 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v2, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v10 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v12, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -2908,130 +2904,128 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: .LBB8_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v10, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v3, v2 -; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: 
v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v10, v8 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v10, v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v10 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v6 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v10, vcc +; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CGP-NEXT: v_trunc_f32_e32 v8, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v11, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v14, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v14, v3 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[8:9] +; CGP-NEXT: v_mul_hi_u32 v9, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v15, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v16, v14, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v8 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v15, v9 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v14, v[4:5] ; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] 
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[8:9] +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v4, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v11, v8 +; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v8, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v13, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 -; 
CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v10 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v10 +; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, v7, v11, s[4:5] +; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v9 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v7 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v6, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 ; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc @@ -3128,253 +3122,256 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 0, v1 ; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; 
GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; GISEL-NEXT: v_and_b32_e32 v14, 0xffffff, v2 +; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v3, 
vcc, v7, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; 
GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6] -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v11, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v3, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v11, v7 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v10, v[8:9] +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v12, v2, vcc +; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v12, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 0, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, v0 ; GISEL-NEXT: v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v2 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v0, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v2 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v1 +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v7, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v11 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v5, v4 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v6, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v12, 0 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v15, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v10 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v12, v[6:7] +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 -; GISEL-NEXT: 
v_cndmask_b32_e64 v19, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v6 ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v0, v4, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v17 -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2] -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v13, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v18, v14, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v15, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v6, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v18 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v7, v[1:2] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v19, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v6, v[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v19, v12, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v10, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, 
s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], 0, v14 +; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v6, v4 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v7, v4 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v6, v0 +; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v7, v4, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v13, v9, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: 
v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v9, v5 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v8, v[5:6] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v11, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v9, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v4, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v2 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v6, v2 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 4c444f46ff3ddd..468dc79457f4e8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1282,21 +1282,21 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s4, s13, 31 ; GFX8-NEXT: s_ashr_i32 s6, s1, 31 -; GFX8-NEXT: s_add_u32 s16, s12, s4 -; GFX8-NEXT: s_addc_u32 s17, s13, s4 +; GFX8-NEXT: s_add_u32 s12, s12, s4 +; GFX8-NEXT: s_addc_u32 s13, s13, s4 ; GFX8-NEXT: s_add_u32 s0, s0, s6 ; GFX8-NEXT: s_mov_b32 s7, s6 ; GFX8-NEXT: s_addc_u32 s1, s1, s6 -; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 +; GFX8-NEXT: s_xor_b64 s[16:17], s[0:1], s[6:7] +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s17 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s16 ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_xor_b64 s[16:17], s[16:17], s[4:5] +; GFX8-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s18, 0, s12 -; GFX8-NEXT: s_subb_u32 s19, 0, s13 +; GFX8-NEXT: s_sub_u32 s18, 0, s16 +; GFX8-NEXT: s_subb_u32 s19, 0, s17 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 @@ -1359,65 +1359,65 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, s17, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s16, v1 -; GFX8-NEXT: v_mul_hi_u32 v4, s16, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s17, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s17, v1 +; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v4, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s17 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s16, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NEXT: s_ashr_i32 s16, s3, 31 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v3, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v6, s13 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s12, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v5, s17 +; GFX8-NEXT: s_ashr_i32 s12, s3, 31 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s17, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s13, v1 +; 
GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v7 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s12, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v6 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s16, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: s_add_u32 s0, s14, s6 ; GFX8-NEXT: s_addc_u32 s1, s15, s6 -; GFX8-NEXT: s_add_u32 s2, s2, s16 -; GFX8-NEXT: s_mov_b32 s17, s16 -; GFX8-NEXT: s_addc_u32 s3, s3, s16 -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] +; GFX8-NEXT: s_add_u32 s2, s2, s12 +; GFX8-NEXT: s_mov_b32 s13, s12 +; GFX8-NEXT: s_addc_u32 s3, s3, s12 +; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v8 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s16, v8 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 @@ -1431,7 +1431,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v12 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] +; GFX8-NEXT: s_xor_b64 s[14:15], s[0:1], s[6:7] ; GFX8-NEXT: s_sub_u32 s5, 0, s2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v13, 0 @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v5, v[1:2] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v15, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], s20, v13, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[16:17], s20, v13, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v9, v16, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 @@ -1504,37 +1504,37 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s13, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s12, v3 +; GFX8-NEXT: v_mul_lo_u32 v7, s15, v2 +; GFX8-NEXT: v_mul_lo_u32 v8, s14, v3 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s12, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, s14, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s13, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s15, v3 +; GFX8-NEXT: v_mul_hi_u32 v2, s15, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s12, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s14, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s13, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, s15, v3 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s13 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s12, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s15 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s14, v2 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s13, v6 +; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s15, v6 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 @@ -1567,7 +1567,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v2, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[16:17] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v8 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v9 ; GFX8-NEXT: v_mov_b32_e32 v8, s1 @@ -1594,21 +1594,21 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s13, 31 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_u32 s16, s12, s4 -; GFX9-NEXT: s_addc_u32 s17, s13, s4 +; GFX9-NEXT: s_add_u32 s12, s12, s4 +; GFX9-NEXT: s_addc_u32 s13, s13, s4 ; GFX9-NEXT: s_add_u32 s0, s0, s6 ; GFX9-NEXT: s_mov_b32 s7, s6 ; GFX9-NEXT: s_addc_u32 s1, s1, s6 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 +; GFX9-NEXT: s_xor_b64 s[16:17], s[0:1], s[6:7] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s17 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s16 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_xor_b64 s[16:17], s[16:17], s[4:5] +; GFX9-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s18, 0, s12 -; GFX9-NEXT: s_subb_u32 s19, 0, s13 +; GFX9-NEXT: s_sub_u32 s18, 0, s16 +; GFX9-NEXT: s_subb_u32 s19, 0, s17 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1642,7 +1642,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 
v7, s17 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] @@ -1670,65 +1670,65 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v5, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v4, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s17 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s16, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v5, v[2:3] -; GFX9-NEXT: s_ashr_i32 s16, s3, 31 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v4, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s12, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s17, v5, v[2:3] +; GFX9-NEXT: s_ashr_i32 s12, s3, 31 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v1, s17, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 +; GFX9-NEXT: v_sub_u32_e32 v1, s13, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s12, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s16, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[0:1] ; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v10 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 ; 
GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s14, s6 ; GFX9-NEXT: s_addc_u32 s1, s15, s6 -; GFX9-NEXT: s_add_u32 s2, s2, s16 -; GFX9-NEXT: s_mov_b32 s17, s16 -; GFX9-NEXT: s_addc_u32 s3, s3, s16 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] +; GFX9-NEXT: s_add_u32 s2, s2, s12 +; GFX9-NEXT: s_mov_b32 s13, s12 +; GFX9-NEXT: s_addc_u32 s3, s3, s12 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 -; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s12, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s16, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v17, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v15 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v7 @@ -1742,16 +1742,16 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v13 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] +; GFX9-NEXT: s_xor_b64 s[14:15], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v14, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_subb_u32 s14, 0, s3 +; GFX9-NEXT: s_subb_u32 s16, 0, s3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v13, v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v11, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v14, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v14, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v3, v13, v1 ; GFX9-NEXT: v_mul_hi_u32 v11, v14, v1 ; GFX9-NEXT: v_mul_lo_u32 v4, v14, v2 @@ -1784,7 +1784,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GFX9-NEXT: v_xor_b32_e32 v7, s19, v7 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s14, v11, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s16, v11, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v10, s19 ; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s18, v9 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v10, vcc @@ -1812,18 +1812,18 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s13, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s12, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, s12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s13, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s13, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s15, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, s14, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, s14, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s15, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s15, v4 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, s13, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, s15, v4 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, s12, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, s14, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, s4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -1837,13 +1837,13 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: 
v_subb_co_u32_e32 v6, vcc, v6, v9, vcc ; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, s13 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s12, v3 +; GFX9-NEXT: v_mov_b32_e32 v10, s15 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s14, v3 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8] ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 -; GFX9-NEXT: v_sub_u32_e32 v7, s13, v7 +; GFX9-NEXT: v_sub_u32_e32 v7, s15, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] @@ -1875,7 +1875,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[16:17] +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[12:13] ; GFX9-NEXT: v_xor_b32_e32 v3, s0, v10 ; GFX9-NEXT: v_xor_b32_e32 v4, s1, v9 ; GFX9-NEXT: v_mov_b32_e32 v9, s1 @@ -1893,39 +1893,39 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-LABEL: sdivrem_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx4 s[20:23], s[4:5], 0x20 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s16, s1, 31 -; GFX10-NEXT: s_ashr_i32 s4, s13, 31 -; GFX10-NEXT: s_mov_b32 s17, s16 -; GFX10-NEXT: s_add_u32 s12, s12, s4 -; GFX10-NEXT: s_addc_u32 s13, s13, s4 -; GFX10-NEXT: s_add_u32 s0, s0, s16 -; GFX10-NEXT: s_addc_u32 s1, s1, s16 +; GFX10-NEXT: s_ashr_i32 s4, s21, 31 +; GFX10-NEXT: s_ashr_i32 s2, s13, 31 ; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_xor_b64 s[6:7], s[0:1], s[16:17] -; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX10-NEXT: s_sub_u32 s20, 0, s6 -; GFX10-NEXT: s_subb_u32 s21, 0, s7 -; GFX10-NEXT: s_ashr_i32 s12, s15, 31 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: s_xor_b64 s[18:19], s[4:5], s[16:17] -; GFX10-NEXT: s_ashr_i32 s16, s3, 31 +; GFX10-NEXT: s_add_u32 s0, s12, s2 +; GFX10-NEXT: s_addc_u32 s1, s13, s2 +; GFX10-NEXT: s_add_u32 s6, s20, s4 +; GFX10-NEXT: s_addc_u32 s7, s21, s4 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_xor_b64 s[18:19], s[6:7], s[4:5] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s19 +; GFX10-NEXT: s_sub_u32 s20, 0, s18 +; GFX10-NEXT: s_subb_u32 s21, 0, s19 +; GFX10-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] +; GFX10-NEXT: s_ashr_i32 s4, s15, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s18 +; GFX10-NEXT: s_ashr_i32 s6, s23, 31 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: s_add_u32 s14, s14, s12 -; GFX10-NEXT: s_addc_u32 s15, s15, s12 -; GFX10-NEXT: s_add_u32 s2, s2, s16 -; GFX10-NEXT: s_mov_b32 s17, s16 -; GFX10-NEXT: s_addc_u32 s3, s3, s16 +; GFX10-NEXT: s_add_u32 s14, s14, s4 +; GFX10-NEXT: s_addc_u32 s15, s15, s4 +; GFX10-NEXT: s_add_u32 s12, s22, s6 +; GFX10-NEXT: s_mov_b32 s7, s6 +; GFX10-NEXT: s_addc_u32 s13, s23, s6 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] -; GFX10-NEXT: s_mov_b32 s13, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX10-NEXT: s_xor_b64 s[12:13], s[12:13], s[6:7] +; 
GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] +; GFX10-NEXT: s_xor_b64 s[14:15], s[14:15], s[4:5] ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1941,18 +1941,18 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_trunc_f32_e32 v4, v4 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v0 ; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s20, v6, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, s20, v6, 0 ; GFX10-NEXT: v_mul_lo_u32 v8, s21, v6 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v4 -; GFX10-NEXT: s_sub_u32 s5, 0, s2 -; GFX10-NEXT: s_subb_u32 s22, 0, s3 +; GFX10-NEXT: s_sub_u32 s3, 0, s12 +; GFX10-NEXT: s_subb_u32 s22, 0, s13 ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX10-NEXT: v_mul_lo_u32 v9, s5, v3 +; GFX10-NEXT: v_mul_lo_u32 v9, s3, v3 ; GFX10-NEXT: v_add3_u32 v7, v1, v7, v8 ; GFX10-NEXT: v_mul_lo_u32 v10, v5, v0 ; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s5, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s3, v4, 0 ; GFX10-NEXT: v_mul_lo_u32 v8, s22, v4 ; GFX10-NEXT: v_mul_lo_u32 v12, v6, v7 ; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 @@ -2005,73 +2005,73 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v2, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v12, v6, v7 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s20, s5, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s20, s3, v4, 0 ; GFX10-NEXT: v_mul_lo_u32 v9, s22, v4 -; GFX10-NEXT: v_mul_lo_u32 v11, s5, v3 +; GFX10-NEXT: v_mul_lo_u32 v11, s3, v3 ; GFX10-NEXT: v_mul_lo_u32 v13, v5, v7 ; GFX10-NEXT: v_mul_hi_u32 v14, v6, v7 ; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX10-NEXT: v_add_co_u32 v8, s5, v8, v12 +; GFX10-NEXT: v_add_co_u32 v8, s3, v8, v12 ; GFX10-NEXT: v_mul_lo_u32 v15, v3, v1 ; GFX10-NEXT: v_mul_hi_u32 v16, v4, v1 ; GFX10-NEXT: v_add3_u32 v2, v2, v11, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v0, s5, v13, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v8, s5, v8, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v0, s3, v13, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v8, s3, v8, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s3 ; GFX10-NEXT: v_mul_lo_u32 v12, v4, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v8, v9, v8 ; GFX10-NEXT: v_mul_hi_u32 v1, v3, v1 ; GFX10-NEXT: v_mul_lo_u32 v13, v3, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v10, v11, v10 ; GFX10-NEXT: v_mul_hi_u32 v9, v4, v2 -; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v12 +; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v11, s3, v15, v12 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 ; GFX10-NEXT: v_add3_u32 v7, v10, v8, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v1, s5, v13, v1 
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v1, s3, v13, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s3 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo -; GFX10-NEXT: v_add_co_u32 v8, s5, v11, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v9 +; GFX10-NEXT: v_add_co_u32 v8, s3, v11, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v1, s3, v1, v9 ; GFX10-NEXT: v_mul_lo_u32 v7, s1, v0 ; GFX10-NEXT: v_mul_lo_u32 v9, s0, v5 ; GFX10-NEXT: v_mul_hi_u32 v10, s1, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX10-NEXT: v_mul_lo_u32 v11, s1, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v8, v12, v8 ; GFX10-NEXT: v_mul_hi_u32 v12, s0, v5 ; GFX10-NEXT: v_mul_hi_u32 v5, s1, v5 -; GFX10-NEXT: v_add_co_u32 v7, s5, v7, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v10, s5, v11, v10 +; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v10, s3, v11, v10 ; GFX10-NEXT: v_add_co_u32 v0, s20, v7, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3 +; GFX10-NEXT: v_add_co_u32 v10, s3, v10, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v9, v0 -; GFX10-NEXT: v_add_co_u32 v8, s5, v1, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v8, s3, v1, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v11 -; GFX10-NEXT: v_add_co_u32 v9, s5, v10, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v9, s3, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 ; GFX10-NEXT: v_mul_hi_u32 v2, v3, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v6, v13, v6 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 ; GFX10-NEXT: v_add3_u32 v5, v7, v0, v5 ; GFX10-NEXT: v_mul_hi_u32 v8, s14, v4 ; GFX10-NEXT: v_add3_u32 v2, v6, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v9, 0 -; GFX10-NEXT: v_mul_lo_u32 v6, s7, v9 -; GFX10-NEXT: v_mul_lo_u32 v7, s6, v5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, s18, v9, 0 +; GFX10-NEXT: v_mul_lo_u32 v6, s19, v9 +; GFX10-NEXT: v_mul_lo_u32 v7, s18, v5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v2, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v3, s15, v4 ; GFX10-NEXT: v_mul_hi_u32 v4, s15, v4 @@ -2083,23 +2083,23 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_sub_nc_u32_e32 v12, s1, v1 ; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v14, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v12, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v13 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v12, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v13, s6 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v13, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v0, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v14 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v0, 
vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v12 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v19, s0, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v20, s0, 0, v7, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v14 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v17, v18, v17, s0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v3, v10 ; GFX10-NEXT: v_mul_hi_u32 v10, s14, v2 @@ -2116,14 +2116,14 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v8, v10 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_sub_co_u32 v8, s0, v12, s6 +; GFX10-NEXT: v_sub_co_u32 v8, s0, v12, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v0, s0 ; GFX10-NEXT: v_add3_u32 v2, v3, v1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v19, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v7, s2, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, s3, v4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s12, v4, 0 +; GFX10-NEXT: v_mul_lo_u32 v7, s12, v2 +; GFX10-NEXT: v_mul_lo_u32 v11, s13, v4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 @@ -2137,34 +2137,34 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s1, s15, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s15, v1 -; GFX10-NEXT: v_xor_b32_e32 v0, s18, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v9 -; GFX10-NEXT: v_xor_b32_e32 v3, s19, v5 -; GFX10-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX10-NEXT: v_xor_b32_e32 v0, s16, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s13, v9 +; GFX10-NEXT: v_xor_b32_e32 v3, s17, v5 +; GFX10-NEXT: v_xor_b32_e32 v6, s2, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, vcc_lo, s3, v1, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, vcc_lo, s13, v1, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s2 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s12 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v10, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s18 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v9 -; GFX10-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s16 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v9 +; GFX10-NEXT: v_xor_b32_e32 v3, s2, v7 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, s13, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v13 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v12 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v12 ; GFX10-NEXT: 
v_cndmask_b32_e64 v11, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v14, s0, v4, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, 0, v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0 ; GFX10-NEXT: v_add_co_u32 v11, s0, v14, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v15, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: v_sub_co_u32 v7, s0, v12, s2 +; GFX10-NEXT: v_sub_co_u32 v7, s0, v12, s12 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 @@ -2175,17 +2175,17 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v7, s0 -; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 +; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[6:7] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s2 ; GFX10-NEXT: v_xor_b32_e32 v3, s0, v10 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v6, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v6, s1, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s12, v8 -; GFX10-NEXT: v_xor_b32_e32 v7, s12, v7 +; GFX10-NEXT: v_xor_b32_e32 v8, s4, v8 +; GFX10-NEXT: v_xor_b32_e32 v7, s4, v7 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s12 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s4 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s4, v7, vcc_lo ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[8:9] ; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[10:11] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index d0c55c69f50877..b27e01b1c3a257 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -24,135 +24,135 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v2, v1 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v6, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: 
v_ashrrev_i32_e32 v0, 31, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v6, v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 +; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 +; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc +; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; CHECK-NEXT: v_trunc_f32_e32 v2, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] +; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 +; CHECK-NEXT: v_mul_lo_u32 v12, v7, v1 +; CHECK-NEXT: v_mul_lo_u32 v13, v10, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v7, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v0 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] +; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v5 +; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v8 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v8, vcc +; CHECK-NEXT: v_xor_b32_e32 v5, v2, v8 +; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 +; CHECK-NEXT: v_mul_lo_u32 v9, v7, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v11, v10, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v9, v7, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_addc_u32_e32 
v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v10, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] -; CHECK-NEXT: 
v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v0, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v9, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v2, v[1:2] +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v9, v[1:2] +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v4, v1, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v4, v1 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v6 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v8 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -368,267 +368,266 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_srem_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: 
v_addc_u32_e32 v9, vcc, v5, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v4, v9 +; GISEL-NEXT: v_xor_b32_e32 v9, v5, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v9 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v11, v9 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v10, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v10 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v4 ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v4, v15, v4 ; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 ; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v12, v4 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v16, 0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc +; 
GISEL-NEXT: v_mov_b32_e32 v4, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v5, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v11 ; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v5, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_mul_hi_u32 v13, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v9 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v12, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v12, v[9:10] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v11, v0 -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v10, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v9, v5, v[10:11] +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v0 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v14, v10, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v8, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v9 +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v0, v9, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v13, v1, v5, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v1, v0 +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v11, v8 +; GISEL-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v10, vcc ; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v9 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, v7, v1, s[4:5] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v16, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v7, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v7, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v5 +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v6, s[4:5] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v9, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v16, v[1:2] -; GISEL-NEXT: 
v_sub_i32_e32 v1, vcc, v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v17, v[8:9] -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v20, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v20, v7 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v10, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v20, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v14, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v17, v[9:10] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v7 ; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v21, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v16, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v20, v7 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v20, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v14, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v13, v[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v20, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v18, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v9, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc +; 
GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 ; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v8 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v9, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v0 ; GISEL-NEXT: 
v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v5, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v5 ; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -646,131 +645,131 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB2_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v1 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v1 -; CGP-NEXT: 
v_xor_b32_e32 v1, v2, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v4, v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v3 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v4, vcc +; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; CGP-NEXT: v_trunc_f32_e32 v2, v1 +; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v5, 0 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v14, v[1:2] +; CGP-NEXT: v_mul_hi_u32 v15, v5, v0 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v5, v[1:2] +; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 +; CGP-NEXT: v_mul_lo_u32 v16, v5, v1 +; CGP-NEXT: v_mul_lo_u32 v17, v14, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v1 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v16, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] +; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v0 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v5, 0 +; 
CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v14, v[1:2] ; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v0 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v5, v[1:2] +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v12 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 +; CGP-NEXT: v_xor_b32_e32 v11, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v1 +; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 ; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v14, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v14, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v5, v11, v1 +; CGP-NEXT: v_mul_hi_u32 v13, v11, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v1 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v10, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 -; 
CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v2, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v13, v[1:2] +; CGP-NEXT: v_subb_u32_e64 v2, s[4:5], v10, v1, vcc +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v0, v3 +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v1, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 @@ -815,131 +814,131 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] ; CGP-NEXT: .LBB2_7: -; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v7 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v3 -; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: 
v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v6, v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v5 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 +; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v4, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4] +; CGP-NEXT: v_mul_hi_u32 v13, v7, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v7, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v4, v12, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v12, v2 +; CGP-NEXT: v_mul_lo_u32 v14, v7, v3 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v2 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v7, v[3:4] +; 
CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v10 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_xor_b32_e32 v9, v4, v10 +; CGP-NEXT: v_mul_lo_u32 v4, v12, v2 +; CGP-NEXT: v_mul_lo_u32 v11, v7, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v12, v2 ; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v12, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v8, v2 +; CGP-NEXT: v_mul_lo_u32 v7, v9, v3 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v8, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v9, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] -; CGP-NEXT: 
v_subb_u32_e64 v6, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v2, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v11, 0 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v4, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v11, v[3:4] +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v8, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v8, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v5 +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v6 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 @@ -1354,8 +1353,8 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s7, 0xf000 -; CGP-NEXT: s_movk_i32 s6, 0x1000 +; CGP-NEXT: s_movk_i32 s6, 0xf000 +; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1366,7 +1365,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -1390,11 +1389,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: 
v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -1442,11 +1441,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_hi_u32 v9, v11, v0 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v1, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v6, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 ; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v6, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6 @@ -1469,10 +1468,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v14, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v15, -1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v4 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v12, vcc @@ -1502,12 +1501,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v14, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc ; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -1558,11 +1557,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v9, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v3, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v3, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v5, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -1975,8 +1974,8 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 
v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s7, 0xffed2705 -; CGP-NEXT: s_mov_b32 s6, 0x12d8fb +; CGP-NEXT: s_mov_b32 s6, 0xffed2705 +; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1987,7 +1986,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -2011,11 +2010,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -2063,11 +2062,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_hi_u32 v9, v11, v0 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v1, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v6, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 ; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v6, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6 @@ -2090,10 +2089,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v14, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v15, -1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v4 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v12, vcc @@ -2123,12 +2122,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v14, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc ; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, 
v3 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -2179,11 +2178,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v9, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v3, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v3, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v5, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -2237,137 +2236,135 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB7_3: -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v7, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v6, v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 +; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v5 +; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc +; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; CHECK-NEXT: v_trunc_f32_e32 v2, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] +; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 +; CHECK-NEXT: v_mul_lo_u32 v12, v7, v1 +; CHECK-NEXT: v_mul_lo_u32 v13, v10, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: 
v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v0 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] +; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v8 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v8, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v2, v8 +; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 +; CHECK-NEXT: v_mul_lo_u32 v9, v7, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v11, v10, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v9, v7, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v3, v0 +; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v9, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; CHECK-NEXT: v_cndmask_b32_e64 
v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 +; CHECK-NEXT: v_mul_lo_u32 v9, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v0, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v9, v[1:2] +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v3, v1, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v3, v1 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; CHECK-NEXT: v_cndmask_b32_e64 
v4, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v5 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v6 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v8 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2408,264 +2405,264 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v5, v7, vcc ; GISEL-NEXT: v_xor_b32_e32 v5, v4, v7 -; GISEL-NEXT: v_xor_b32_e32 v7, v8, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, 0, v7, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v10, v8 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v10 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v10 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v12, v11, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v12, v14, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v8 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v11, v[9:10] -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; 
GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v12, v11, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v12, v14, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v7, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v11, v[9:10] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v14, v8 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: 
v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v1, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v14, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v14, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v12, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v14, v[8:9] -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v12, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v5, v7, v[1:2] +; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v12, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v11, v0 +; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, 
s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v7, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v8, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v0 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc ; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v0 ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v8 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, v10, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v10, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v10 +; GISEL-NEXT: v_trunc_f32_e32 v16, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16 ; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 ; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v8, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v7, s[4:5] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v20, v10 -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v9, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v20, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v17, v[9:10] -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v20, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v17, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v9, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v17, v[8:9] +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v20, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, 
v17, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v20, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v17, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v20, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v20, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v15, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7 -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v13, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v8 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; 
GISEL-NEXT: v_mul_hi_u32 v9, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -2685,131 +2682,131 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v2, v1 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v1 -; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc -; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v16, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v4, v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v4, vcc +; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; CGP-NEXT: v_trunc_f32_e32 v2, v1 +; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[1:2] +; CGP-NEXT: v_mul_hi_u32 v14, v10, v0 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v10, v[1:2] +; CGP-NEXT: v_mul_lo_u32 v2, v13, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v13, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4] +; CGP-NEXT: v_mul_hi_u32 v1, v13, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[1:2] ; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v0 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v10, v[1:2] +; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v11 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v4, v11 -; CGP-NEXT: v_mul_lo_u32 v4, v13, v2 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 +; CGP-NEXT: v_xor_b32_e32 v9, v2, v11 +; CGP-NEXT: v_mul_lo_u32 v2, v13, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 ; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: 
v_mul_lo_u32 v14, v13, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v13, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v8, v2 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v3 +; CGP-NEXT: v_mul_hi_u32 v1, v13, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v13, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v1 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v1 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v8, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v12, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v8, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v8, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v0, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v12, 0 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v2, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v12, v[1:2] +; CGP-NEXT: v_subb_u32_e64 v2, s[4:5], v8, v1, vcc +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v8, v1 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v0, v3 +; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v1, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v11 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v11 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 @@ -2856,137 +2853,135 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] ; CGP-NEXT: .LBB8_7: -; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v10 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v3 -; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v8, v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v8 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v6 +; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; 
CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v4, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4] +; CGP-NEXT: v_mul_hi_u32 v13, v9, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v4, v12, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v12, v2 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v3 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v2 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v7 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4] +; CGP-NEXT: v_add_i32_e32 
v4, vcc, v5, v10 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v4, v10 +; CGP-NEXT: v_mul_lo_u32 v4, v12, v2 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v12, v2 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v12, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v5, v2 +; CGP-NEXT: v_mul_lo_u32 v9, v7, v3 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v5, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v2, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v4, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[3:4] +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v5, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v5, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v5, 
0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v6 +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8 +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 65455d754be4f5..fd244d3bf2defe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -6251,15 +6251,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; ; GFX10-LABEL: s_ssubsat_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sub_u32 s18, s0, s8 -; GFX10-NEXT: s_subb_u32 s19, s1, s9 -; GFX10-NEXT: s_subb_u32 s16, s2, s10 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] -; GFX10-NEXT: s_subb_u32 s17, s3, s11 -; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] +; GFX10-NEXT: s_sub_u32 s16, s0, s8 +; GFX10-NEXT: s_subb_u32 s17, s1, s9 +; GFX10-NEXT: s_subb_u32 s18, s2, s10 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] +; GFX10-NEXT: s_subb_u32 s19, s3, s11 +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] ; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 
0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 @@ -6268,7 +6268,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_ashr_i32 s8, s17, 31 +; GFX10-NEXT: s_ashr_i32 s8, s19, 31 ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_add_u32 s9, s8, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 @@ -6304,12 +6304,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, s18 -; GFX10-NEXT: v_mov_b32_e32 v4, s19 +; GFX10-NEXT: v_mov_b32_e32 v3, s16 +; GFX10-NEXT: v_mov_b32_e32 v4, s17 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s16 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s19 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 77737b356ff6e9..7fd9f5c942c476 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -365,61 +365,61 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_udiv_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v11 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v8 +; GISEL-NEXT: v_mul_f32_e32 v12, 
0x2f800000, v9 +; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v12, v12 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 +; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v9, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v8 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v8 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v8, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v8, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -435,166 +435,166 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 +; 
GISEL-NEXT: v_mul_lo_u32 v20, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v8 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v9 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v9 +; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v8, v20 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v19 +; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v10 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; GISEL-NEXT: v_mul_hi_u32 v15, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v13 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 ; GISEL-NEXT: 
v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v11, v10, vcc +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v20, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, 
s[8:9] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v17, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v6, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v8 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 -; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 -; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v19, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v9 +; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v19 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v10, v13 +; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v6 ; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v4 ; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v20, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v4 ; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v4, v6, v11 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v12, s[6:7] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v11, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[16:17] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v19, v4 -; GISEL-NEXT: v_addc_u32_e64 v19, s[6:7], 0, v0, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v2, s[12:13] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], v1, v16, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v16 -; GISEL-NEXT: v_subb_u32_e64 v16, s[6:7], v3, v4, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v17, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[16:17] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v20, v4 
+; GISEL-NEXT: v_addc_u32_e64 v20, s[6:7], 0, v0, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v18 +; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], 0, v2, s[12:13] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v14 +; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], v1, v12, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v12 +; GISEL-NEXT: v_subb_u32_e64 v12, s[6:7], v3, v4, s[8:9] ; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[22:23] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc ; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v7 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v7 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v6, v16, v6, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[18:19] ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v20, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v17, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, v5 @@ -602,19 +602,19 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v16 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[8:9] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v18, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v19, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v17, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v15, v19, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v20, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v18, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64: @@ -1250,61 +1250,61 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 ; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 ; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 -; GISEL-NEXT: 
v_subb_u32_e32 v15, vcc, 0, v8, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v5 +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v11 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v6 +; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v9 +; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v12, v12 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 +; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v9, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v6 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v6 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v6, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v6, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 +; GISEL-NEXT: 
v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v6, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -1320,144 +1320,144 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v6 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v6 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v9 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v9 +; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v6, v20 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v19 +; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 +; 
GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v10 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v13 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v6 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v11, v10, vcc +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v2, v9 -; 
GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v18, v6 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v20, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v18, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v6 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 -; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 -; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v19, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v6 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v9 +; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v19 +; 
GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v10, v13 +; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v4 ; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v7 ; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v2, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v16, v20 -; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v12, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v19, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17 +; GISEL-NEXT: v_mul_lo_u32 v2, v4, v11 +; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v17, v12 +; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v11, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v20, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v18 ; GISEL-NEXT: v_subb_u32_e64 v17, s[6:7], v1, v4, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[16:17] @@ -1470,18 +1470,18 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v1, v8 ; GISEL-NEXT: v_addc_u32_e64 v1, s[12:13], 0, v7, s[12:13] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[22:23] -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, vcc, v3, v2, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_subb_u32_e64 v14, vcc, v3, v2, s[8:9] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 ; GISEL-NEXT: v_subb_u32_e64 v2, s[8:9], v2, v5, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v11, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v2, vcc, 0, v2, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[14:15] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v11, v4, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[14:15] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v14, v4, s[8:9] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] @@ -1489,17 +1489,17 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v19, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v18, s[6:7] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v13, v18, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v15, v19, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v17, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v1, s[8:9] ; GISEL-NEXT: 
v_cndmask_b32_e32 v0, v6, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom: @@ -1905,54 +1905,54 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v8 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v18, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v20, v4, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v17 +; GISEL-NEXT: v_mul_hi_u32 v21, v5, v17 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v19, v8, v12 -; GISEL-NEXT: 
v_mul_hi_u32 v22, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v22, v4, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_lo_u32 v23, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v23, v5, v13 ; GISEL-NEXT: v_mul_lo_u32 v24, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v25, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v25, v5, v13 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc @@ -1982,36 +1982,36 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v8 ; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v17, v4, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v5, v13 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GISEL-NEXT: v_mul_lo_u32 v9, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v21, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 @@ -2038,66 +2038,66 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v6 -; GISEL-NEXT: 
v_mul_hi_u32 v6, 0, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v11, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v17, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v16, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v1, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v0, v5 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v5 +; GISEL-NEXT: 
v_mul_lo_u32 v14, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v15, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v8, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v5 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10 ; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v16, vcc @@ -2140,12 +2140,12 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v17, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v20, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v6, v19, v21, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v21, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v4, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index fba8ef2948ade9..56943531ba8ae7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -1072,185 +1072,185 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v5, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v6, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s8, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v5, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v4, v1, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc -; GFX8-NEXT: 
v_cvt_f32_u32_e32 v1, s14 -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 -; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v6 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX8-NEXT: v_trunc_f32_e32 v14, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v14 -; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v1, v2, s[0:1] +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v0, v3, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v7 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v4, vcc +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX8-NEXT: v_trunc_f32_e32 v14, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v14 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v0 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v6, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v15, 0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v2, v16, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v14, v[1:2] ; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12 ; GFX8-NEXT: v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v15, v[2:3] -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1 -; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v15, v[1:2] +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v4, v3, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v14, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, v15, v1 ; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, v14, v2 -; GFX8-NEXT: v_mul_hi_u32 v1, v14, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v2, vcc +; GFX8-NEXT: v_mul_hi_u32 v2, v15, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, 
v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v14, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, v14, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, v15, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_hi_u32 v1, v14, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v14, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v12, v17, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v14, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v15, v[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v15, v[3:4] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v5, v10, v19, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3 -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX8-NEXT: v_mul_hi_u32 v8, v15, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v11, v20, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v19, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v6, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, v15, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GFX8-NEXT: v_mul_hi_u32 v7, v15, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v20, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 +; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3 +; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 +; GFX8-NEXT: v_mul_hi_u32 v9, v15, v3 +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v8, v14, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3 -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v8, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v9 +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v9 -; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v8, v7 -; 
GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v7 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v15, v3 -; GFX8-NEXT: v_addc_u32_e64 v4, s[0:1], v14, v4, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc -; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v9 +; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3 +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v7, v6 +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v15, v2 +; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v14, v3, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v6, s11, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s10, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, s11, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, s10, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s15 -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3 -; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc -; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 +; GFX8-NEXT: v_mul_hi_u32 v9, s11, v3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v8, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s14, v9, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v10, s11 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s15, v8, 
v[6:7] +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v2 +; GFX8-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v6, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s11, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s14, v7 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v2, vcc +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v8 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7 +; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 -; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v2, vcc ; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v16, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v13, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v9, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1] -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v8, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v9, s5 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udivrem_v2i64: @@ -1298,6 +1298,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s14 @@ -1337,183 +1338,181 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; 
GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5 +; GFX9-NEXT: v_add3_u32 v8, v3, v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s8, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s13, v8, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v1 -; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s15 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s14 -; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s12, v2 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v8 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX9-NEXT: v_trunc_f32_e32 v15, v4 -; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v15 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v11 +; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v4, v2, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v10 +; GFX9-NEXT: v_sub_u32_e32 v1, s9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v2, v3, s[0:1] +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s12, v9 +; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v15, v2 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v15 +; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v1 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v12 ; GFX9-NEXT: v_cndmask_b32_e64 
v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v16, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v17, v3, v17, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v16, v[4:5] -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v11 -; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v16, v[2:3] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v15, v1 +; GFX9-NEXT: v_mul_lo_u32 v7, v16, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v12 +; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v3, vcc +; GFX9-NEXT: v_mul_hi_u32 v3, v16, v1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v16, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 +; GFX9-NEXT: v_mul_hi_u32 v1, v15, v1 +; GFX9-NEXT: v_add_u32_e32 v3, v7, v3 +; GFX9-NEXT: v_mul_hi_u32 v7, v16, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v3 -; GFX9-NEXT: v_add3_u32 v4, v6, v5, v4 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v16, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v16, v1 +; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v7, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v2, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v18, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v19, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v16, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v18, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v19, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v7, v[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v14, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17 -; GFX9-NEXT: v_mul_lo_u32 v8, v15, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, v16, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v20, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v11, v16, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v21, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], v8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], v8, v11 +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, v7, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v20, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v12, v7, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v21, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v12, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v11, v15, v6 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX9-NEXT: v_mul_hi_u32 v9, v16, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v15, v6 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v11, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v8 -; GFX9-NEXT: v_add_u32_e32 v9, v11, v9 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v6 +; GFX9-NEXT: v_add_u32_e32 v8, v12, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX9-NEXT: v_add3_u32 v4, v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v7, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], v15, v4, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v6, s11, v3 +; GFX9-NEXT: v_mul_lo_u32 v7, s10, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v4 +; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 +; GFX9-NEXT: v_mul_hi_u32 v7, s10, v4 +; GFX9-NEXT: v_mul_hi_u32 v13, s11, v4 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v8, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v16, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v15, v6, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, s11, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc -; GFX9-NEXT: v_mul_hi_u32 v2, s10, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, s11, v5 -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], v8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 
v8, s11, v6 -; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 -; GFX9-NEXT: v_mul_hi_u32 v9, s10, v6 -; GFX9-NEXT: v_mul_hi_u32 v13, s11, v6 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v8, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], v5, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s14, v12, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v10, vcc -; GFX9-NEXT: v_add_u32_e32 v1, v11, v9 -; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13 -; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v9, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v10, s11 -; GFX9-NEXT: v_mov_b32_e32 v6, s15 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v12, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 -; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s14, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX9-NEXT: v_add3_u32 v10, v7, v12, v13 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v11, s11 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v11, s[0:1], v11, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 +; GFX9-NEXT: v_sub_u32_e32 v3, s11, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s14, v8 +; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v10, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v11 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 
v20, vcc, 0, v3, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1] -; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[4:5] -; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v14, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v15, v18, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v13, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v9, s[0:1] +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 097f6642cbc669..5f7b820a5b1dee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -359,61 +359,61 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_urem_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v11 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v8 +; GISEL-NEXT: 
v_mul_f32_e32 v12, 0x2f800000, v9 +; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v12, v12 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 +; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v9, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v8 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v8 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v8, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v8, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -429,102 +429,102 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 -; GISEL-NEXT: 
v_mul_lo_u32 v15, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v8 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v9 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v9 +; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v8, v20 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v19 +; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v10 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; GISEL-NEXT: v_mul_hi_u32 v15, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v13 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], 
v14, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v11, v10, vcc +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v20, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] @@ -533,40 +533,40 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v11, 
s[4:5], v11, v15 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v6, v11 +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 ; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 ; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 ; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 ; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 -; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 -; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[18:19], v17, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v10, v8 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v9 ; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc ; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] @@ -1751,63 +1751,63 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 -; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 
0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 +; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v11 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v8 +; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v9 +; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v12, v12 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 +; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v9, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v8 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v8 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v8, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v8, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 +; GISEL-NEXT: 
v_mul_hi_u32 v20, v9, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -1823,102 +1823,102 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v8 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v9 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v9 +; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v8, v20 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v19 +; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v10 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; GISEL-NEXT: v_mul_hi_u32 v15, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v13 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; 
GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v11, v10, vcc +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: 
v_add_i32_e64 v11, s[8:9], v20, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v20, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 +; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] @@ -1927,80 +1927,80 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v4, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 +; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 +; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v4 -; GISEL-NEXT: v_sub_i32_e64 v7, s[14:15], v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v4, s[16:17], v13, v4 -; GISEL-NEXT: v_add_i32_e64 v6, s[18:19], v17, v6 -; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 +; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 +; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 +; GISEL-NEXT: v_add_i32_e64 v10, s[18:19], v17, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v10 -; GISEL-NEXT: 
v_add_i32_e64 v9, s[6:7], v9, v11 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6 -; GISEL-NEXT: v_subb_u32_e64 v6, s[6:7], v3, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v10, v8 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v9 +; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v8, s[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e64 v14, vcc, 0, v3, s[12:13] -; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v5, s[12:13] +; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13] ; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v8 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5 ; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 ; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v12, v7, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[8:9] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom: @@ -2400,54 +2400,54 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; 
GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v8 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v18, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v20, v4, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v17 +; GISEL-NEXT: v_mul_hi_u32 v21, v5, v17 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v19, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v22, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v22, v4, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_lo_u32 v23, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v23, v5, v13 ; GISEL-NEXT: v_mul_lo_u32 v24, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v25, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v25, v5, v13 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc @@ -2477,36 +2477,36 
@@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v8 ; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v17, v4, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v5, v13 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GISEL-NEXT: v_mul_lo_u32 v9, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v21, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 @@ -2533,66 +2533,66 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: 
v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v11, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v17, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v16, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v4, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v0, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: 
v_subb_u32_e64 v6, s[4:5], 0, v4, vcc ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 8c4483fc118dbb..43d013f7e7a789 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -246,59 +246,59 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: no_agpr_no_reserve: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 -; GFX908-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] -; GFX908-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 -; GFX908-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 -; GFX908-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 -; GFX908-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX908-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX908-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 +; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] +; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48 +; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32 +; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 +; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 +; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 ; GFX908-NEXT: s_waitcnt vmcnt(7) -; GFX908-NEXT: v_add_u32_e32 v4, v4, v4 ; GFX908-NEXT: v_add_u32_e32 v3, v3, v3 ; GFX908-NEXT: v_add_u32_e32 v2, v2, v2 ; GFX908-NEXT: v_add_u32_e32 v1, v1, v1 +; GFX908-NEXT: v_add_u32_e32 v0, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(6) -; GFX908-NEXT: v_add_u32_e32 v8, v8, v8 ; GFX908-NEXT: v_add_u32_e32 v7, v7, v7 ; GFX908-NEXT: v_add_u32_e32 v6, v6, v6 +; GFX908-NEXT: v_add_u32_e32 v5, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v32, v32, v32 ; GFX908-NEXT: v_add_u32_e32 v31, v31, v31 ; GFX908-NEXT: v_add_u32_e32 v30, v30, v30 ; GFX908-NEXT: v_add_u32_e32 v29, v29, v29 -; GFX908-NEXT: v_add_u32_e32 v5, v5, v5 -; GFX908-NEXT: v_add_u32_e32 v12, v12, v12 +; GFX908-NEXT: v_add_u32_e32 v28, v28, v28 +; GFX908-NEXT: v_add_u32_e32 v4, v4, v4 ; GFX908-NEXT: v_add_u32_e32 v11, v11, v11 ; GFX908-NEXT: v_add_u32_e32 v10, v10, v10 ; GFX908-NEXT: v_add_u32_e32 v9, v9, v9 -; GFX908-NEXT: v_add_u32_e32 v16, v16, v16 +; GFX908-NEXT: v_add_u32_e32 v8, v8, v8 ; GFX908-NEXT: v_add_u32_e32 v15, v15, v15 ; GFX908-NEXT: v_add_u32_e32 v14, v14, v14 ; GFX908-NEXT: v_add_u32_e32 v13, v13, v13 -; GFX908-NEXT: v_add_u32_e32 v20, v20, v20 +; GFX908-NEXT: v_add_u32_e32 v12, v12, v12 ; GFX908-NEXT: v_add_u32_e32 v19, v19, v19 ; GFX908-NEXT: v_add_u32_e32 v18, v18, v18 ; GFX908-NEXT: v_add_u32_e32 v17, v17, v17 -; GFX908-NEXT: v_add_u32_e32 v24, v24, v24 +; GFX908-NEXT: v_add_u32_e32 v16, v16, v16 ; GFX908-NEXT: v_add_u32_e32 v23, v23, v23 ; GFX908-NEXT: v_add_u32_e32 v22, v22, v22 ; GFX908-NEXT: v_add_u32_e32 v21, v21, v21 -; GFX908-NEXT: v_add_u32_e32 v28, v28, v28 +; GFX908-NEXT: v_add_u32_e32 v20, v20, v20 ; GFX908-NEXT: v_add_u32_e32 v27, v27, v27 ; GFX908-NEXT: v_add_u32_e32 v26, v26, v26 ; GFX908-NEXT: v_add_u32_e32 v25, v25, 
v25 -; GFX908-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] -; GFX908-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX908-NEXT: v_add_u32_e32 v24, v24, v24 +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: no_agpr_no_reserve: @@ -514,49 +514,49 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX908-NEXT: s_load_dword s9, s[4:5], 0x18 -; GFX908-NEXT: s_mov_b32 s8, 0 -; GFX908-NEXT: s_mov_b32 s5, s8 +; GFX908-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX908-NEXT: s_load_dword s0, s[4:5], 0x18 +; GFX908-NEXT: s_mov_b32 s6, 0 +; GFX908-NEXT: s_mov_b32 s5, s6 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s4, 0, s3 -; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s9 +; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s19 +; GFX908-NEXT: s_sub_i32 s1, 0, s19 +; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s0 ; GFX908-NEXT: v_mov_b32_e32 v19, 0 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: s_mul_i32 s4, s4, s10 -; GFX908-NEXT: s_mul_hi_u32 s4, s10, s4 -; GFX908-NEXT: s_add_i32 s10, s10, s4 -; GFX908-NEXT: s_mul_hi_u32 s4, s2, s10 -; GFX908-NEXT: s_mul_i32 s10, s4, s3 -; GFX908-NEXT: s_sub_i32 s2, s2, s10 -; GFX908-NEXT: s_add_i32 s11, s4, 1 -; GFX908-NEXT: s_sub_i32 s10, s2, s3 -; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s4, s11, s4 -; GFX908-NEXT: s_cselect_b32 s2, s10, s2 -; GFX908-NEXT: s_add_i32 s10, s4, 1 -; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s4, s10, s4 -; GFX908-NEXT: s_lshr_b32 s9, s9, 16 +; GFX908-NEXT: v_readfirstlane_b32 s4, v2 +; GFX908-NEXT: s_mul_i32 s1, s1, s4 +; GFX908-NEXT: s_mul_hi_u32 s1, s4, s1 +; GFX908-NEXT: s_add_i32 s4, s4, s1 +; GFX908-NEXT: s_mul_hi_u32 s1, s18, s4 +; GFX908-NEXT: s_mul_i32 s4, s1, s19 +; GFX908-NEXT: s_sub_i32 s4, s18, s4 +; GFX908-NEXT: s_add_i32 s7, s1, 1 +; GFX908-NEXT: s_sub_i32 s8, s4, s19 +; GFX908-NEXT: s_cmp_ge_u32 s4, s19 +; GFX908-NEXT: s_cselect_b32 s1, s7, s1 +; GFX908-NEXT: s_cselect_b32 s4, s8, s4 +; GFX908-NEXT: s_add_i32 s7, s1, 1 +; GFX908-NEXT: s_cmp_ge_u32 s4, s19 +; GFX908-NEXT: s_cselect_b32 s4, s7, s1 +; 
GFX908-NEXT: s_lshr_b32 s0, s0, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0 ; GFX908-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s9 -; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5 +; GFX908-NEXT: s_lshl_b64 s[8:9], s[16:17], 5 ; GFX908-NEXT: s_or_b32 s10, s10, 28 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s5, v16 -; GFX908-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX908-NEXT: s_mul_i32 s1, s1, s5 -; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5 -; GFX908-NEXT: s_mul_i32 s0, s0, s5 -; GFX908-NEXT: s_add_i32 s1, s9, s1 +; GFX908-NEXT: v_readfirstlane_b32 s0, v16 +; GFX908-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX908-NEXT: s_mul_i32 s1, s17, s0 +; GFX908-NEXT: s_mul_hi_u32 s5, s16, s0 +; GFX908-NEXT: s_mul_i32 s0, s16, s0 +; GFX908-NEXT: s_add_i32 s1, s5, s1 ; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %Flow20 @@ -571,29 +571,29 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 -; GFX908-NEXT: s_mov_b32 s9, s8 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1 +; GFX908-NEXT: s_mov_b32 s7, s6 ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: v_mov_b32_e32 v4, s6 ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v8, s8 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v5, s9 -; GFX908-NEXT: v_mov_b32_e32 v9, s9 -; GFX908-NEXT: v_mov_b32_e32 v7, s9 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v9, s7 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v5, s7 +; GFX908-NEXT: v_mov_b32_e32 v8, s6 +; GFX908-NEXT: v_mov_b32_e32 v7, s7 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[2:3], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s5, v2 -; GFX908-NEXT: v_readfirstlane_b32 s9, v3 +; GFX908-NEXT: v_readfirstlane_b32 s7, v3 ; GFX908-NEXT: s_add_u32 s5, s5, 1 -; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; GFX908-NEXT: s_mul_hi_u32 s21, s2, s5 -; GFX908-NEXT: s_mul_i32 s22, s3, s5 -; GFX908-NEXT: s_mul_i32 s20, s2, s5 -; GFX908-NEXT: s_mul_i32 s5, s2, s9 +; GFX908-NEXT: s_addc_u32 s7, s7, 0 +; GFX908-NEXT: s_mul_hi_u32 s21, s8, s5 +; GFX908-NEXT: s_mul_i32 s22, s9, s5 +; GFX908-NEXT: s_mul_i32 s20, s8, s5 +; GFX908-NEXT: s_mul_i32 s5, s8, s7 ; GFX908-NEXT: s_add_i32 s5, s21, s5 ; GFX908-NEXT: s_add_i32 s5, s5, s22 ; GFX908-NEXT: s_branch .LBB3_5 @@ -670,8 +670,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s6, s6, s4 -; GFX908-NEXT: s_addc_u32 s7, s7, 0 +; GFX908-NEXT: s_add_u32 s2, s2, s4 +; GFX908-NEXT: s_addc_u32 s3, s3, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 ; GFX908-NEXT: s_mov_b64 s[0:1], 0 @@ -682,48 +682,48 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v18, 
v[0:1], off glc -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 -; GFX90A-NEXT: s_mov_b32 s8, 0 -; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x18 +; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: s_mov_b32 s5, s6 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s19 +; GFX90A-NEXT: s_sub_i32 s1, 0, s19 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s9 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 -; GFX90A-NEXT: s_mul_i32 s4, s4, s10 -; GFX90A-NEXT: s_mul_hi_u32 s4, s10, s4 -; GFX90A-NEXT: s_add_i32 s10, s10, s4 -; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s10 -; GFX90A-NEXT: s_mul_i32 s10, s4, s3 -; GFX90A-NEXT: s_sub_i32 s2, s2, s10 -; GFX90A-NEXT: s_add_i32 s11, s4, 1 -; GFX90A-NEXT: s_sub_i32 s10, s2, s3 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s4, s11, s4 -; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 -; GFX90A-NEXT: s_add_i32 s10, s4, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s4, s10, s4 -; GFX90A-NEXT: s_lshr_b32 s9, s9, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX90A-NEXT: v_readfirstlane_b32 s4, v1 +; GFX90A-NEXT: s_mul_i32 s1, s1, s4 +; GFX90A-NEXT: s_mul_hi_u32 s1, s4, s1 +; GFX90A-NEXT: s_add_i32 s4, s4, s1 +; GFX90A-NEXT: s_mul_hi_u32 s1, s18, s4 +; GFX90A-NEXT: s_mul_i32 s4, s1, s19 +; GFX90A-NEXT: s_sub_i32 s4, s18, s4 +; GFX90A-NEXT: s_add_i32 s7, s1, 1 +; GFX90A-NEXT: s_sub_i32 s8, s4, s19 +; GFX90A-NEXT: s_cmp_ge_u32 s4, s19 +; GFX90A-NEXT: s_cselect_b32 s1, s7, s1 +; GFX90A-NEXT: s_cselect_b32 s4, s8, s4 +; GFX90A-NEXT: s_add_i32 s7, s1, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s4, s19 +; GFX90A-NEXT: s_cselect_b32 s4, s7, s1 +; GFX90A-NEXT: s_lshr_b32 s0, s0, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s0 ; GFX90A-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s9 -; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[2:3], 5 +; GFX90A-NEXT: s_lshl_b64 s[8:9], s[16:17], 5 ; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s5, v18 -; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX90A-NEXT: s_mul_i32 s1, s1, s5 -; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5 -; GFX90A-NEXT: s_mul_i32 s0, s0, s5 -; GFX90A-NEXT: s_add_i32 s1, s9, s1 +; GFX90A-NEXT: v_readfirstlane_b32 s0, v18 +; GFX90A-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX90A-NEXT: s_mul_i32 s1, s17, s0 +; GFX90A-NEXT: s_mul_hi_u32 s5, s16, s0 +; GFX90A-NEXT: s_mul_i32 s0, s16, s0 +; GFX90A-NEXT: s_add_i32 s1, s5, s1 ; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %Flow20 @@ -738,25 +738,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1 +; GFX90A-NEXT: s_mov_b32 s7, s6 ; 
GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[2:3], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 +; GFX90A-NEXT: v_readfirstlane_b32 s7, v5 ; GFX90A-NEXT: s_add_u32 s5, s5, 1 -; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s21, s2, s5 -; GFX90A-NEXT: s_mul_i32 s22, s3, s5 -; GFX90A-NEXT: s_mul_i32 s20, s2, s5 -; GFX90A-NEXT: s_mul_i32 s5, s2, s9 +; GFX90A-NEXT: s_addc_u32 s7, s7, 0 +; GFX90A-NEXT: s_mul_hi_u32 s21, s8, s5 +; GFX90A-NEXT: s_mul_i32 s22, s9, s5 +; GFX90A-NEXT: s_mul_i32 s20, s8, s5 +; GFX90A-NEXT: s_mul_i32 s5, s8, s7 ; GFX90A-NEXT: s_add_i32 s5, s21, s5 ; GFX90A-NEXT: s_add_i32 s5, s5, s22 ; GFX90A-NEXT: s_branch .LBB3_5 @@ -826,8 +826,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s6, s6, s4 -; GFX90A-NEXT: s_addc_u32 s7, s7, 0 +; GFX90A-NEXT: s_add_u32 s2, s2, s4 +; GFX90A-NEXT: s_addc_u32 s3, s3, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, s11, s13 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index c793f9ee682f8c..beb78cd7d389ae 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -9221,17 +9221,17 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s10 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX6-NEXT: s_ashr_i32 s14, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s14 -; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: s_addc_u32 s3, s3, s14 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[14:15] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_u32 s10, 0, s2 -; GFX6-NEXT: s_subb_u32 s11, 0, s3 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s10 +; GFX6-NEXT: s_ashr_i32 s2, s9, 31 +; GFX6-NEXT: s_add_u32 s8, s8, s2 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s9, s9, s2 +; GFX6-NEXT: s_xor_b64 s[12:13], s[8:9], s[2:3] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX6-NEXT: s_sub_u32 s10, 0, s12 +; GFX6-NEXT: s_subb_u32 s11, 0, s13 ; GFX6-NEXT: s_ashr_i32 s16, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 @@ -9306,23 +9306,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 
v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s13 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -9331,23 +9331,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[14:15] -; GFX6-NEXT: s_ashr_i32 s4, s13, 31 -; GFX6-NEXT: s_add_u32 s12, s12, s4 +; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[2:3] +; GFX6-NEXT: s_ashr_i32 s2, s15, 31 +; GFX6-NEXT: s_add_u32 s4, s14, s2 ; GFX6-NEXT: v_mov_b32_e32 v6, s5 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s13, s13, s4 -; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s5, s15, s2 +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 @@ -9356,16 +9356,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s2, 0, s12 +; GFX6-NEXT: s_sub_u32 s12, 0, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX6-NEXT: s_subb_u32 s3, 0, s13 -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 +; GFX6-NEXT: s_subb_u32 s13, 0, s5 +; GFX6-NEXT: 
v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 @@ -9384,11 +9384,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -9403,14 +9403,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s2, s7, 31 +; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s6, s6, s2 +; GFX6-NEXT: s_add_u32 s6, s6, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s7, s7, s2 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s7, s7, s12 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 @@ -9426,25 +9426,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s4, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s5, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s13 +; GFX6-NEXT: v_mov_b32_e32 v7, s5 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s4, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] @@ -9455,15 +9455,15 @@ define 
amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] +; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index f8f50c7cb23a5a..1b998b41a2a62e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -224,23 +224,23 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -255,22 +255,22 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; 
GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -285,22 +285,22 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -314,23 +314,23 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -371,24 +371,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: 
v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1302,23 +1302,23 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB6_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1333,22 +1333,22 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1363,22 +1363,22 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; 
%bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1392,23 +1392,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1451,24 +1451,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: 
s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 81fd166e3779f8..c9345706fc0809 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -48,17 +48,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX89-LABEL: add_i32_constant: ; GFX89: ; %bb.0: ; %entry ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX89-NEXT: s_mov_b64 s[6:7], exec -; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX89-NEXT: s_mov_b64 s[4:5], exec +; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX89-NEXT: ; implicit-def: $vgpr1 -; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX89-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX89-NEXT: s_cbranch_execz .LBB0_2 ; GFX89-NEXT: ; %bb.1: ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX89-NEXT: s_mul_i32 s2, s2, 5 ; GFX89-NEXT: s_mov_b32 s11, 0xf000 ; GFX89-NEXT: s_mov_b32 s10, -1 @@ -69,7 +69,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_wbinvl1_vol ; GFX89-NEXT: .LBB0_2: -; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX89-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX89-NEXT: v_readfirstlane_b32 s4, v1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s3, 0xf000 @@ -1518,17 +1518,17 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX8-NEXT: s_mul_i32 s2, s2, 5 ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 @@ -1539,7 +1539,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1552,17 +1552,17 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; 
GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index baa0c72dbf63e2..f619716c8ed3aa 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -223,23 +223,23 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -254,22 +254,22 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 
-; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -284,22 +284,22 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -313,23 +313,23 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -370,24 +370,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec -; GFX11W64-NEXT: 
v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1004,23 +1004,23 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1035,22 +1035,22 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1065,22 +1065,22 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1094,23 +1094,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1153,24 +1153,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, 
v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 5990736f664fbe..ba382627561013 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -13,16 +13,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr17, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) - ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) + ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc - ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 8, implicit-def $scc + ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 8, implicit-def 
$scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec @@ -33,12 +33,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc ; GFX90A-NEXT: $vgpr24 = IMPLICIT_DEF - ; GFX90A-NEXT: $agpr0 = IMPLICIT_DEF + ; GFX90A-NEXT: $vgpr12 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr26 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr22 = IMPLICIT_DEF @@ -46,11 +46,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr40, $sgpr41, $sgpr20_sgpr21_sgpr22, $vgpr24, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr12, $vgpr26, $vgpr20, $vgpr22 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr33 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF @@ -59,7 +59,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, 
$sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr40_sgpr41:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr4 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -67,7 +67,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr25, implicit $exec @@ -86,6 +86,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 @@ -93,7 +94,6 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -112,27 +112,27 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; 
GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr21 = COPY renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr23 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr25 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr27 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr26 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = COPY renamable $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr20 = COPY $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr23 = COPY $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22 
= COPY $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr25 = COPY $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr27 = COPY $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr26 = COPY $sgpr23, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec ; GFX90A-NEXT: {{ $}} @@ -151,7 +151,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, 
$sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) @@ -168,37 +168,37 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: 
$sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.16, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.15.bb72: ; GFX90A-NEXT: successors: %bb.16(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, 
$sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc @@ -208,113 +208,113 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr13 = COPY killed renamable $sgpr15 ; GFX90A-NEXT: $sgpr14 = COPY killed renamable $sgpr16 ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.16.Flow36: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec ; 
GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.18, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.17.bb67: ; GFX90A-NEXT: successors: %bb.18(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.18.Flow37: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.20, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.19.bb62: ; GFX90A-NEXT: successors: %bb.20(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, 
$sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.20.Flow38: ; GFX90A-NEXT: successors: %bb.21(0x40000000), %bb.22(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.22, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.21.bb54: ; GFX90A-NEXT: successors: %bb.22(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.22.Flow39: ; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.24, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.23.bb47: ; GFX90A-NEXT: successors: %bb.24(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.24.Flow40: ; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, 
$sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.26, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.25.bb40: ; GFX90A-NEXT: successors: %bb.26(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.26.Flow41: ; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.28, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.27.bb33: ; GFX90A-NEXT: successors: %bb.28(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, 
$sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.28.Flow42: ; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec @@ -323,7 +323,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.29.Flow43: ; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr40_sgpr41, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -331,17 +331,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.30.bb19: ; GFX90A-NEXT: successors: %bb.31(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr40_sgpr41, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.31.Flow44: ; GFX90A-NEXT: successors: 
%bb.32(0x40000000), %bb.33(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr40_sgpr41, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.33, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock: @@ -357,32 +357,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.34.bb26: ; GFX90A-NEXT: successors: %bb.29(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr40_sgpr41, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_OR_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.29 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = 
S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -399,26 +399,28 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, 
$vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr40_sgpr41:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr44_sgpr45, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF @@ -436,40 +438,41 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, 
$vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable 
$sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr40_sgpr41, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.36 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr48_sgpr49, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed 
$vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF @@ -486,45 +489,44 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: 
renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr40_sgpr41, killed renamable $sgpr46_sgpr47, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.38 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, 
$sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr42_sgpr43, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1) ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr20, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -538,50 +540,50 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr44_sgpr45 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, 
$vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr44_sgpr45, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr20, implicit $exec - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, 
implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_OR_B64 killed renamable $sgpr40_sgpr41, killed renamable $sgpr46_sgpr47, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.40 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc + ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr62_sgpr63, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc - ; GFX90A-NEXT: $agpr0 = 
IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc + ; GFX90A-NEXT: $vgpr12 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr20, $vgpr61, $vgpr31, $vgpr63, $agpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr4, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr14 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr20, $vgpr61, $vgpr31, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr40, $vgpr62, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr4, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr12, $vgpr14, $vgpr60 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -593,42 +595,42 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: ; GFX90A-NEXT: successors: %bb.47(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr40_sgpr41, killed renamable $sgpr46_sgpr47, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, 
$sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr56_sgpr57, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1) + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -641,57 +643,57 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; 
GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc - ; GFX90A-NEXT: 
renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr40_sgpr41, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.42 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} - ; 
GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.49: ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr62_sgpr63, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, $vgpr4_vgpr5, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -702,13 +704,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable 
$sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc @@ -718,11 +720,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, 
$sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF @@ -731,19 +733,19 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, 
$vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr7, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr1, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF @@ -755,45 +757,46 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.Flow29: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, 
$sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec + ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, 
implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr56, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr5, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr40, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr41, killed $vgpr5, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr30 = V_ALIGNBIT_B32_e64 $vgpr19, $vgpr18, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr23:0x0000000000000003, $sgpr33:0x0000000000000003, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr12_vgpr13:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = 
S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 @@ -801,7 +804,6 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -826,7 +828,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.58.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr40_sgpr41:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) @@ -834,26 +836,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $agpr0_agpr1 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr33 = S_MOV_B32 0 ; GFX90A-NEXT: renamable $sgpr23 = S_MOV_B32 0 - ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, 
$sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41:0x000000000000000F, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = FLAT_LOAD_UBYTE renamable $vgpr10_vgpr11, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) - ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr23 = S_MOV_B32 0 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr5, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF @@ -862,33 +864,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, 
$sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $vgpr16, implicit $exec - ; GFX90A-NEXT: renamable $agpr0_agpr1 = COPY killed renamable $vgpr12_vgpr13, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, 
$vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr23, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr40_sgpr41, killed renamable $sgpr52_sgpr53, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, 
$sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -896,14 +897,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, 
$vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr4, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec @@ -912,26 +913,25 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, 
$sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr23, killed $vgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = COPY killed renamable $agpr1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr3, killed $vgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr13, killed $vgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable 
$vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr54, 0, $vgpr3, 0, 0, 6, implicit $exec @@ -946,15 +946,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr28 = V_OR_B32_e32 1, $vgpr26, implicit $exec ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr24, implicit $exec ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr32 = V_CNDMASK_B32_e64 0, $vgpr36, 0, 0, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 $vgpr32, $vgpr20, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = COPY renamable $agpr0_agpr1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr50, killed $vgpr12, implicit $exec + ; GFX90A-NEXT: renamable $vgpr48 = 
V_OR_B32_e32 $vgpr50, $vgpr12, implicit $exec ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr14, implicit $exec ; GFX90A-NEXT: renamable $vgpr52 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr34, killed $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 @@ -963,14 +962,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, 
killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr27, implicit $exec @@ -999,19 +998,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; 
GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, 
$vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr5 = V_OR_B32_e32 $vgpr52, killed $vgpr18, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = V_OR_B32_e32 killed $vgpr5, killed $vgpr16, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr13, renamable $vgpr12_vgpr13, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $agpr0_agpr1 = COPY killed renamable $vgpr60_vgpr61, implicit $exec + ; GFX90A-NEXT: renamable $vgpr60 = V_OR_B32_e32 killed $vgpr5, killed $vgpr16, implicit $exec + ; GFX90A-NEXT: renamable $vgpr61 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr61, renamable $vgpr60_vgpr61, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = COPY killed renamable $agpr0_agpr1, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.69 bb: diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index e4c7df385d8619..5cc88343faffaa 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -189,29 +189,29 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx8 
s[4:11], s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s12, 0xff00ff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v0, s7, s7, 8 -; SI-NEXT: v_alignbit_b32 v1, s7, s7, 24 -; SI-NEXT: v_alignbit_b32 v2, s6, s6, 8 -; SI-NEXT: v_alignbit_b32 v4, s6, s6, 24 -; SI-NEXT: v_alignbit_b32 v5, s5, s5, 8 -; SI-NEXT: v_alignbit_b32 v6, s5, s5, 24 -; SI-NEXT: v_alignbit_b32 v7, s4, s4, 8 -; SI-NEXT: v_alignbit_b32 v8, s4, s4, 24 -; SI-NEXT: v_alignbit_b32 v9, s11, s11, 8 -; SI-NEXT: v_alignbit_b32 v10, s11, s11, 24 -; SI-NEXT: v_alignbit_b32 v11, s10, s10, 8 -; SI-NEXT: v_alignbit_b32 v12, s10, s10, 24 -; SI-NEXT: v_alignbit_b32 v13, s9, s9, 8 -; SI-NEXT: v_alignbit_b32 v14, s9, s9, 24 -; SI-NEXT: v_alignbit_b32 v15, s8, s8, 8 -; SI-NEXT: v_alignbit_b32 v16, s8, s8, 24 +; SI-NEXT: v_alignbit_b32 v0, s3, s3, 8 +; SI-NEXT: v_alignbit_b32 v1, s3, s3, 24 +; SI-NEXT: v_alignbit_b32 v2, s2, s2, 8 +; SI-NEXT: v_alignbit_b32 v4, s2, s2, 24 +; SI-NEXT: v_alignbit_b32 v5, s1, s1, 8 +; SI-NEXT: v_alignbit_b32 v6, s1, s1, 24 +; SI-NEXT: v_alignbit_b32 v7, s0, s0, 8 +; SI-NEXT: v_alignbit_b32 v8, s0, s0, 24 +; SI-NEXT: v_alignbit_b32 v9, s7, s7, 8 +; SI-NEXT: v_alignbit_b32 v10, s7, s7, 24 +; SI-NEXT: v_alignbit_b32 v11, s6, s6, 8 +; SI-NEXT: v_alignbit_b32 v12, s6, s6, 24 +; SI-NEXT: v_alignbit_b32 v13, s5, s5, 8 +; SI-NEXT: v_alignbit_b32 v14, s5, s5, 24 +; SI-NEXT: v_alignbit_b32 v15, s4, s4, 8 +; SI-NEXT: v_alignbit_b32 v16, s4, s4, 24 ; SI-NEXT: v_bfi_b32 v3, s12, v1, v0 ; SI-NEXT: v_bfi_b32 v2, s12, v4, v2 ; SI-NEXT: v_bfi_b32 v1, s12, v6, v5 @@ -220,8 +220,8 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; SI-NEXT: v_bfi_b32 v6, s12, v12, v11 ; SI-NEXT: v_bfi_b32 v5, s12, v14, v13 ; SI-NEXT: v_bfi_b32 v4, s12, v16, v15 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_bswap_v8i32: @@ -398,29 +398,29 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s12, 0xff00ff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v0, s6, s6, 8 -; SI-NEXT: v_alignbit_b32 v1, s6, s6, 24 -; SI-NEXT: v_alignbit_b32 v2, s7, s7, 8 -; SI-NEXT: v_alignbit_b32 v4, s7, s7, 24 -; SI-NEXT: v_alignbit_b32 v5, s4, s4, 8 -; SI-NEXT: v_alignbit_b32 v6, s4, s4, 24 -; SI-NEXT: v_alignbit_b32 v7, s5, s5, 8 -; SI-NEXT: v_alignbit_b32 v8, s5, s5, 24 -; SI-NEXT: v_alignbit_b32 v9, s10, s10, 8 -; SI-NEXT: v_alignbit_b32 v10, s10, s10, 24 -; SI-NEXT: v_alignbit_b32 v11, s11, s11, 8 -; SI-NEXT: v_alignbit_b32 v12, s11, s11, 24 -; SI-NEXT: v_alignbit_b32 v13, s8, s8, 8 -; SI-NEXT: v_alignbit_b32 v14, s8, s8, 24 -; SI-NEXT: v_alignbit_b32 v15, s9, s9, 8 -; SI-NEXT: 
v_alignbit_b32 v16, s9, s9, 24 +; SI-NEXT: v_alignbit_b32 v0, s2, s2, 8 +; SI-NEXT: v_alignbit_b32 v1, s2, s2, 24 +; SI-NEXT: v_alignbit_b32 v2, s3, s3, 8 +; SI-NEXT: v_alignbit_b32 v4, s3, s3, 24 +; SI-NEXT: v_alignbit_b32 v5, s0, s0, 8 +; SI-NEXT: v_alignbit_b32 v6, s0, s0, 24 +; SI-NEXT: v_alignbit_b32 v7, s1, s1, 8 +; SI-NEXT: v_alignbit_b32 v8, s1, s1, 24 +; SI-NEXT: v_alignbit_b32 v9, s6, s6, 8 +; SI-NEXT: v_alignbit_b32 v10, s6, s6, 24 +; SI-NEXT: v_alignbit_b32 v11, s7, s7, 8 +; SI-NEXT: v_alignbit_b32 v12, s7, s7, 24 +; SI-NEXT: v_alignbit_b32 v13, s4, s4, 8 +; SI-NEXT: v_alignbit_b32 v14, s4, s4, 24 +; SI-NEXT: v_alignbit_b32 v15, s5, s5, 8 +; SI-NEXT: v_alignbit_b32 v16, s5, s5, 24 ; SI-NEXT: v_bfi_b32 v3, s12, v1, v0 ; SI-NEXT: v_bfi_b32 v2, s12, v4, v2 ; SI-NEXT: v_bfi_b32 v1, s12, v6, v5 @@ -429,8 +429,8 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; SI-NEXT: v_bfi_b32 v6, s12, v12, v11 ; SI-NEXT: v_bfi_b32 v5, s12, v14, v13 ; SI-NEXT: v_bfi_b32 v4, s12, v16, v15 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_bswap_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll index 66ba818b400b6f..7b7a67193d226f 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll @@ -10,7 +10,7 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: s_mov_b64 s[4:5], 0 ; ISA-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v0 ; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; ISA-NEXT: v_mov_b32_e32 v6, 0 +; ISA-NEXT: v_mov_b32_e32 v7, 0 ; ISA-NEXT: s_waitcnt lgkmcnt(0) ; ISA-NEXT: s_lshr_b32 s6, s5, 1 ; ISA-NEXT: s_lshr_b32 s7, 1, s4 @@ -27,18 +27,18 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: s_mov_b32 s4, 0 ; ISA-NEXT: .LBB0_1: ; %bb14 ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 -; ISA-NEXT: v_mov_b32_e32 v7, v6 +; ISA-NEXT: v_mov_b32_e32 v6, v7 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo ; ISA-NEXT: s_or_b32 s4, s5, s4 -; ISA-NEXT: v_add_f32_e32 v6, v7, v0 -; ISA-NEXT: v_add_f32_e64 v6, v6, |v3| -; ISA-NEXT: v_add_f32_e32 v6, v6, v4 -; ISA-NEXT: v_add_f32_e32 v6, v6, v5 +; ISA-NEXT: v_add_f32_e32 v7, v6, v0 +; ISA-NEXT: v_add_f32_e64 v7, v7, |v3| +; ISA-NEXT: v_add_f32_e32 v7, v7, v4 +; ISA-NEXT: v_add_f32_e32 v7, v7, v5 ; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; ISA-NEXT: s_cbranch_execnz .LBB0_1 ; ISA-NEXT: ; %bb.2: ; %bb21 ; ISA-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; ISA-NEXT: flat_store_dword v[1:2], v7 +; ISA-NEXT: flat_store_dword v[1:2], v6 ; ISA-NEXT: s_waitcnt lgkmcnt(0) ; ISA-NEXT: s_setpc_b64 s[30:31] ; MIR-LABEL: name: f diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index cb89841b58f978..a5a46ddb0f458b 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -1449,41 +1449,41 @@ entry: define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; SI-LABEL: amd_kernel_v16i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; 
SI-NEXT: s_lshr_b32 s5, s0, 16 -; SI-NEXT: s_lshr_b32 s6, s0, 24 -; SI-NEXT: s_lshr_b32 s8, s1, 16 -; SI-NEXT: s_lshr_b32 s9, s1, 24 -; SI-NEXT: s_lshr_b32 s10, s2, 16 -; SI-NEXT: s_lshr_b32 s11, s2, 24 -; SI-NEXT: s_lshr_b32 s12, s3, 16 -; SI-NEXT: s_lshr_b32 s13, s3, 24 -; SI-NEXT: s_bfe_u32 s14, s0, 0x80008 -; SI-NEXT: s_bfe_u32 s15, s1, 0x80008 -; SI-NEXT: s_bfe_u32 s16, s2, 0x80008 -; SI-NEXT: s_bfe_u32 s17, s3, 0x80008 -; SI-NEXT: s_add_i32 s3, s3, s3 -; SI-NEXT: s_add_i32 s2, s2, s2 -; SI-NEXT: s_add_i32 s1, s1, s1 -; SI-NEXT: s_add_i32 s0, s0, s0 +; SI-NEXT: s_lshr_b32 s1, s4, 16 +; SI-NEXT: s_lshr_b32 s2, s4, 24 +; SI-NEXT: s_lshr_b32 s8, s5, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 24 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_lshr_b32 s11, s6, 24 +; SI-NEXT: s_lshr_b32 s12, s7, 16 +; SI-NEXT: s_lshr_b32 s13, s7, 24 +; SI-NEXT: s_bfe_u32 s14, s4, 0x80008 +; SI-NEXT: s_bfe_u32 s15, s5, 0x80008 +; SI-NEXT: s_bfe_u32 s16, s6, 0x80008 +; SI-NEXT: s_bfe_u32 s17, s7, 0x80008 +; SI-NEXT: s_add_i32 s7, s7, s7 +; SI-NEXT: s_add_i32 s6, s6, s6 +; SI-NEXT: s_add_i32 s5, s5, s5 +; SI-NEXT: s_add_i32 s4, s4, s4 ; SI-NEXT: s_add_i32 s13, s13, s13 ; SI-NEXT: s_add_i32 s12, s12, s12 -; SI-NEXT: s_and_b32 s3, s3, 0xff +; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_add_i32 s17, s17, s17 ; SI-NEXT: s_add_i32 s11, s11, s11 ; SI-NEXT: s_add_i32 s10, s10, s10 -; SI-NEXT: s_and_b32 s2, s2, 0xff +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_add_i32 s16, s16, s16 ; SI-NEXT: s_add_i32 s9, s9, s9 ; SI-NEXT: s_add_i32 s8, s8, s8 -; SI-NEXT: s_and_b32 s1, s1, 0xff +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_add_i32 s15, s15, s15 -; SI-NEXT: s_add_i32 s6, s6, s6 -; SI-NEXT: s_add_i32 s5, s5, s5 -; SI-NEXT: s_and_b32 s0, s0, 0xff +; SI-NEXT: s_add_i32 s2, s2, s2 +; SI-NEXT: s_add_i32 s1, s1, s1 +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_add_i32 s14, s14, s14 ; SI-NEXT: s_lshl_b32 s13, s13, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xff @@ -1494,36 +1494,36 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; SI-NEXT: s_lshl_b32 s9, s9, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s15, s15, 8 -; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s2, s2, 24 +; SI-NEXT: s_and_b32 s1, s1, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_or_b32 s3, s3, s17 +; SI-NEXT: s_or_b32 s7, s7, s17 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_or_b32 s2, s2, s16 +; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_or_b32 s1, s1, s15 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_or_b32 s0, s0, s14 +; SI-NEXT: s_or_b32 s5, s5, s15 +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s4, s4, s14 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: s_and_b32 s3, s3, 0xffff +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s2, s2, 0xffff +; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s1, s1, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s0, s0, 0xffff -; SI-NEXT: s_or_b32 s3, s3, s12 -; SI-NEXT: s_or_b32 s2, s2, s10 -; SI-NEXT: s_or_b32 s1, s1, s8 -; SI-NEXT: s_or_b32 s0, s0, s5 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s5, s4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s1, s2, s1 +; 
SI-NEXT: s_and_b32 s2, s4, 0xffff +; SI-NEXT: s_or_b32 s4, s7, s12 +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s7, s2, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: amd_kernel_v16i8: @@ -1670,75 +1670,75 @@ entry: define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; SI-LABEL: amd_kernel_v32i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s9, 0 -; SI-NEXT: s_mov_b32 s8, 16 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 16 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s4, 16 -; SI-NEXT: s_lshr_b32 s13, s4, 24 -; SI-NEXT: s_lshr_b32 s14, s5, 16 -; SI-NEXT: s_lshr_b32 s15, s5, 24 -; SI-NEXT: s_lshr_b32 s16, s6, 16 -; SI-NEXT: s_lshr_b32 s17, s6, 24 -; SI-NEXT: s_lshr_b32 s18, s7, 16 -; SI-NEXT: s_lshr_b32 s19, s7, 24 -; SI-NEXT: s_lshr_b32 s20, s0, 16 -; SI-NEXT: s_lshr_b32 s21, s0, 24 -; SI-NEXT: s_lshr_b32 s22, s1, 16 -; SI-NEXT: s_lshr_b32 s23, s1, 24 -; SI-NEXT: s_lshr_b32 s24, s2, 16 -; SI-NEXT: s_lshr_b32 s25, s2, 24 -; SI-NEXT: s_lshr_b32 s26, s3, 16 -; SI-NEXT: s_lshr_b32 s27, s3, 24 -; SI-NEXT: s_bfe_u32 s28, s4, 0x80008 -; SI-NEXT: s_bfe_u32 s29, s5, 0x80008 -; SI-NEXT: s_bfe_u32 s30, s6, 0x80008 -; SI-NEXT: s_bfe_u32 s31, s7, 0x80008 -; SI-NEXT: s_bfe_u32 s33, s0, 0x80008 -; SI-NEXT: s_bfe_u32 s34, s1, 0x80008 -; SI-NEXT: s_bfe_u32 s35, s2, 0x80008 -; SI-NEXT: s_bfe_u32 s36, s3, 0x80008 -; SI-NEXT: s_add_i32 s3, s3, s3 -; SI-NEXT: s_add_i32 s2, s2, s2 -; SI-NEXT: s_add_i32 s1, s1, s1 -; SI-NEXT: s_add_i32 s0, s0, s0 +; SI-NEXT: s_lshr_b32 s12, s8, 16 +; SI-NEXT: s_lshr_b32 s13, s8, 24 +; SI-NEXT: s_lshr_b32 s14, s9, 16 +; SI-NEXT: s_lshr_b32 s15, s9, 24 +; SI-NEXT: s_lshr_b32 s16, s10, 16 +; SI-NEXT: s_lshr_b32 s17, s10, 24 +; SI-NEXT: s_lshr_b32 s18, s11, 16 +; SI-NEXT: s_lshr_b32 s19, s11, 24 +; SI-NEXT: s_lshr_b32 s20, s4, 16 +; SI-NEXT: s_lshr_b32 s21, s4, 24 +; SI-NEXT: s_lshr_b32 s22, s5, 16 +; SI-NEXT: s_lshr_b32 s23, s5, 24 +; SI-NEXT: s_lshr_b32 s24, s6, 16 +; SI-NEXT: s_lshr_b32 s25, s6, 24 +; SI-NEXT: s_lshr_b32 s26, s7, 16 +; SI-NEXT: s_lshr_b32 s27, s7, 24 +; SI-NEXT: s_bfe_u32 s28, s8, 0x80008 +; SI-NEXT: s_bfe_u32 s29, s9, 0x80008 +; SI-NEXT: s_bfe_u32 s30, s10, 0x80008 +; SI-NEXT: s_bfe_u32 s31, s11, 0x80008 +; SI-NEXT: s_bfe_u32 s33, s4, 0x80008 +; SI-NEXT: s_bfe_u32 s34, s5, 0x80008 +; SI-NEXT: s_bfe_u32 s35, s6, 0x80008 +; SI-NEXT: s_bfe_u32 s36, s7, 0x80008 ; SI-NEXT: s_add_i32 s7, s7, s7 ; SI-NEXT: s_add_i32 s6, s6, s6 ; SI-NEXT: s_add_i32 s5, s5, s5 ; SI-NEXT: s_add_i32 s4, s4, s4 +; SI-NEXT: s_add_i32 s11, s11, s11 +; SI-NEXT: s_add_i32 s10, s10, s10 +; SI-NEXT: s_add_i32 s9, s9, s9 +; SI-NEXT: s_add_i32 s8, s8, s8 ; SI-NEXT: s_add_i32 s27, s27, s27 ; SI-NEXT: s_add_i32 s26, s26, s26 -; SI-NEXT: s_and_b32 s3, s3, 0xff +; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_add_i32 s36, s36, s36 ; SI-NEXT: s_add_i32 s25, s25, s25 ; SI-NEXT: s_add_i32 s24, s24, s24 -; SI-NEXT: s_and_b32 s2, s2, 0xff +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_add_i32 s35, s35, s35 ; SI-NEXT: s_add_i32 s23, s23, s23 ; SI-NEXT: s_add_i32 s22, s22, s22 -; 
SI-NEXT: s_and_b32 s1, s1, 0xff +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_add_i32 s34, s34, s34 ; SI-NEXT: s_add_i32 s21, s21, s21 ; SI-NEXT: s_add_i32 s20, s20, s20 -; SI-NEXT: s_and_b32 s0, s0, 0xff +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_add_i32 s33, s33, s33 ; SI-NEXT: s_add_i32 s19, s19, s19 ; SI-NEXT: s_add_i32 s18, s18, s18 -; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_add_i32 s31, s31, s31 ; SI-NEXT: s_add_i32 s17, s17, s17 ; SI-NEXT: s_add_i32 s16, s16, s16 -; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_add_i32 s30, s30, s30 ; SI-NEXT: s_add_i32 s15, s15, s15 ; SI-NEXT: s_add_i32 s14, s14, s14 -; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_add_i32 s29, s29, s29 ; SI-NEXT: s_add_i32 s13, s13, s13 ; SI-NEXT: s_add_i32 s12, s12, s12 -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_add_i32 s28, s28, s28 ; SI-NEXT: s_lshl_b32 s27, s27, 24 ; SI-NEXT: s_and_b32 s26, s26, 0xff @@ -1765,57 +1765,57 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s28, s28, 8 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_or_b32 s3, s3, s36 +; SI-NEXT: s_or_b32 s7, s7, s36 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_or_b32 s2, s2, s35 +; SI-NEXT: s_or_b32 s6, s6, s35 ; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_or_b32 s1, s1, s34 +; SI-NEXT: s_or_b32 s5, s5, s34 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_or_b32 s0, s0, s33 +; SI-NEXT: s_or_b32 s4, s4, s33 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s7, s7, s31 +; SI-NEXT: s_or_b32 s11, s11, s31 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_or_b32 s6, s6, s30 +; SI-NEXT: s_or_b32 s10, s10, s30 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_or_b32 s5, s5, s29 +; SI-NEXT: s_or_b32 s9, s9, s29 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_or_b32 s4, s4, s28 +; SI-NEXT: s_or_b32 s8, s8, s28 ; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: s_and_b32 s3, s3, 0xffff +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: s_and_b32 s2, s2, 0xffff +; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: s_and_b32 s1, s1, 0xffff +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_and_b32 s0, s0, 0xffff +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s3, s3, s26 -; SI-NEXT: s_or_b32 s2, s2, s24 -; SI-NEXT: s_or_b32 s1, s1, s22 -; SI-NEXT: s_or_b32 s7, s7, s18 -; SI-NEXT: s_or_b32 s6, s6, s16 -; SI-NEXT: s_or_b32 s5, s5, s14 -; SI-NEXT: s_or_b32 s4, s4, s12 -; SI-NEXT: s_or_b32 s0, s0, s20 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s26 +; SI-NEXT: s_or_b32 s6, s6, s24 +; SI-NEXT: s_or_b32 s5, s5, s22 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_or_b32 s9, s9, s14 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: s_mov_b32 s8, s9 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s1 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: amd_kernel_v32i8: diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index 91fab927be3afa..3145c1c3e868bc 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -164,32 +164,32 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s19, s3 -; SI-NEXT: s_mov_b32 s22, s2 -; SI-NEXT: s_mov_b32 s23, s3 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s16, s8 -; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s20, s10 -; SI-NEXT: s_mov_b32 s21, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 78c657049fcb2a..2441252178033e 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -520,18 +520,18 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 
s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -557,7 +557,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 855b5fff11fe55..1adf93c8e17a52 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -876,16 +876,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 @@ -916,7 +916,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 ; SI-NEXT: v_min3_u32 v0, v0, v1, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 4202edfbd0eb45..c1e0563a1fda0f 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -353,20 +353,20 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: s_branch .LBB2_4 ; SI-NEXT: 
.LBB2_2: ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB2_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -375,11 +375,11 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -389,17 +389,17 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: .LBB2_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 ; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -947,20 +947,20 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB5_3 ; SI-NEXT: s_branch .LBB5_4 ; SI-NEXT: .LBB5_2: ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB5_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -969,11 +969,11 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc 
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -999,17 +999,17 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: .LBB5_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 ; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -1216,11 +1216,11 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_mov_b32 s37, s38 ; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1248,8 +1248,8 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v11, v2 ; SI-NEXT: v_or_b32_e32 v8, v8, v12 ; SI-NEXT: v_or_b32_e32 v2, v10, v13 @@ -1259,9 +1259,9 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_2: ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -1273,11 +1273,11 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_mov_b32 s37, s38 ; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1305,21 +1305,21 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v0 ; SI-NEXT: v_or_b32_e32 v8, v8, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v10 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: .LBB7_4: ; %exit ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_movk_i32 s34, 0x3800 ; SI-NEXT: v_mov_b32_e32 v8, 0x3d00 ; SI-NEXT: v_mov_b32_e32 v9, 0x3900 @@ -1329,8 +1329,8 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5 -; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4 +; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6 ; SI-NEXT: v_cndmask_b32_e32 v12, v10, v11, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3 @@ -1339,13 +1339,13 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4 -; SI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v5, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 ; SI-NEXT: v_or_b32_e32 v6, v3, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v12, 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -1495,9 +1495,9 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1530,16 +1530,16 @@ 
define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_or_b32_e32 v9, v10, v12 ; SI-NEXT: v_or_b32_e32 v8, v8, v13 ; SI-NEXT: v_or_b32_e32 v10, v7, v14 -; SI-NEXT: v_or_b32_e32 v11, v5, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_or_b32_e32 v11, v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 @@ -1548,12 +1548,12 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_branch .LBB8_4 ; SI-NEXT: .LBB8_2: ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB8_3: ; %T @@ -1562,76 +1562,76 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_mov_b32 s37, s38 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:24 glc ; SI-NEXT: 
s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v8, v7, v10 -; SI-NEXT: v_or_b32_e32 v9, v5, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v8, v6, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: .LBB8_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_mov_b32_e32 v8, 0x3fa00000 ; SI-NEXT: v_mov_b32_e32 v9, 0x3f200000 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v5 ; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4 ; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc @@ -1639,9 +1639,9 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v10 ; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v11 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v12 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v11 ; SI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 27c42a9ea0db60..d0df27ed5d39ac 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3506,25 +3506,25 @@ 
define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_cvt_f16_f32_e32 v12, v24 ; CI-NEXT: v_or_b32_e32 v11, v13, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v30 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 ; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v30 ; CI-NEXT: v_or_b32_e32 v12, v13, v12 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; CI-NEXT: v_or_b32_e32 v13, v15, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; CI-NEXT: v_or_b32_e32 v13, v15, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40 ; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; CI-NEXT: v_or_b32_e32 v14, v15, v14 ; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24 ; CI-NEXT: v_or_b32_e32 v15, v25, v15 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:40 ; CI-NEXT: s_waitcnt vmcnt(11) ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; CI-NEXT: s_waitcnt vmcnt(10) @@ -3549,50 +3549,51 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_or_b32_e32 v18, v19, v18 ; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; CI-NEXT: v_or_b32_e32 v19, v21, v19 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; CI-NEXT: v_or_b32_e32 v20, v21, v20 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; CI-NEXT: v_or_b32_e32 v21, v27, v21 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: s_waitcnt vmcnt(4) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; CI-NEXT: v_or_b32_e32 v24, v25, v24 +; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; CI-NEXT: v_or_b32_e32 v25, v27, v25 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v21, 
v21 +; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_or_b32_e32 v20, v21, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v22 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; CI-NEXT: v_or_b32_e32 v21, v23, v21 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; CI-NEXT: v_or_b32_e32 v24, v25, v24 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; CI-NEXT: v_or_b32_e32 v26, v27, v26 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0 ; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_or_b32_e32 v22, v22, v23 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; CI-NEXT: v_or_b32_e32 v26, v27, v26 @@ -3600,8 +3601,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3613,30 +3612,30 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v26 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 +; CI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; CI-NEXT: v_or_b32_e32 v22, v26, v22 ; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_store_dword v22, v26, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; CI-NEXT: v_or_b32_e32 v23, v23, v27 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 +; CI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; CI-NEXT: v_or_b32_e32 v22, v26, v22 ; CI-NEXT: v_add_i32_e32 v26, vcc, 0x6c, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, 
s[0:3], s32 offset:68 +; CI-NEXT: buffer_store_dword v22, v26, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 @@ -3645,11 +3644,11 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 +; CI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; CI-NEXT: v_or_b32_e32 v22, v26, v22 ; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3663,15 +3662,15 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0 ; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0 ; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0 -; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0 +; CI-NEXT: v_add_i32_e32 v22, vcc, 0x58, v0 ; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v21, vcc, 0x4c, v0 +; CI-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 ; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v20, vcc, 0x50, v0 +; CI-NEXT: buffer_store_dword v25, v20, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v20, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v24, v20, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 ; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 76c40f5962c588..e32d5d773058aa 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -294,16 +294,16 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 ; SI-NEXT: s_mov_b32 s9, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_trunc_f32_e32 v0, s7 -; SI-NEXT: v_trunc_f32_e32 v1, s6 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_trunc_f32_e32 v0, s3 +; SI-NEXT: v_trunc_f32_e32 v1, s2 ; SI-NEXT: v_mul_f32_e64 v2, |v0|, s8 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; SI-NEXT: v_mul_f32_e64 v4, |v1|, s8 @@ -324,7 +324,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; SI-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc ; SI-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: 
s_endpgm ; ; VI-LABEL: fp_to_sint_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 7d4393b653a756..fef9e33f5470e9 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1380,20 +1380,20 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1403,29 +1403,29 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] -; SI-NEXT: v_readfirstlane_b32 s6, v5 -; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014 -; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 -; SI-NEXT: s_mov_b32 s5, 0xfffff -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 -; SI-NEXT: v_not_b32_e32 v6, s4 +; SI-NEXT: v_readfirstlane_b32 s2, v5 +; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014 +; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 +; SI-NEXT: s_mov_b32 s1, 0xfffff +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 +; SI-NEXT: v_not_b32_e32 v6, s0 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 -; SI-NEXT: v_not_b32_e32 v7, s5 +; SI-NEXT: v_not_b32_e32 v7, s1 ; SI-NEXT: v_and_b32_e32 v5, v5, v7 -; SI-NEXT: s_and_b32 s4, s6, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_and_b32 s0, s2, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc -; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_mov_b32_e32 v7, s0 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v7, s2 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f64: @@ -1605,20 +1605,20 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: 
s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1628,29 +1628,29 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] -; SI-NEXT: v_readfirstlane_b32 s6, v5 -; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014 -; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 -; SI-NEXT: s_mov_b32 s5, 0xfffff -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 -; SI-NEXT: v_not_b32_e32 v6, s4 +; SI-NEXT: v_readfirstlane_b32 s2, v5 +; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014 +; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 +; SI-NEXT: s_mov_b32 s1, 0xfffff +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 +; SI-NEXT: v_not_b32_e32 v6, s0 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 -; SI-NEXT: v_not_b32_e32 v7, s5 +; SI-NEXT: v_not_b32_e32 v7, s1 ; SI-NEXT: v_and_b32_e32 v5, v5, v7 -; SI-NEXT: s_and_b32 s4, s6, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_and_b32 s0, s2, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc -; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_mov_b32_e32 v7, s0 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v7, s2 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f64: @@ -2626,23 +2626,23 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-LABEL: frem_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, 
v3, v1 +; SI-NEXT: v_div_scale_f32 v5, s[0:1], v3, v3, v1 ; SI-NEXT: v_rcp_f32_e32 v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 @@ -2657,7 +2657,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_trunc_f32_e32 v4, v4 ; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 ; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; SI-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 @@ -2671,28 +2671,28 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v3, v3 ; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s8, s4 +; CI-NEXT: s_mov_b32 s9, s5 ; CI-NEXT: s_mov_b32 s4, s6 ; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: s_mov_b32 s6, s10 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_mov_b32 s3, s11 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; CI-NEXT: v_div_scale_f32 v5, s[0:1], v3, v3, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 ; CI-NEXT: v_rcp_f32_e32 v6, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -2707,7 +2707,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; CI-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -2722,7 +2722,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f32: @@ -3682,22 +3682,22 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; 
CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s8, s4 +; CI-NEXT: s_mov_b32 s9, s5 ; CI-NEXT: s_mov_b32 s4, s6 ; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: s_mov_b32 s6, s10 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_mov_b32 s3, s11 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3] +; CI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] ; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -3711,7 +3711,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1] +; CI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] ; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] @@ -3725,7 +3725,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] ; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f64: diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index 13e588dffaf5c1..5703f0771e96d4 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -4316,20 +4316,20 @@ entry: define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 -; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 +; SDAG-IEEE-NEXT: s_mov_b32 s11, 0xf000 +; SDAG-IEEE-NEXT: s_mov_b32 s10, -1 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s11, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s7, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s7 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s10, v1 -; SDAG-IEEE-NEXT: s_mov_b32 s4, s8 -; SDAG-IEEE-NEXT: s_mov_b32 s5, s9 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 +; SDAG-IEEE-NEXT: s_mov_b32 s8, s4 +; SDAG-IEEE-NEXT: s_mov_b32 s9, s5 ; SDAG-IEEE-NEXT: v_add_i32_e32 v4, vcc, -1, v3 ; SDAG-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v2 ; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 @@ -4340,8 +4340,8 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 ; SDAG-IEEE-NEXT: 
v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s10 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s6 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v5, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v0 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 @@ -4359,7 +4359,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SDAG-IEEE-NEXT: s_endpgm ; ; GISEL-IEEE-LABEL: elim_redun_check_v2: @@ -4522,20 +4522,20 @@ entry: define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 -; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 +; SDAG-IEEE-NEXT: s_mov_b32 s11, 0xf000 +; SDAG-IEEE-NEXT: s_mov_b32 s10, -1 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s11, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s7, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s7 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s10, v1 -; SDAG-IEEE-NEXT: s_mov_b32 s4, s8 -; SDAG-IEEE-NEXT: s_mov_b32 s5, s9 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 +; SDAG-IEEE-NEXT: s_mov_b32 s8, s4 +; SDAG-IEEE-NEXT: s_mov_b32 s9, s5 ; SDAG-IEEE-NEXT: v_add_i32_e32 v4, vcc, -1, v3 ; SDAG-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v2 ; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 @@ -4546,8 +4546,8 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s10 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s6 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v5, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v0 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 @@ -4565,7 +4565,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SDAG-IEEE-NEXT: s_endpgm ; ; GISEL-IEEE-LABEL: elim_redun_check_v2_ult: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 8dd73c5ab32fbc..32b9b7c5c72612 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2565,26 +2565,26 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x 
i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX10-NEXT: s_clause 0x14 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:136 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:156 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:152 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:156 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:160 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:120 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:116 @@ -2636,33 +2636,33 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:284 -; GFX10-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:280 -; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:276 -; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:272 -; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 
offen offset:268 -; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:264 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:260 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:256 +; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:284 +; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:280 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:276 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:272 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:268 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:264 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:260 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:256 ; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:252 ; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen offset:244 -; GFX10-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:248 +; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:244 +; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:240 ; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:220 ; GFX10-NEXT: s_waitcnt vmcnt(16) ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:216 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:212 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:208 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:204 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:192 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:188 +; GFX10-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen offset:200 +; GFX10-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:188 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:184 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:180 @@ -2697,7 +2697,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x10 +; GFX11-NEXT: s_clause 0xe ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 @@ -2705,59 +2705,47 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 ; 
GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 ; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s32 offset:224 -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164 ; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:60 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 
v43, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72 -; GFX11-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v26, v23 -; GFX11-NEXT: v_dual_mov_b32 v25, v22 :: v_dual_mov_b32 v24, v21 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:104 ; GFX11-NEXT: s_add_i32 s1, s0, 0x110 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 -; GFX11-NEXT: v_dual_mov_b32 v23, v20 :: v_dual_mov_b32 v22, v19 -; GFX11-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v17 -; GFX11-NEXT: v_dual_mov_b32 v19, v16 :: v_dual_mov_b32 v18, v15 -; GFX11-NEXT: v_dual_mov_b32 v17, v14 :: v_dual_mov_b32 v16, v13 -; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v14, v11 -; GFX11-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v12, v9 -; GFX11-NEXT: v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v10, v7 -; GFX11-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:104 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v1, off, s32 offset:88 ; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 ; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 ; GFX11-NEXT: s_add_i32 s34, s0, 0xc0 @@ -2772,59 +2760,61 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: s_add_i32 s43, s0, 48 ; GFX11-NEXT: s_add_i32 s44, s0, 32 ; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: scratch_store_b128 off, v[45:48], s1 +; GFX11-NEXT: scratch_store_b128 off, v[32:35], s1 ; GFX11-NEXT: s_add_i32 s1, s0, 0x100 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:108 ; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: scratch_store_b128 off, v[56:59], s1 -; GFX11-NEXT: s_clause 0xc -; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v1, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v4, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_store_b128 off, v[36:39], s1 +; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:84 +; GFX11-NEXT: 
scratch_load_b32 v56, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 ; GFX11-NEXT: s_add_i32 s0, s0, 16 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b128 off, v[33:36], s1 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b128 off, v[60:63], s2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s3 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: scratch_store_b128 off, v[43:46], s35 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: scratch_store_b128 off, v[39:42], s36 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s32 offset:224 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 -; GFX11-NEXT: scratch_store_b128 off, v[59:62], s2 -; GFX11-NEXT: scratch_store_b128 off, v[4:7], s3 -; GFX11-NEXT: scratch_store_b128 off, v[41:44], s34 -; GFX11-NEXT: scratch_store_b128 off, v[37:40], s35 -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s36 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s37 -; GFX11-NEXT: scratch_store_b128 off, v[33:36], s38 -; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 offset:224 ; 16-byte Folded Reload -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s39 -; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 offset:240 ; 16-byte Folded Reload -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s40 -; GFX11-NEXT: scratch_store_b128 off, v[24:27], s41 -; GFX11-NEXT: scratch_store_b128 off, v[20:23], s42 -; GFX11-NEXT: scratch_store_b128 off, v[16:19], s43 -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s44 -; GFX11-NEXT: scratch_store_b128 off, v[8:11], s0 +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s39 +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s40 +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s41 +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s42 +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43 +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s44 +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 ; GFX11-NEXT: s_clause 0xe -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:192 ; GFX11-NEXT: scratch_load_b32 v46, 
off, s32 offset:196 ; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 ; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 @@ -2981,35 +2971,35 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:764 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:768 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:772 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:776 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:780 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:784 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:788 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:516 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:776 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:780 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:528 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:784 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:532 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:788 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:792 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:540 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:796 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:544 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:516 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:520 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:524 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:528 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:532 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:536 +; GFX9-NEXT: 
buffer_load_dword v8, off, s[0:3], s33 offset:540 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:544 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 @@ -3068,23 +3058,30 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:124 ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -3221,8 +3218,10 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: s_clause 0x28 -; GFX10-NEXT: buffer_load_dword v9, 
off, s[0:3], s33 offset:636 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill +; GFX10-NEXT: s_clause 0x20 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:648 @@ -3255,38 +3254,37 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:760 ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:764 ; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:768 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:772 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:776 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:780 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:784 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:788 -; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:772 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:776 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:780 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:784 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:788 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:792 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:796 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill -; 
GFX10-NEXT: s_clause 0x15 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v0, 24 +; GFX10-NEXT: s_clause 0x1d +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:516 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:520 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:524 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:528 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:532 +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:536 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:540 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:544 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 ; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 @@ -3309,9 +3307,10 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 ; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 -; GFX10-NEXT: v_mov_b32_e32 v0, 24 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:12 @@ -3344,24 +3343,29 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:124 ; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:132 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 -; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 -; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 -; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 -; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 -; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 -; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1540 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1544 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1548 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1552 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1556 -; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1560 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1564 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:1568 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload ; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0xe @@ -3398,8 +3402,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v32, s33 offset:1536 ; 4-byte Folded Spill +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3409,8 +3413,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:60 +; GFX11-NEXT: s_clause 0xe ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:56 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:52 ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:48 @@ -3450,7 +3453,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_load_b64 s[46:47], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s3, s32, 16 ; GFX11-NEXT: s_add_i32 s0, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v32, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s3 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 @@ -3469,109 +3472,113 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0 ; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0 ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 -; GFX11-NEXT: v_writelane_b32 v32, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX11-NEXT: s_clause 0xb -; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:624 -; GFX11-NEXT: scratch_load_b128 v[26:29], off, s33 offset:640 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 +; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640 +; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v32, v48 +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 ; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 -; GFX11-NEXT: scratch_load_b128 v[40:43], off, s33 offset:688 -; GFX11-NEXT: scratch_load_b128 v[44:47], 
off, s33 offset:704 -; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:720 -; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:736 +; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:688 +; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:704 +; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:720 +; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:736 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:752 ; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 ; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 -; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:512 -; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 -; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: v_dual_mov_b32 v31, v50 :: v_dual_mov_b32 v30, v49 +; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_dual_mov_b32 v49, v40 :: v_dual_mov_b32 v50, v41 -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_dual_mov_b32 v41, v56 :: v_dual_mov_b32 v40, v47 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_dual_mov_b32 v47, v2 :: v_dual_mov_b32 v2, v5 -; GFX11-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v27 +; GFX11-NEXT: v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v12 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1588 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:528 -; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:544 -; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:560 -; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:576 -; GFX11-NEXT: v_dual_mov_b32 v39, v28 :: v_dual_mov_b32 v28, v29 -; GFX11-NEXT: v_dual_mov_b32 v29, v48 :: v_dual_mov_b32 v48, v55 -; GFX11-NEXT: v_dual_mov_b32 v55, v46 :: v_dual_mov_b32 v46, v1 -; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, v7 -; GFX11-NEXT: v_mov_b32_e32 v5, v8 -; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v56, v59 +; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:528 +; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 +; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 +; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 +; GFX11-NEXT: v_mov_b32_e32 v56, v63 +; GFX11-NEXT: v_mov_b32_e32 v12, v15 +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v15, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_mov_b32_e32 v8, v15 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_dual_mov_b32 v10, v17 :: v_dual_mov_b32 v15, v22 +; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v19 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_mov_b32_e32 v10, v21 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1572 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:592 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 
v[28:31], off, s33 offset:592 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1556 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:608 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1540 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b128 off, v[36:39], s32 -; GFX11-NEXT: v_dual_mov_b32 v37, v52 :: v_dual_mov_b32 v38, v53 -; GFX11-NEXT: v_mov_b32_e32 v39, v54 -; GFX11-NEXT: v_dual_mov_b32 v53, v44 :: v_dual_mov_b32 v54, v45 -; GFX11-NEXT: v_dual_mov_b32 v44, v63 :: v_dual_mov_b32 v45, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v9 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 +; GFX11-NEXT: v_mov_b32_e32 v32, v36 +; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 +; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51 +; GFX11-NEXT: v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41 +; GFX11-NEXT: v_mov_b32_e32 v50, v42 +; GFX11-NEXT: v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v13 +; GFX11-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9 ; GFX11-NEXT: scratch_store_b32 off, v11, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x90 -; GFX11-NEXT: v_dual_mov_b32 v36, v51 :: v_dual_mov_b32 v51, v42 -; GFX11-NEXT: v_mov_b32_e32 v52, v43 +; GFX11-NEXT: v_mov_b32_e32 v51, v43 +; GFX11-NEXT: v_mov_b32_e32 v41, v59 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 +; GFX11-NEXT: v_mov_b32_e32 v7, v18 ; GFX11-NEXT: s_add_i32 s0, s32, 0x80 -; GFX11-NEXT: v_mov_b32_e32 v42, v57 +; GFX11-NEXT: v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 -; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v5, v12 +; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20 ; GFX11-NEXT: s_add_i32 s0, s32, 0x70 -; GFX11-NEXT: v_mov_b32_e32 v43, v58 -; GFX11-NEXT: v_dual_mov_b32 v57, v60 :: v_dual_mov_b32 v58, v61 -; GFX11-NEXT: scratch_store_b128 off, v[44:47], s0 +; GFX11-NEXT: v_mov_b32_e32 v5, v16 +; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x6c -; GFX11-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v7, v14 +; GFX11-NEXT: v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v11, v22 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x60 -; GFX11-NEXT: v_mov_b32_e32 v9, v16 +; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 ; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x50 -; GFX11-NEXT: v_mov_b32_e32 v11, v18 -; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0 +; GFX11-NEXT: v_mov_b32_e32 v13, v24 +; GFX11-NEXT: scratch_store_b128 off, v[41:44], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_dual_mov_b32 v12, v19 :: v_dual_mov_b32 v13, v20 +; GFX11-NEXT: v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47 ; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 48 -; GFX11-NEXT: v_mov_b32_e32 v14, v21 +; GFX11-NEXT: v_mov_b32_e32 v15, v26 ; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 32 -; GFX11-NEXT: v_mov_b32_e32 v16, v23 +; GFX11-NEXT: v_mov_b32_e32 v16, v27 ; GFX11-NEXT: 
scratch_store_b128 off, v[36:39], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s0 -; GFX11-NEXT: v_mov_b32_e32 v29, v33 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 +; GFX11-NEXT: v_mov_b32_e32 v30, v46 +; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540 ; GFX11-NEXT: s_add_i32 s0, s33, 0x400 -; GFX11-NEXT: v_dual_mov_b32 v30, v34 :: v_dual_mov_b32 v31, v35 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 42 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: s_clause 0xe ; GFX11-NEXT: scratch_load_b32 v63, off, s33 ; GFX11-NEXT: scratch_load_b32 v62, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v61, off, s33 offset:8 @@ -3587,11 +3594,10 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:48 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:52 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:56 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:60 -; GFX11-NEXT: v_readlane_b32 s31, v32, 1 -; GFX11-NEXT: v_readlane_b32 s30, v32, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v32, off, s33 offset:1536 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 ; GFX11-NEXT: s_mov_b32 s33, s45 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index a6d56df00c8627..f9fb08af4b45f2 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4669,21 +4669,21 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB73_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_max_i32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_max_i32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB73_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4692,7 +4692,7 @@ define amdgpu_kernel 
void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_mov_b32 s4, s2
 ; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: atomic_max_i32_ret_addr64_offset:
@@ -4885,21 +4885,21 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v2, s6
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: .LBB75_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_max_i32_e32 v0, s8, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: v_max_i32_e32 v1, s8, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT: s_cbranch_execnz .LBB75_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4908,7 +4908,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_mov_b32 s4, s2
 ; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: atomic_max_i32_ret_addr64:
@@ -5808,21 +5808,21 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v2, s6
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_max_u32_e32 v0, s8, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: v_max_u32_e32 v1, s8, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT: s_cbranch_execnz .LBB85_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5831,7 +5831,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_mov_b32 s4, s2
 ; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
@@ -5926,21 +5926,21 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT: s_mov_b64 s[0:1], 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s6
+;
SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_max_u32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_max_u32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5949,7 +5949,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i32_ret_addr64: @@ -7556,21 +7556,21 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_min_i32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7579,7 +7579,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: @@ -7759,21 +7759,21 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB106_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_min_i32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB106_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7782,7 +7782,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32_ret_addr64: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 005cfe73671bd2..984057048eb87c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -5011,29 +5011,29 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB73_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB73_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5042,7 +5042,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: @@ -5255,29 +5255,29 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB75_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB75_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5286,7 +5286,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i64_ret_addr64: @@ -6333,29 +6333,29 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6364,7 +6364,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_mov_b32 s6, -1 ; 
SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -6462,29 +6462,29 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6493,7 +6493,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i64_ret_addr64: @@ -8374,29 +8374,29 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, 
s[8:11], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8405,7 +8405,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: @@ -8607,29 +8607,29 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB106_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB106_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8638,7 +8638,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i64_ret_addr64: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 429bdd805ec5e1..b51c50a3e1eac1 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -1169,25 +1169,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; 
GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1378,25 +1378,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2429,25 +2429,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: 
s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2638,25 +2638,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4393,25 +4393,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s5, 0xc3300000 ; 
GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4602,25 +4602,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index f05a420a1b0a26..cd889889de10cd 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -1265,25 +1265,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1474,25 +1474,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2525,25 +2525,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2734,25 +2734,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4577,25 +4577,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4786,25 +4786,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index e2d55990473c09..4371eb6c3ee92f 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1856,92 +1856,92 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v16, s3 -; CI-NEXT: v_mov_b32_e32 v15, s2 +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v18, s3 -; CI-NEXT: v_mov_b32_e32 v17, s2 +; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_mov_b32_e32 v18, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x70 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v12, s1 -; CI-NEXT: v_mov_b32_e32 v11, s0 +; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; CI-NEXT: s_nop 0 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v19 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; CI-NEXT: v_mov_b32_e32 v13, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; CI-NEXT: v_mov_b32_e32 v16, s3 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; CI-NEXT: 
v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_mov_b32_e32 v15, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x50 -; CI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] -; CI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] -; CI-NEXT: v_cvt_f32_f16_e32 v12, v18 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x60 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x50 ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v17 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v19 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v20, s3 +; CI-NEXT: v_mov_b32_e32 v21, s3 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v19, s2 +; CI-NEXT: v_mov_b32_e32 v20, s2 ; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; @@ -1951,12 +1951,12 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, 
s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 @@ -1984,41 +1984,41 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_add_u32 s0, s0, 0x60 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v22, v4 -; VI-NEXT: v_cvt_f32_f16_sdwa v23, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; VI-NEXT: v_cvt_f32_f16_sdwa v25, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v21, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; VI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v20, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v21, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; VI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; VI-NEXT: v_cvt_f32_f16_sdwa v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v26, v6 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: v_cvt_f32_f16_sdwa v27, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v20 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v21 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v23 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; VI-NEXT: v_cvt_f32_f16_sdwa v28, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v29, v4 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v31 +; VI-NEXT: v_cvt_f32_f16_sdwa v30, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25 -; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v29 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v23 +; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v31 ; VI-NEXT: flat_store_dwordx4 v[14:15], 
v[0:3] ; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32 +; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v29 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v30 +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 ; VI-NEXT: v_mov_b32_e32 v21, s3 ; VI-NEXT: v_mov_b32_e32 v23, s1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v26 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll index d6f7a92af9dcb6..0f13942e236766 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -47,8 +47,8 @@ entry: } ; CHECK: .name: num_spilled_sgprs -; GFX700: .sgpr_spill_count: 38 -; GFX803: .sgpr_spill_count: 22 +; GFX700: .sgpr_spill_count: 36 +; GFX803: .sgpr_spill_count: 20 ; GFX900: .sgpr_spill_count: 48 ; GFX1010: .sgpr_spill_count: 48 ; CHECK: .symbol: num_spilled_sgprs.kd diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 8c53d2671de3f6..fcbb3512777071 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2712,12 +2712,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 12 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 @@ -2732,12 +2732,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v9 +; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 @@ -2751,8 +2751,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v2 -; GFX9-NEXT: 
v_lshlrev_b16_e32 v2, 12, v10 +; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 @@ -2760,38 +2760,38 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 -; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX9-NEXT: v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_or_b32_e32 v4, v4, v1 -; GFX9-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, v7, v3 -; GFX9-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v6 +; GFX9-NEXT: v_add_u16_e32 v2, v7, v4 ; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v16, v18, v1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 -; GFX9-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-NEXT: 
v_add_u16_e32 v0, v1, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v5 +; GFX9-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 +; GFX9-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: @@ -2804,12 +2804,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 12 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 @@ -2824,12 +2824,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v9 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 @@ -2843,8 +2843,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v10 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 @@ -2852,38 +2852,38 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-DL-NEXT: 
v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX9-DL-NEXT: v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v1 -; GFX9-DL-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v3, v7, v3 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v6 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v7, v4 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v16, v18, v1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 +; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index f5d41b246b1b8c..397283761ecbe7 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -52,17 +52,15 @@ 
define <2 x i64> @f1() #0 { define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { ; GFX11-LABEL: f2: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX11-NEXT: s_load_b32 s24, s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24 ; GFX11-NEXT: s_mov_b32 s12, s13 ; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-NEXT: s_mov_b32 s16, 0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_mov_b32 s0, -1 -; GFX11-NEXT: s_mov_b32 s18, exec_lo +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0 @@ -70,63 +68,67 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 ; GFX11-NEXT: ; %bb.1: ; %bb14 -; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c -; GFX11-NEXT: s_mov_b32 s19, 0 +; GFX11-NEXT: s_load_b128 s[16:19], s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitcmp1_b32 s21, 0 +; GFX11-NEXT: s_bitcmp1_b32 s17, 0 ; GFX11-NEXT: s_cselect_b32 s25, -1, 0 -; GFX11-NEXT: s_bitcmp0_b32 s21, 0 +; GFX11-NEXT: s_bitcmp0_b32 s17, 0 +; GFX11-NEXT: s_mov_b32 s17, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX11-NEXT: ; %bb.2: ; %bb15 -; GFX11-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-NEXT: s_addc_u32 s9, s17, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 +; GFX11-NEXT: s_add_u32 s8, s4, 0x58 +; GFX11-NEXT: s_addc_u32 s9, s5, 0 +; GFX11-NEXT: s_getpc_b64 s[20:21] +; GFX11-NEXT: s_add_u32 s20, s20, f0@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s21, s21, f0@gotpcrel32@hi+12 ; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s21, s14 +; GFX11-NEXT: s_load_b64 s[28:29], s[20:21], 0x0 +; GFX11-NEXT: s_mov_b64 s[20:21], s[4:5] +; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s27, s14 ; GFX11-NEXT: s_mov_b32 s14, s15 +; GFX11-NEXT: s_mov_b64 s[22:23], s[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_mov_b32 s14, s21 -; GFX11-NEXT: s_mov_b32 s1, -1 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 -; GFX11-NEXT: s_cbranch_vccz .LBB2_4 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[28:29] +; GFX11-NEXT: s_mov_b32 s14, s27 +; GFX11-NEXT: s_mov_b64 s[0:1], s[22:23] +; GFX11-NEXT: s_mov_b64 s[4:5], s[20:21] +; GFX11-NEXT: s_mov_b32 s13, -1 +; GFX11-NEXT: s_cbranch_execz .LBB2_4 ; GFX11-NEXT: s_branch .LBB2_12 ; GFX11-NEXT: .LBB2_3: -; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s13, 0 ; GFX11-NEXT: .LBB2_4: ; %bb16 -; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x54 -; GFX11-NEXT: s_bitcmp1_b32 s23, 0 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_and_b32 s3, s23, 1 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x54 +; GFX11-NEXT: s_bitcmp1_b32 s19, 0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_and_b32 s9, s19, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitcmp1_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11-NEXT: s_bitcmp1_b32 s8, 0 +; GFX11-NEXT: s_mov_b32 s8, -1 +; GFX11-NEXT: 
s_cselect_b32 s19, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader -; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-NEXT: s_load_b128 s[20:23], s[4:5], 0x44 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_hi_u32 s2, s29, s28 -; GFX11-NEXT: s_mul_i32 s3, s29, s28 +; GFX11-NEXT: s_mul_hi_u32 s8, s21, s20 +; GFX11-NEXT: s_mul_i32 s9, s21, s20 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, 1 -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s8, s9, 1 +; GFX11-NEXT: s_mov_b32 s9, 0 +; GFX11-NEXT: v_readfirstlane_b32 s8, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s2, 1 -; GFX11-NEXT: s_lshr_b32 s2, s2, s30 +; GFX11-NEXT: s_or_b32 s8, s8, 1 +; GFX11-NEXT: s_lshr_b32 s8, s8, s22 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s2, s2, s22 -; GFX11-NEXT: s_mul_i32 s2, s2, s20 +; GFX11-NEXT: s_mul_i32 s8, s8, s18 +; GFX11-NEXT: s_mul_i32 s8, s8, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s24, s2 -; GFX11-NEXT: s_lshl_b64 s[20:21], s[2:3], 1 +; GFX11-NEXT: s_or_b32 s8, s24, s8 +; GFX11-NEXT: s_lshl_b64 s[20:21], s[8:9], 1 ; GFX11-NEXT: global_load_u16 v1, v2, s[20:21] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 @@ -134,61 +136,62 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_cmp_ne_u16_e64 s2, s3, 0 +; GFX11-NEXT: v_cmp_ne_u16_e64 s8, s9, 0 ; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: s_and_b32 vcc_lo, s8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, v3, s0 +; GFX11-NEXT: s_and_b32 vcc_lo, s19, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, v3, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-NEXT: v_readfirstlane_b32 s8, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_bitcmp1_b32 s2, 0 -; GFX11-NEXT: s_cselect_b32 s2, 0x100, 0 +; GFX11-NEXT: s_bitcmp1_b32 s8, 0 +; GFX11-NEXT: s_cselect_b32 s8, 0x100, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s3, s2, s3 +; GFX11-NEXT: s_or_b32 s9, s8, s9 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow -; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: .LBB2_8: ; %Flow12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s8 ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: -; GFX11-NEXT: s_xor_b32 s0, s8, -1 +; GFX11-NEXT: s_xor_b32 s2, s19, -1 ; GFX11-NEXT: .LBB2_10: ; %bb17 ; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB2_10 ; GFX11-NEXT: ; %bb.11: ; %Flow6 -; GFX11-NEXT: s_mov_b32 s19, -1 +; GFX11-NEXT: s_mov_b32 s17, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s3, s1, exec_lo -; GFX11-NEXT: s_or_not1_b32 s0, s19, exec_lo +; GFX11-NEXT: s_and_b32 s16, s13, exec_lo +; GFX11-NEXT: s_or_not1_b32 s2, s17, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s18 -; GFX11-NEXT: s_and_saveexec_b32 s18, s0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: s_and_saveexec_b32 s3, s2 ; GFX11-NEXT: s_cbranch_execz .LBB2_15 ; GFX11-NEXT: ; %bb.14: ; %bb43 -; GFX11-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-NEXT: s_addc_u32 s9, s17, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 +; GFX11-NEXT: s_add_u32 s8, s4, 0x58 +; GFX11-NEXT: s_addc_u32 s9, s5, 0 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12 ; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[18:19], s[4:5], 0x0 +; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s3, s3, exec_lo +; GFX11-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX11-NEXT: s_or_b32 s16, s16, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s18 -; GFX11-NEXT: s_and_saveexec_b32 s0, s3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: s_and_saveexec_b32 s0, s16 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable ; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index ae470efc92feee..b221efa1762b35 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -872,8 +872,8 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xa4 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x114 -; GCN-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x104 -; GCN-NEXT: s_load_dwordx8 s[24:31], s[0:1], 0xe4 +; GCN-NEXT: s_load_dwordx4 s[28:31], s[0:1], 0x104 +; GCN-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -897,18 +897,18 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v13, s17 ; GCN-NEXT: v_mov_b32_e32 v14, s18 ; GCN-NEXT: v_mov_b32_e32 v15, s19 -; GCN-NEXT: v_mov_b32_e32 v16, s24 -; GCN-NEXT: v_mov_b32_e32 v17, s25 -; GCN-NEXT: v_mov_b32_e32 v18, s26 -; GCN-NEXT: v_mov_b32_e32 v19, s27 -; GCN-NEXT: v_mov_b32_e32 v20, s28 -; GCN-NEXT: v_mov_b32_e32 v21, s29 -; GCN-NEXT: v_mov_b32_e32 v22, s30 -; GCN-NEXT: v_mov_b32_e32 v23, s31 -; GCN-NEXT: v_mov_b32_e32 v24, s20 -; GCN-NEXT: v_mov_b32_e32 v25, s21 -; GCN-NEXT: v_mov_b32_e32 v26, s22 -; GCN-NEXT: v_mov_b32_e32 v27, s23 +; GCN-NEXT: v_mov_b32_e32 v16, s20 +; GCN-NEXT: v_mov_b32_e32 v17, s21 +; GCN-NEXT: v_mov_b32_e32 v18, s22 +; 
GCN-NEXT: v_mov_b32_e32 v19, s23 +; GCN-NEXT: v_mov_b32_e32 v20, s24 +; GCN-NEXT: v_mov_b32_e32 v21, s25 +; GCN-NEXT: v_mov_b32_e32 v22, s26 +; GCN-NEXT: v_mov_b32_e32 v23, s27 +; GCN-NEXT: v_mov_b32_e32 v24, s28 +; GCN-NEXT: v_mov_b32_e32 v25, s29 +; GCN-NEXT: v_mov_b32_e32 v26, s30 +; GCN-NEXT: v_mov_b32_e32 v27, s31 ; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index f98b41ba199bd7..4fa5b6cf843c1f 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2798,80 +2798,80 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] -; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6 ; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2 +; GFX9-NEXT: v_perm_b32 v3, v3, v10, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 3 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0 -; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 +; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 14 -; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 12 -; 
GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 +; GFX9-NEXT: v_perm_b32 v0, v10, v0, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 13 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 10 -; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2 +; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 11 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 8 -; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 +; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 9 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 -; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX9-NEXT: v_perm_b32 v5, v10, v5, s2 +; GFX9-NEXT: v_perm_b32 v4, v9, v4, s2 +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16f16_dynamic: @@ -2975,20 +2975,20 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 +; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3] +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; CI-NEXT: s_cmp_eq_u32 s5, 15 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 14 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -2996,109 +2996,109 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 12 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, 
v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1] +; CI-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 11 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 10 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 9 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 8 ; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 7 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 6 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_or_b32_e32 v10, v10, v11 -; CI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc +; CI-NEXT: v_or_b32_e32 v3, v3, v11 +; CI-NEXT: v_cndmask_b32_e32 v11, v16, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; CI-NEXT: v_cndmask_b32_e64 v12, v12, v10, s[2:3] +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; 
CI-NEXT: v_or_b32_e32 v2, v2, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_or_b32_e32 v6, v6, v11 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; CI-NEXT: s_cmp_eq_u32 s5, 3 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v9, v9, v12 +; CI-NEXT: v_or_b32_e32 v2, v2, v12 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_or_b32_e32 v7, v7, v12 -; CI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_or_b32_e32 v0, v0, v12 +; CI-NEXT: v_cndmask_b32_e32 v12, v17, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 1 -; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; CI-NEXT: v_or_b32_e32 v8, v8, v13 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; CI-NEXT: v_or_b32_e32 v1, v1, v13 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; CI-NEXT: v_or_b32_e32 v1, v1, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; CI-NEXT: v_or_b32_e32 v3, v3, v13 -; CI-NEXT: v_or_b32_e32 v0, v0, v6 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: v_or_b32_e32 v5, v5, v10 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; CI-NEXT: v_or_b32_e32 v7, v7, v13 +; CI-NEXT: v_or_b32_e32 v4, v4, v10 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; CI-NEXT: s_nop 0 -; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v8 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; ; GFX11-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 25de4a61f50f41..84fbea5a722eaf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -30,86 +30,86 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376 ; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 +; 
GCN-NEXT: ds_read_b128 a[156:159], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(8) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(4) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[64:67], v1 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 -; GCN-NEXT: 
ds_write_b128 v0, a[152:155] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[128:131] +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[64:67] ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:24592 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 ; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 +; 
GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 ; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 @@ -168,48 +168,48 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:49152 ; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], 
v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] +; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:57456 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:57440 +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:57424 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:57408 +; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:57344 +; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:57360 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:57376 +; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 @@ -219,38 +219,38 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: 
ds_write_b128 v0, a[88:91] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; GCN-NEXT: s_endpgm entry: call void @llvm.amdgcn.iglp.opt(i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 288616086eb8e5..2cc729f411c827 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -618,31 +618,31 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v1 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192 +; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v1 +; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 ; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[92:95], v1 
offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:24576 ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:49264 ; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:49248 ; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:49232 @@ -652,52 +652,52 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:49168 ; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:49152 ; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440 -; GCN-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 +; GCN-NEXT: ds_read_b128 a[92:95], v2 offset:57456 +; GCN-NEXT: ds_read_b128 a[88:91], v2 offset:57440 +; GCN-NEXT: ds_read_b128 a[84:87], v2 offset:57424 +; GCN-NEXT: ds_read_b128 a[80:83], v2 offset:57408 +; GCN-NEXT: ds_read_b128 a[64:67], v2 offset:57344 +; GCN-NEXT: ds_read_b128 a[68:71], v2 offset:57360 +; GCN-NEXT: ds_read_b128 a[72:75], v2 offset:57376 +; GCN-NEXT: ds_read_b128 a[76:79], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 2.0 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] ; GCN-NEXT: s_waitcnt lgkmcnt(8) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 4 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[0:3] +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[128:131] 
; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:16400 ; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 ; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 ; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 @@ -706,14 +706,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 ; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 ; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0) ; GCN-NEXT: s_endpgm @@ -724,31 +724,31 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], 
v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:8192 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:24576 ; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:49264 ; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:49248 ; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:49232 @@ -758,52 +758,52 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:49168 ; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:49152 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 
a[136:139], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v2 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v2 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v2 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v2 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v2 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v2 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v2 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v2 offset:57392 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 4 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:8208 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 
v0, a[20:23] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:16416 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:16400 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:24672 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:24688 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:24640 @@ -812,14 +812,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:24624 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:24576 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:24592 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm @@ -863,68 +863,68 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v0, s0, v1 -; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v0 -; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v1 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 
+; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v1, s1, v1 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:112 -; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:96 -; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:80 -; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:64 -; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:48 -; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:32 -; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:16 -; GCN-NEXT: ds_write_b128 v1, a[0:3] -; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[0:3] +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:8288 -; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:8304 -; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:8256 -; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:8272 -; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:8224 -; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:8240 -; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:8192 -; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:8208 -; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:24688 -; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:24672 -; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:24656 -; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:24640 -; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:24624 -; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:24608 -; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:24592 -; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; 
GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -933,47 +933,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:16480 -; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:16496 -; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:16448 -; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:16464 -; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:16416 -; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:16432 -; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:16384 -; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:16400 -; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:49264 -; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:49248 -; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:49232 -; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:49216 -; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:49200 -; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:49184 -; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:49168 -; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:24672 -; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:24688 -; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:24640 -; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:24656 -; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:24608 -; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:24624 -; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:24576 -; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:24592 -; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:57456 -; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:57440 -; GCN-NEXT: ds_read_b128 a[20:23], v0 
offset:57424 -; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:57408 -; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:57344 -; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:57360 -; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:57376 -; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -982,82 +982,82 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:32864 -; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:32880 -; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:32848 -; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:32800 -; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:32816 -; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:32768 -; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s0, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 +; 
EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s1, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:112 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:24672 -; 
EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:24576 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -1066,47 +1066,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:16416 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:16400 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:49168 +; EXACTCUTOFF-NEXT: 
ds_read_b128 a[0:3], v1 offset:49152 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:57392 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -1115,14 +1115,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, 
a[20:23] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 0c49338bfcab93..fa0c723c64e36e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -707,42 +707,42 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s8 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s9 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 -; SI-GISEL-NEXT: v_add_f32_e32 v0, s8, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[0:1] ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_add_f32_e32 v1, s9, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v1, s5, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; SI-GISEL-NEXT: v_add_f32_e32 v5, s10, v5 -; SI-GISEL-NEXT: v_add_f32_e32 v2, s11, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v5, s6, v5 +; SI-GISEL-NEXT: v_add_f32_e32 v2, s7, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 ; SI-GISEL-NEXT: v_exp_f32_e32 v3, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] ; SI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s6, -1 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-GISEL-NEXT: s_mov_b32 s10, -1 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 528232a203acfe..8ffa83ceb8a240 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -758,55 +758,55 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 
0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s11, 0x3377d1cf -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 +; SI-SDAG-NEXT: s_mov_b32 s7, 0x3377d1cf +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s9, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s9, 0x3f317217 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1 -; SI-SDAG-NEXT: v_fma_f32 v4, v1, s9, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v1, s11, v4 +; SI-SDAG-NEXT: v_fma_f32 v4, v1, s5, -v3 +; SI-SDAG-NEXT: v_fma_f32 v4, v1, s7, v4 ; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12 ; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1] -; SI-SDAG-NEXT: v_mul_f32_e32 v3, s8, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, s4, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 ; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v3, s9, -v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; SI-SDAG-NEXT: v_fma_f32 v6, v3, s5, -v5 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v3, s11, v6 +; SI-SDAG-NEXT: v_fma_f32 v6, v3, s7, v6 ; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s12 ; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[2:3] ; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v3 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5 +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s5, -v3 +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s7, v5 ; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s12 ; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_mov_b32 s10, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log_v3f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 2e5bf2e5609512..bd468961f19dd9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -758,55 +758,55 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log10_v3f32(ptr 
addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s11, 0x3284fbcf -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 +; SI-SDAG-NEXT: s_mov_b32 s7, 0x3284fbcf +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s9, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s9, 0x3e9a209a +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1 -; SI-SDAG-NEXT: v_fma_f32 v4, v1, s9, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v1, s11, v4 +; SI-SDAG-NEXT: v_fma_f32 v4, v1, s5, -v3 +; SI-SDAG-NEXT: v_fma_f32 v4, v1, s7, v4 ; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12 ; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1] -; SI-SDAG-NEXT: v_mul_f32_e32 v3, s8, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, s4, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 ; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v3, s9, -v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; SI-SDAG-NEXT: v_fma_f32 v6, v3, s5, -v5 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v3, s11, v6 +; SI-SDAG-NEXT: v_fma_f32 v6, v3, s7, v6 ; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s12 ; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[2:3] ; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v3 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5 +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s5, -v3 +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s7, v5 ; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s12 ; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_mov_b32 s10, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log10_v3f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index d499e017e92f41..af12c10fec5d6f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -865,42 +865,42 @@ 
define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s8 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s9 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc ; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; SI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s6, -1 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-GISEL-NEXT: s_mov_b32 s10, -1 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 1a51c8708b941f..5849ae26b56ba9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -390,7 +390,7 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s21, 0xfffff ; SI-NEXT: s_mov_b32 s20, s2 -; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: s_brev_b32 s26, -2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; SI-NEXT: s_addk_i32 s3, 0xfc01 @@ -406,146 +406,146 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v0, s22 ; SI-NEXT: v_mov_b32_e32 v1, s23 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_and_b64 s[24:25], s[24:25], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 -; 
SI-NEXT: s_add_i32 s24, s6, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s24 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s3 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s25, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s24, 0 +; SI-NEXT: s_and_b32 s24, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s25, s7 -; SI-NEXT: s_cmp_gt_i32 s24, 51 +; SI-NEXT: s_cselect_b32 s7, s24, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s6, s4, s6 -; SI-NEXT: v_bfi_b32 v9, s3, v0, v1 +; SI-NEXT: v_bfi_b32 v5, s26, v0, v1 ; SI-NEXT: s_cselect_b32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[22:23], v[8:9] +; SI-NEXT: v_add_f64 v[2:3], s[22:23], v[4:5] ; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: s_and_b64 s[22:23], s[24:25], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 -; SI-NEXT: s_add_i32 s22, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s22 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_and_b32 s23, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s22, 0 +; SI-NEXT: s_and_b32 s22, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s23, s5 -; SI-NEXT: s_cmp_gt_i32 s22, 51 +; SI-NEXT: s_cselect_b32 s5, s22, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s4, s10, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: v_bfi_b32 v9, s3, v4, v5 +; SI-NEXT: v_bfi_b32 v5, s26, v5, v6 ; SI-NEXT: v_cmp_ge_f64_e64 s[22:23], |v[0:1]|, 0.5 -; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[8:9] +; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[4:5] ; SI-NEXT: s_and_b64 s[6:7], s[22:23], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v6, s6 -; SI-NEXT: s_bfe_u32 s6, s9, 0xb0014 -; SI-NEXT: s_add_i32 s10, s6, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s10 -; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s3 ; SI-NEXT: s_andn2_b64 s[6:7], s[8:9], s[6:7] -; SI-NEXT: s_and_b32 s11, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_and_b32 s10, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s11, s7 -; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s7, s10, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s6, s8, s6 ; SI-NEXT: s_cselect_b32 s7, s9, s7 -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_mov_b32_e32 v5, s7 -; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[4:5] -; SI-NEXT: v_bfi_b32 v9, s3, v6, v7 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 -; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[8:9] +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 
v6, s7 +; SI-NEXT: v_add_f64 v[6:7], s[8:9], -v[5:6] +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[6:7]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s26, v8, v9 +; SI-NEXT: v_add_f64 v[8:9], s[4:5], v[4:5] ; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_bfe_u32 s4, s15, 0xb0014 -; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s8 -; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v10, s3 +; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[4:5] -; SI-NEXT: s_and_b32 s9, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_and_b32 s8, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s9, s5 -; SI-NEXT: s_cmp_gt_i32 s8, 51 -; SI-NEXT: s_cselect_b32 s4, s14, s4 +; SI-NEXT: s_cselect_b32 s5, s8, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s5, s15, s5 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: v_add_f64 v[4:5], s[14:15], -v[4:5] -; SI-NEXT: v_bfi_b32 v9, s3, v9, v10 -; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[4:5]|, 0.5 -; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[8:9] +; SI-NEXT: s_cselect_b32 s4, s14, s4 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_add_f64 v[6:7], s[14:15], -v[5:6] +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[6:7]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s26, v10, v11 +; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5] ; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v12, s6 -; SI-NEXT: s_bfe_u32 s6, s13, 0xb0014 -; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s3 ; SI-NEXT: s_andn2_b64 s[6:7], s[12:13], s[6:7] -; SI-NEXT: s_and_b32 s9, s13, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_and_b32 s8, s13, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s9, s7 -; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s7, s8, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s7, s13, s7 ; SI-NEXT: s_cselect_b32 s6, s12, s6 -; SI-NEXT: v_mov_b32_e32 v10, s7 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[9:10] -; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[10:11] +; SI-NEXT: v_mov_b32_e32 v12, s15 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s3, v12, v13 -; SI-NEXT: v_add_f64 v[12:13], s[4:5], v[8:9] +; SI-NEXT: v_bfi_b32 v5, s26, v5, v12 +; SI-NEXT: v_add_f64 v[12:13], s[4:5], v[4:5] ; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_bfe_u32 s4, s19, 0xb0014 -; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s8 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_bfe_u32 s3, s19, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 
s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[18:19], s[4:5] -; SI-NEXT: s_and_b32 s9, s19, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_and_b32 s8, s19, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s9, s5 -; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s5, s8, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s5, s19, s5 ; SI-NEXT: s_cselect_b32 s4, s18, s4 -; SI-NEXT: v_mov_b32_e32 v10, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[9:10] -; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v11, s5 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[10:11] +; SI-NEXT: v_mov_b32_e32 v14, s13 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s3, v14, v15 -; SI-NEXT: v_add_f64 v[10:11], s[6:7], v[8:9] +; SI-NEXT: v_bfi_b32 v5, s26, v5, v14 +; SI-NEXT: v_add_f64 v[10:11], s[6:7], v[4:5] ; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: s_bfe_u32 s6, s17, 0xb0014 -; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_bfe_u32 s3, s17, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s3 ; SI-NEXT: s_andn2_b64 s[6:7], s[16:17], s[6:7] -; SI-NEXT: s_and_b32 s9, s17, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_and_b32 s8, s17, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s9, s7 -; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s7, s8, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s7, s17, s7 ; SI-NEXT: s_cselect_b32 s6, s16, s6 ; SI-NEXT: v_mov_b32_e32 v15, s7 @@ -553,20 +553,20 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[14:15] ; SI-NEXT: v_mov_b32_e32 v16, s19 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[14:15]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s3, v9, v16 -; SI-NEXT: v_add_f64 v[16:17], s[4:5], v[8:9] +; SI-NEXT: v_bfi_b32 v5, s26, v5, v16 +; SI-NEXT: v_add_f64 v[16:17], s[4:5], v[4:5] ; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: v_mov_b32_e32 v14, s17 -; SI-NEXT: v_bfi_b32 v9, s3, v9, v14 -; SI-NEXT: v_add_f64 v[14:15], s[6:7], v[8:9] +; SI-NEXT: v_bfi_b32 v5, s26, v5, v14 +; SI-NEXT: v_add_f64 v[14:15], s[6:7], v[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -574,86 +574,86 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 ; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_mov_b32_e32 v12, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 
v[0:1], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5] ; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: v_mov_b32_e32 v6, s7 ; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] +; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[4:5] ; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: v_mov_b32_e32 v7, s4 +; CI-NEXT: v_bfi_b32 v13, s2, v7, v6 ; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[10:11] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_mov_b32_e32 v10, s5 -; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] -; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[12:13] +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_mov_b32_e32 v9, s5 +; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[6:7] +; CI-NEXT: v_bfi_b32 v13, s2, v8, v9 ; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 -; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] +; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[12:13] +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[10:11], s[8:9], -v[6:7] +; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] -; CI-NEXT: v_mov_b32_e32 v12, s11 +; CI-NEXT: v_mov_b32_e32 v10, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[8:9]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] +; CI-NEXT: v_mov_b32_e32 v11, s11 ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 +; CI-NEXT: v_bfi_b32 v13, s2, v10, v11 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[12:13], s[14:15], -v[10:11] -; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_add_f64 v[10:11], s[14:15], -v[8:9] +; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[12:13] +; CI-NEXT: v_mov_b32_e32 v13, s4 ; CI-NEXT: v_mov_b32_e32 v14, s9 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v14 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_bfi_b32 v13, s2, v13, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[12:13] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[12:13], s[12:13], -v[14:15] +; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[14:15] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_add_f64 v[4:5], v[4:5], v[12:13] +; CI-NEXT: v_mov_b32_e32 v13, s4 ; CI-NEXT: v_mov_b32_e32 v16, s15 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v16 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: v_bfi_b32 v13, s2, v13, v16 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_add_f64 v[8:9], s[18:19], -v[16:17] +; 
CI-NEXT: v_mov_b32_e32 v13, s4 ; CI-NEXT: v_mov_b32_e32 v18, s13 -; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[16:17] -; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 -; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] +; CI-NEXT: v_bfi_b32 v13, s2, v13, v18 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[8:9]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[18:19], s[16:17] +; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[18:19], s[16:17], -v[14:15] +; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[18:19] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v13, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[14:15]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v20, s19 ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v5, s2, v5, v20 +; CI-NEXT: v_bfi_b32 v13, s2, v13, v20 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_mov_b32_e32 v18, s17 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 -; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] +; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[12:13] +; CI-NEXT: v_mov_b32_e32 v13, s4 +; CI-NEXT: v_mov_b32_e32 v16, s17 +; CI-NEXT: v_bfi_b32 v13, s2, v13, v16 +; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13] ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 438b1bfe319a04..fa8f1ae2ff4725 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -1282,17 +1282,17 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_ushort v12, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_u32 v3, v12, 3, 1 ; GFX6-NEXT: v_bfe_u32 v1, v12, 1, 1 @@ -1310,10 +1310,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX6-NEXT: v_bfe_u32 v8, v12, 8, 1 ; GFX6-NEXT: v_bfe_u32 v14, v12, 
14, 1 ; GFX6-NEXT: v_bfe_u32 v12, v12, 12, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i32: @@ -1442,17 +1442,17 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_ushort v12, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v3, v12, 3, 1 ; GFX6-NEXT: v_bfe_i32 v2, v12, 2, 1 @@ -1470,10 +1470,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX6-NEXT: v_bfe_i32 v14, v12, 14, 1 ; GFX6-NEXT: v_bfe_i32 v13, v12, 13, 1 ; GFX6-NEXT: v_bfe_i32 v12, v12, 12, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i32: @@ -2425,15 +2425,16 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v33, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s6, s3, 24 -; GFX8-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10018 -; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10018 -; GFX8-NEXT: s_and_b32 s7, s3, 1 -; GFX8-NEXT: s_and_b32 s9, s2, 1 +; GFX8-NEXT: s_lshr_b32 s7, s2, 24 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x10018 +; GFX8-NEXT: s_bfe_u32 s9, s3, 0x10018 +; GFX8-NEXT: s_and_b32 s10, s3, 1 +; GFX8-NEXT: s_and_b32 s11, s2, 1 ; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10013 ; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10012 ; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10011 @@ -2446,211 +2447,210 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr 
addrspace(1) %o ; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10012 ; GFX8-NEXT: s_bfe_u32 s22, s3, 0x10011 ; GFX8-NEXT: s_bfe_u32 s23, s3, 0x10010 -; GFX8-NEXT: s_bfe_u32 s10, s3, 0x10017 -; GFX8-NEXT: s_bfe_u32 s11, s3, 0x10016 +; GFX8-NEXT: s_bfe_u32 s4, s3, 0x10017 +; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10016 ; GFX8-NEXT: s_bfe_u32 s24, s3, 0x10015 ; GFX8-NEXT: s_bfe_u32 s25, s3, 0x10014 -; GFX8-NEXT: v_mov_b32_e32 v25, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v24, s11 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0xc0 -; GFX8-NEXT: v_mov_b32_e32 v22, s25 -; GFX8-NEXT: v_mov_b32_e32 v23, s24 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v22, s23 -; GFX8-NEXT: v_mov_b32_e32 v23, s22 -; GFX8-NEXT: v_mov_b32_e32 v24, s21 -; GFX8-NEXT: v_mov_b32_e32 v25, s20 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v22, s19 -; GFX8-NEXT: v_mov_b32_e32 v23, s18 -; GFX8-NEXT: v_mov_b32_e32 v24, s17 -; GFX8-NEXT: v_mov_b32_e32 v25, s16 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v22, s15 -; GFX8-NEXT: v_mov_b32_e32 v23, s14 -; GFX8-NEXT: v_mov_b32_e32 v24, s13 -; GFX8-NEXT: v_mov_b32_e32 v25, s12 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_mov_b32_e32 v9, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v5, s25 +; GFX8-NEXT: v_mov_b32_e32 v6, s24 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_mov_b32_e32 v9, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v5, s23 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: v_mov_b32_e32 v7, s21 +; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_mov_b32_e32 v9, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v5, s19 +; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v7, s17 +; GFX8-NEXT: v_mov_b32_e32 v8, s16 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_mov_b32_e32 v9, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3 -; GFX8-NEXT: v_mov_b32_e32 v25, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v27, 1, v22 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s3 -; GFX8-NEXT: 
v_mov_b32_e32 v24, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 32 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 7, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 12, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 14, s2 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v29, 1, v5 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 1, s3 +; GFX8-NEXT: v_mov_b32_e32 v9, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 32 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2 -; GFX8-NEXT: v_and_b32_e32 v28, 1, v22 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v20 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v19 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v30, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v13 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s2 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v12 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 10, s2 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 1, v2 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s3 +; GFX8-NEXT: v_and_b32_e32 v31, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v16 +; GFX8-NEXT: v_and_b32_sdwa v5, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v9, s4 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s3 -; GFX8-NEXT: v_mov_b32_e32 v25, 1 -; GFX8-NEXT: v_mov_b32_e32 v21, s11 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GFX8-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s6 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v19 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX8-NEXT: v_and_b32_sdwa v16, v14, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v20, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s6 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v0 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_and_b32_e32 v20, 1, v14 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6 -; GFX8-NEXT: s_add_u32 s10, s0, 16 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 3, s6 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s2 +; GFX8-NEXT: v_and_b32_e32 v34, 1, v5 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 1, s6 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s2 +; GFX8-NEXT: v_and_b32_e32 v21, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v15 -; GFX8-NEXT: v_mov_b32_e32 v16, s11 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v5 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s6 +; GFX8-NEXT: 
v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX8-NEXT: v_mov_b32_e32 v15, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v20 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v18 +; GFX8-NEXT: v_mov_b32_e32 v9, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v23, 2, s2 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v0 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GFX8-NEXT: v_mov_b32_e32 v10, s1 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s7 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v24 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v23 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v22 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s0 +; GFX8-NEXT: s_add_u32 s4, s0, 0xb0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s8 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: v_mov_b32_e32 v12, s0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NEXT: s_add_u32 s10, s0, 0xb0 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s3 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 1, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v25, 12, s3 +; GFX8-NEXT: v_and_b32_e32 v26, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e64 v27, 14, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s3 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v8 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s8 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s10 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v5 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s7 +; GFX8-NEXT: v_mov_b32_e32 v10, s5 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 6, s7 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v27 ; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_mov_b32_e32 v10, s11 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v26 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v25 +; GFX8-NEXT: v_mov_b32_e32 v9, s4 ; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 4, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s8 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] -; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v17 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v17 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, 
s3 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v18 +; GFX8-NEXT: v_lshrrev_b16_e64 v28, 6, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v16 ; GFX8-NEXT: v_and_b32_e32 v18, 1, v3 -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v19 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v4 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v20 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v1 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v20 ; GFX8-NEXT: v_and_b32_e32 v20, 1, v0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x80 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v22 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v27 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v26 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s7 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v29 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v28 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v4 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s6 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v28 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v12 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: v_mov_b32_e32 v20, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v32, 4, s6 ; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[1:4] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s6 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v24 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v32 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 2, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 4, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v12 -; GFX8-NEXT: v_mov_b32_e32 v12, s5 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v13 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, 
s3 ; GFX8-NEXT: s_add_u32 s0, s0, 0x60 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm @@ -2669,173 +2669,173 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T41.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T39.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T21.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T19.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T34.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T32.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T30.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T28.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T26.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T24.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T24.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_64 T21.XY, T19.X, 0, #1 +; EG-NEXT: VTX_READ_64 T19.XY, T19.X, 0, #1 ; EG-NEXT: ALU clause starting at 24: ; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 25: -; EG-NEXT: BFE_UINT * T19.W, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T20.W, T19.X, literal.x, 1, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T19.Z, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T20.Z, T19.X, literal.x, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T19.Y, T21.X, 1, 1, -; EG-NEXT: BFE_UINT * T20.W, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T20.Y, T19.X, 1, 1, +; EG-NEXT: BFE_UINT * T21.W, T19.X, literal.x, 1, ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T19.X, T21.X, 1, -; EG-NEXT: BFE_UINT T20.Z, T21.X, literal.x, 1, +; EG-NEXT: AND_INT T20.X, T19.X, 1, +; EG-NEXT: BFE_UINT T21.Z, T19.X, literal.x, 1, ; EG-NEXT: LSHR * T22.X, KC0[2].Y, literal.y, ; EG-NEXT: 6(8.407791e-45), 2(2.802597e-45) -; EG-NEXT: BFE_UINT T20.Y, T21.X, literal.x, 1, -; EG-NEXT: BFE_UINT * T23.W, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT T21.Y, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T23.W, T19.X, literal.y, 1, ; EG-NEXT: 5(7.006492e-45), 11(1.541428e-44) -; EG-NEXT: BFE_UINT T20.X, T21.X, literal.x, 1, -; EG-NEXT: BFE_UINT T23.Z, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT T21.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T23.Z, T19.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T24.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T23.Y, T21.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T25.W, T21.X, literal.z, 1, +; EG-NEXT: BFE_UINT T23.Y, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T25.W, T19.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) ; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T23.X, T21.X, literal.x, 1, -; EG-NEXT: BFE_UINT T25.Z, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT T23.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T25.Z, T19.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; 
EG-NEXT: LSHR T26.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T25.Y, T21.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T27.W, T21.X, literal.z, 1, +; EG-NEXT: BFE_UINT T25.Y, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T27.W, T19.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44) ; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T25.X, T21.X, literal.x, 1, -; EG-NEXT: BFE_UINT T27.Z, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT T25.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T27.Z, T19.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 12(1.681558e-44), 18(2.522337e-44) ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T28.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T27.Y, T21.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T29.W, T21.X, literal.z, 1, +; EG-NEXT: BFE_UINT T27.Y, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T29.W, T19.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 17(2.382207e-44) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T27.X, T21.X, literal.x, 1, -; EG-NEXT: BFE_UINT T29.Z, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT T27.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T29.Z, T19.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 22(3.082857e-44) ; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T30.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T29.Y, T21.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T31.W, T21.X, literal.z, 1, +; EG-NEXT: BFE_UINT T29.Y, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T31.W, T19.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44) ; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T29.X, T21.X, literal.x, 1, -; EG-NEXT: BFE_UINT T31.Z, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT T29.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T31.Z, T19.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 20(2.802597e-44), 26(3.643376e-44) ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T32.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T31.Y, T21.X, literal.y, 1, -; EG-NEXT: LSHR * T33.W, T21.X, literal.z, +; EG-NEXT: BFE_UINT T31.Y, T19.X, literal.y, 1, +; EG-NEXT: LSHR * T33.W, T19.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 25(3.503246e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T31.X, T21.X, literal.x, 1, -; EG-NEXT: BFE_UINT T33.Z, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT T31.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T33.Z, T19.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 24(3.363116e-44), 30(4.203895e-44) ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T34.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T33.Y, T21.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T35.W, T21.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T33.Y, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T35.W, T19.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44) ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T33.X, T21.X, literal.x, 1, -; EG-NEXT: BFE_UINT T35.Z, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T33.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T35.Z, T19.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 28(3.923636e-44), 2(2.802597e-45) ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T21.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T35.Y, T21.Y, 1, 1, -; EG-NEXT: BFE_UINT T36.W, T21.Y, literal.y, 1, -; EG-NEXT: AND_INT * T35.X, T21.Y, 1, +; EG-NEXT: LSHR T19.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T35.Y, T19.Y, 
1, 1, +; EG-NEXT: BFE_UINT T36.W, T19.Y, literal.y, 1, +; EG-NEXT: AND_INT * T35.X, T19.Y, 1, ; EG-NEXT: 2(2.802597e-45), 7(9.809089e-45) -; EG-NEXT: BFE_UINT T36.Z, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T36.Z, T19.Y, literal.x, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 6(8.407791e-45), 128(1.793662e-43) ; EG-NEXT: LSHR T37.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T36.Y, T21.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T38.W, T21.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T36.Y, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T38.W, T19.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 5(7.006492e-45) ; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T36.X, T21.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T38.Z, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T36.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T38.Z, T19.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44) ; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 122: ; EG-NEXT: LSHR T39.X, T0.W, literal.x, -; EG-NEXT: BFE_UINT T38.Y, T21.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T40.W, T21.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T38.Y, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T40.W, T19.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) ; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T38.X, T21.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T40.Z, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T38.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T40.Z, T19.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44) ; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T41.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T40.Y, T21.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T42.W, T21.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T40.Y, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T42.W, T19.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44) ; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T40.X, T21.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T42.Z, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T40.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T42.Z, T19.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 12(1.681558e-44), 18(2.522337e-44) ; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T43.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T42.Y, T21.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T44.W, T21.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T42.Y, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T44.W, T19.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 17(2.382207e-44) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T42.X, T21.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T44.Z, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T42.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T44.Z, T19.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 22(3.082857e-44) ; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T45.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T44.Y, T21.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T46.W, T21.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T44.Y, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T46.W, T19.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44) ; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T44.X, T21.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T46.Z, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T44.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T46.Z, T19.Y, literal.y, 1, 
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 20(2.802597e-44), 26(3.643376e-44) ; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T47.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T46.Y, T21.Y, literal.y, 1, -; EG-NEXT: LSHR * T48.W, T21.Y, literal.z, +; EG-NEXT: BFE_UINT T46.Y, T19.Y, literal.y, 1, +; EG-NEXT: LSHR * T48.W, T19.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 25(3.503246e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T46.X, T21.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T48.Z, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T46.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T48.Z, T19.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 24(3.363116e-44), 30(4.203895e-44) ; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T49.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT * T48.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T48.Y, T19.Y, literal.y, 1, ; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44) -; EG-NEXT: BFE_UINT T48.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T48.X, T19.Y, literal.x, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 28(3.923636e-44), 240(3.363116e-43) ; EG-NEXT: LSHR * T50.X, PV.W, literal.x, @@ -3019,27 +3019,27 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 11, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 4, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s2 -; GFX8-NEXT: s_lshr_b32 s7, s3, 24 -; GFX8-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018 -; GFX8-NEXT: s_bfe_i32 s5, s3, 0x10018 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x10000 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10000 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 12, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 13, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 14, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 15, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 8, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 9, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 10, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 11, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 4, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 5, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v18, 1, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v20, 3, s2 +; GFX8-NEXT: s_lshr_b32 s6, s3, 24 +; GFX8-NEXT: s_lshr_b32 s7, s2, 24 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x10018 +; GFX8-NEXT: s_bfe_i32 s9, s3, 0x10018 +; GFX8-NEXT: s_bfe_i32 s10, s3, 0x10000 +; GFX8-NEXT: s_bfe_i32 s11, s2, 0x10000 ; GFX8-NEXT: s_bfe_i32 s12, s2, 0x10013 ; GFX8-NEXT: s_bfe_i32 s13, s2, 0x10012 ; GFX8-NEXT: s_bfe_i32 s14, s2, 0x10011 @@ -3052,183 +3052,183 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i32 s20, s3, 0x10012 ; GFX8-NEXT: s_bfe_i32 s21, s3, 0x10011 ; GFX8-NEXT: s_bfe_i32 s22, s3, 0x10010 -; 
GFX8-NEXT: s_bfe_i32 s10, s3, 0x10017 -; GFX8-NEXT: s_bfe_i32 s11, s3, 0x10016 +; GFX8-NEXT: s_bfe_i32 s4, s3, 0x10017 +; GFX8-NEXT: s_bfe_i32 s5, s3, 0x10016 ; GFX8-NEXT: s_bfe_i32 s23, s3, 0x10015 ; GFX8-NEXT: s_bfe_i32 s24, s3, 0x10014 -; GFX8-NEXT: v_mov_b32_e32 v25, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v24, s11 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0xc0 -; GFX8-NEXT: v_mov_b32_e32 v22, s24 -; GFX8-NEXT: v_mov_b32_e32 v23, s23 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v22, s22 -; GFX8-NEXT: v_mov_b32_e32 v23, s21 -; GFX8-NEXT: v_mov_b32_e32 v24, s20 -; GFX8-NEXT: v_mov_b32_e32 v25, s19 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v22, s2 -; GFX8-NEXT: v_mov_b32_e32 v23, s18 -; GFX8-NEXT: v_mov_b32_e32 v24, s17 -; GFX8-NEXT: v_mov_b32_e32 v25, s16 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v22, s15 -; GFX8-NEXT: v_mov_b32_e32 v23, s14 -; GFX8-NEXT: v_mov_b32_e32 v24, s13 -; GFX8-NEXT: v_mov_b32_e32 v25, s12 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v23, s11 -; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 -; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1 -; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v22, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_add_u32 s4, s0, 0xc0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v1, s21 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_add_u32 s4, s0, 0x50 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_add_u32 s4, s0, 64 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_add_u32 s4, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s3 -; GFX8-NEXT: 
v_lshrrev_b16_e64 v6, 15, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 11, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v26, 4, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v27, 5, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v28, 6, s3 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 1, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 12, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 13, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v23, 14, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v24, 15, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v25, 8, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v26, 9, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v27, 10, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v28, 11, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v29, 4, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v30, 5, s3 +; GFX8-NEXT: v_bfe_i32 v3, v9, 0, 1 +; GFX8-NEXT: v_bfe_i32 v2, v8, 0, 1 +; GFX8-NEXT: v_lshrrev_b16_e64 v31, 6, s3 +; GFX8-NEXT: v_bfe_i32 v1, v7, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v6, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v32, 7, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v33, 1, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v34, 2, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v35, 3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1 -; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1 -; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1 -; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1 +; GFX8-NEXT: v_bfe_i32 v3, v13, 0, 1 +; GFX8-NEXT: v_bfe_i32 v2, v12, 0, 1 +; GFX8-NEXT: v_bfe_i32 v1, v11, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v10, 0, 1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[14:17] -; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1 -; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_bfe_i32 v3, v17, 0, 1 +; GFX8-NEXT: v_bfe_i32 v2, v16, 0, 1 +; GFX8-NEXT: v_bfe_i32 v1, v15, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v14, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 -; GFX8-NEXT: v_mov_b32_e32 v12, s1 -; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1 -; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 -; GFX8-NEXT: v_bfe_i32 v8, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: v_mov_b32_e32 v11, s0 +; GFX8-NEXT: v_bfe_i32 v3, v20, 0, 1 +; GFX8-NEXT: v_bfe_i32 v2, v19, 0, 1 +; GFX8-NEXT: v_bfe_i32 v1, v18, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s8 -; GFX8-NEXT: v_mov_b32_e32 v8, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 5, s8 -; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; 
GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[3:6] -; GFX8-NEXT: v_bfe_i32 v8, v11, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 2, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s8 -; GFX8-NEXT: v_bfe_i32 v7, v10, 0, 1 -; GFX8-NEXT: v_bfe_i32 v11, v1, 0, 1 -; GFX8-NEXT: v_bfe_i32 v10, v0, 0, 1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 -; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v6, v13, 0, 1 -; GFX8-NEXT: v_bfe_i32 v13, v24, 0, 1 -; GFX8-NEXT: v_bfe_i32 v12, v2, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v13, s3 +; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x90 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s7 +; GFX8-NEXT: v_bfe_i32 v3, v24, 0, 1 +; GFX8-NEXT: v_bfe_i32 v2, v23, 0, 1 +; GFX8-NEXT: v_bfe_i32 v1, v22, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v21, 0, 1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v12, v15, 0, 1 -; GFX8-NEXT: v_bfe_i32 v15, v19, 0, 1 -; GFX8-NEXT: v_bfe_i32 v19, v23, 0, 1 -; GFX8-NEXT: v_bfe_i32 v25, v22, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v28, 0, 1 -; GFX8-NEXT: v_bfe_i32 v23, v27, 0, 1 -; GFX8-NEXT: v_bfe_i32 v22, v26, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_bfe_i32 v4, v8, 0, 1 +; GFX8-NEXT: v_bfe_i32 v8, v25, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v25, s3 +; GFX8-NEXT: v_mov_b32_e32 v24, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x80 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[22:25] +; GFX8-NEXT: v_bfe_i32 v23, v32, 0, 1 +; GFX8-NEXT: v_bfe_i32 v22, v31, 0, 1 +; GFX8-NEXT: v_bfe_i32 v21, v30, 0, 1 +; GFX8-NEXT: v_bfe_i32 v20, v29, 0, 1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v11, v14, 0, 1 -; GFX8-NEXT: v_bfe_i32 v14, v18, 0, 1 -; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1 -; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v18, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 1, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s7 +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 7, s6 +; GFX8-NEXT: v_mov_b32_e32 v21, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s6 +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 1 +; GFX8-NEXT: v_bfe_i32 v1, v10, 0, 1 +; GFX8-NEXT: v_bfe_i32 v5, v9, 0, 1 +; GFX8-NEXT: v_bfe_i32 v11, v28, 0, 1 +; GFX8-NEXT: v_bfe_i32 v10, v27, 0, 1 +; GFX8-NEXT: v_bfe_i32 v9, v26, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v20, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GFX8-NEXT: v_bfe_i32 v19, v35, 0, 1 +; GFX8-NEXT: v_bfe_i32 v10, v15, 0, 1 +; GFX8-NEXT: v_bfe_i32 v15, v16, 0, 1 +; GFX8-NEXT: v_bfe_i32 v18, v34, 0, 1 +; GFX8-NEXT: v_bfe_i32 v17, v33, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v16, s10 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 
v17, v17, 0, 1 -; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GFX8-NEXT: v_lshrrev_b16_e64 v36, 4, s6 +; GFX8-NEXT: v_mov_b32_e32 v17, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v37, 5, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v38, 6, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6 +; GFX8-NEXT: v_mov_b32_e32 v16, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17] +; GFX8-NEXT: v_bfe_i32 v9, v14, 0, 1 +; GFX8-NEXT: v_bfe_i32 v14, v38, 0, 1 +; GFX8-NEXT: v_bfe_i32 v13, v37, 0, 1 +; GFX8-NEXT: v_bfe_i32 v12, v36, 0, 1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v13, v2, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 2, s7 +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v13, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s6 +; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 7, s8 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; GFX8-NEXT: v_bfe_i32 v11, v0, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v8, s9 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s7 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GFX8-NEXT: s_add_u32 s0, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1 +; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GFX8-NEXT: s_endpgm -; +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; ; EG-LABEL: constant_sextload_v64i1_to_v64i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[] @@ -4202,14 +4202,14 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 @@ -4219,8 +4219,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v13, v1 ; GFX6-NEXT: v_mov_b32_e32 v15, v1 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: 
s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_u32 v14, v0, 1, 1 ; GFX6-NEXT: v_bfe_u32 v10, v0, 3, 1 @@ -4230,10 +4230,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX6-NEXT: v_bfe_u32 v8, v0, 2, 1 ; GFX6-NEXT: v_bfe_u32 v4, v0, 4, 1 ; GFX6-NEXT: v_bfe_u32 v0, v0, 6, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i64: @@ -4344,17 +4344,17 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 7, v0 @@ -4379,10 +4379,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v8i1_to_v8i64: @@ -4558,100 +4558,100 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v17, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, v17 -; GFX8-NEXT: v_mov_b32_e32 v13, v17 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; 
GFX8-NEXT: flat_load_ushort v9, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: s_add_u32 s4, s0, 0x50 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, s4 ; GFX8-NEXT: v_mov_b32_e32 v24, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v23, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, v17 -; GFX8-NEXT: v_mov_b32_e32 v5, v17 -; GFX8-NEXT: v_mov_b32_e32 v22, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v20, v1 +; GFX8-NEXT: v_mov_b32_e32 v16, v1 +; GFX8-NEXT: v_mov_b32_e32 v12, v1 +; GFX8-NEXT: v_mov_b32_e32 v10, 0 +; GFX8-NEXT: v_mov_b32_e32 v14, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 10, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 11, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 14, v2 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21] -; GFX8-NEXT: v_mov_b32_e32 v0, 1 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v18, 15, v2 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[16:19] +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 10, v9 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 11, v9 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 14, v9 +; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[2:5] +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v9 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 9, v9 ; GFX8-NEXT: v_mov_b32_e32 v24, s3 -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v23, s2 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_sdwa v0, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: v_mov_b32_e32 v19, 0 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v24, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 12, v9 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v26, s3 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 12, v2 -; GFX8-NEXT: v_mov_b32_e32 v25, s2 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 13, v9 +; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_mov_b32_e32 v23, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 13, v2 -; GFX8-NEXT: v_mov_b32_e32 v20, v17 -; GFX8-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 2, v9 
+; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 4, v9 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[19:22] ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 7, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 6, v2 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[19:22] +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 7, v9 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 6, v9 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 5, v9 +; GFX8-NEXT: v_lshrrev_b16_e32 v13, 3, v9 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 4, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v10, 5, v2 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v14, 3, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v6 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v21, s1 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v9 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v22 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 -; GFX8-NEXT: v_mov_b32_e32 v20, s0 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX8-NEXT: v_and_b32_e32 v17, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX8-NEXT: v_mov_b32_e32 v20, s3 +; GFX8-NEXT: v_mov_b32_e32 v22, s1 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v17 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v2 +; GFX8-NEXT: v_mov_b32_e32 v19, s2 +; GFX8-NEXT: v_mov_b32_e32 v21, s0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[15:18] +; GFX8-NEXT: flat_store_dwordx4 v[19:20], v[11:14] +; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[7:10] +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v16i1_to_v16i64: @@ -4821,7 +4821,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 @@ -4853,44 +4853,44 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v20, s2 ; GFX8-NEXT: v_mov_b32_e32 v22, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v0 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v13, 12, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v14, 13, v0 
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[1:4] -; GFX8-NEXT: v_lshrrev_b16_e32 v15, 10, v0 -; GFX8-NEXT: v_bfe_i32 v3, v14, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v13, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 11, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[1:4] -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 8, v0 -; GFX8-NEXT: v_bfe_i32 v3, v5, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v15, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v13, 9, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4] -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 6, v0 -; GFX8-NEXT: v_bfe_i32 v3, v13, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v6, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v14, 7, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 5, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v9, 2, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 3, v0 -; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[1:4] +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 14, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v4 +; GFX8-NEXT: v_bfe_i32 v2, v1, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX8-NEXT: v_lshrrev_b16_e32 v13, 12, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v14, 13, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NEXT: v_lshrrev_b16_e32 v15, 10, v4 +; GFX8-NEXT: v_bfe_i32 v2, v14, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v13, 0, 1 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 11, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 8, v4 +; GFX8-NEXT: v_bfe_i32 v2, v5, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v15, 0, 1 +; GFX8-NEXT: v_lshrrev_b16_e32 v13, 9, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 6, v4 +; GFX8-NEXT: v_bfe_i32 v2, v13, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v6, 0, 1 +; GFX8-NEXT: v_lshrrev_b16_e32 v14, 7, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 4, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 5, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, 2, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 3, v4 +; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[0:3] ; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v4 ; GFX8-NEXT: v_bfe_i32 v12, v7, 0, 1 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 ; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 ; GFX8-NEXT: v_bfe_i32 v4, v9, 0, 1 @@ -5129,17 +5129,18 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 11, s2 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s2 -; GFX8-NEXT: s_lshr_b32 s14, s2, 24 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2 ; GFX8-NEXT: v_and_b32_e32 v11, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2 +; 
GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2 +; GFX8-NEXT: v_and_b32_e32 v17, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2 +; GFX8-NEXT: s_lshr_b32 s12, s2, 24 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v1 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10018 -; GFX8-NEXT: s_and_b32 s11, s2, 1 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10018 +; GFX8-NEXT: s_and_b32 s14, s2, 1 ; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10011 ; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10010 ; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10012 @@ -5148,18 +5149,17 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10015 ; GFX8-NEXT: s_bfe_u32 s21, s2, 0x10016 ; GFX8-NEXT: s_bfe_u32 s22, s2, 0x10017 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 14, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 12, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 11, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s2 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 4, s2 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 2, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 4, s2 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: s_add_u32 s4, s0, 0xa0 @@ -5167,139 +5167,140 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s6, s0, 0x90 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NEXT: s_add_u32 s8, s0, 0x80 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s12 ; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: s_add_u32 s12, s0, 0x70 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s14 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s14 -; GFX8-NEXT: v_mov_b32_e32 v25, s13 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s12 +; GFX8-NEXT: s_add_u32 s10, s0, 0x70 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v24, s12 -; GFX8-NEXT: s_add_u32 s12, s0, 0xf0 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v14 -; GFX8-NEXT: v_mov_b32_e32 v21, v1 -; GFX8-NEXT: v_mov_b32_e32 v23, v1 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX8-NEXT: v_mov_b32_e32 v25, s13 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 6, s14 -; GFX8-NEXT: v_mov_b32_e32 v24, s12 -; GFX8-NEXT: s_add_u32 s12, s0, 0x60 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v17 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s14 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v11 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v15 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff, v16 -; GFX8-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NEXT: v_mov_b32_e32 v15, s12 -; GFX8-NEXT: s_add_u32 s12, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v23, 0 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: 
flat_store_dwordx4 v[15:16], v[20:23] -; GFX8-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v9 -; GFX8-NEXT: v_mov_b32_e32 v24, v1 -; GFX8-NEXT: v_mov_b32_e32 v15, s12 -; GFX8-NEXT: s_add_u32 s12, s0, 64 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[23:26] -; GFX8-NEXT: v_mov_b32_e32 v15, 1 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_and_b32_sdwa v23, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NEXT: v_mov_b32_e32 v15, s12 -; GFX8-NEXT: s_add_u32 s12, s0, 48 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v8 -; GFX8-NEXT: v_mov_b32_e32 v26, 0 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[23:26] -; GFX8-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v3 -; GFX8-NEXT: v_mov_b32_e32 v26, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s12 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[23:26] -; GFX8-NEXT: v_mov_b32_e32 v16, s3 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: v_and_b32_e32 v12, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, s11 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[2:5] +; GFX8-NEXT: s_add_u32 s10, s0, 0xf0 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 1, s12 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 6, s12 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v10 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 7, s12 +; GFX8-NEXT: v_mov_b32_e32 v7, s11 +; GFX8-NEXT: s_add_u32 s10, s0, 0x60 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[2:5] +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v7, s11 +; GFX8-NEXT: s_add_u32 s10, s0, 0x50 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[2:5] +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_add_u32 s10, s0, 64 +; GFX8-NEXT: v_and_b32_sdwa v12, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX8-NEXT: v_mov_b32_e32 v15, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: s_add_u32 s10, s0, 48 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_and_b32_e32 v12, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GFX8-NEXT: v_mov_b32_e32 v0, s21 +; 
GFX8-NEXT: v_and_b32_e32 v16, 1, v21 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX8-NEXT: v_mov_b32_e32 v21, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v15, s2 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v16, s5 +; GFX8-NEXT: v_mov_b32_e32 v20, s2 +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v21, s5 ; GFX8-NEXT: v_mov_b32_e32 v0, s19 ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: v_mov_b32_e32 v15, s4 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v16, s7 +; GFX8-NEXT: v_mov_b32_e32 v20, s4 +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v21, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mov_b32_e32 v15, s6 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v16, s9 +; GFX8-NEXT: v_mov_b32_e32 v20, s6 +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v21, s9 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s8 +; GFX8-NEXT: v_mov_b32_e32 v20, s8 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v26, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[23:26] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s14 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v15, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v13 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX8-NEXT: v_mov_b32_e32 v13, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s11 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, v10 -; GFX8-NEXT: v_mov_b32_e32 v3, v13 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GFX8-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: v_mov_b32_e32 v12, s0 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 4, s12 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v22 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s14 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX8-NEXT: v_mov_b32_e32 v12, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s14 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; GFX8-NEXT: v_lshrrev_b16_e64 v24, 2, s12 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: 
v_and_b32_e32 v14, 0xffff, v14 -; GFX8-NEXT: v_mov_b32_e32 v17, 0 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v18 -; GFX8-NEXT: v_mov_b32_e32 v22, 0 -; GFX8-NEXT: v_mov_b32_e32 v20, v1 +; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v24 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; GFX8-NEXT: v_mov_b32_e32 v7, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s13 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[19:22] -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-NEXT: v_mov_b32_e32 v3, v17 +; GFX8-NEXT: v_mov_b32_e32 v2, v25 +; GFX8-NEXT: v_mov_b32_e32 v3, v26 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -5473,164 +5474,164 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s52, s8, 30 -; GFX6-NEXT: s_lshr_b32 s46, s8, 31 -; GFX6-NEXT: s_lshr_b32 s48, s8, 28 -; GFX6-NEXT: s_lshr_b32 s36, s8, 29 -; GFX6-NEXT: s_lshr_b32 s38, s8, 26 -; GFX6-NEXT: s_lshr_b32 s26, s8, 27 -; GFX6-NEXT: s_lshr_b32 s28, s8, 24 -; GFX6-NEXT: s_lshr_b32 s4, s8, 25 -; GFX6-NEXT: s_lshr_b32 s6, s8, 22 -; GFX6-NEXT: s_lshr_b32 s10, s8, 23 -; GFX6-NEXT: s_lshr_b32 s12, s8, 20 -; GFX6-NEXT: s_lshr_b32 s14, s8, 21 -; GFX6-NEXT: s_lshr_b32 s16, s8, 18 -; GFX6-NEXT: s_lshr_b32 s18, s8, 19 -; GFX6-NEXT: s_lshr_b32 s20, s8, 16 -; GFX6-NEXT: s_lshr_b32 s22, s8, 17 -; GFX6-NEXT: s_lshr_b32 s24, s8, 14 -; GFX6-NEXT: s_lshr_b32 s30, s8, 15 -; GFX6-NEXT: s_lshr_b32 s34, s8, 12 -; GFX6-NEXT: s_lshr_b32 s40, s8, 13 -; GFX6-NEXT: s_lshr_b32 s42, s8, 10 -; GFX6-NEXT: s_lshr_b32 s44, s8, 11 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s8, 8 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NEXT: s_lshr_b32 s52, s8, 9 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v4, s46 -; GFX6-NEXT: v_mov_b32_e32 v5, s47 -; GFX6-NEXT: s_lshr_b32 s46, s8, 6 -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s48, s8, 7 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s8, 4 -; GFX6-NEXT: v_mov_b32_e32 v10, s38 -; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_lshr_b32 s38, s8, 5 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s26 -; GFX6-NEXT: v_mov_b32_e32 v13, s27 -; GFX6-NEXT: s_lshr_b32 s26, s8, 2 -; GFX6-NEXT: v_mov_b32_e32 v14, s28 -; GFX6-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NEXT: s_lshr_b32 s28, s8, 3 -; GFX6-NEXT: s_lshr_b32 s8, s8, 1 +; GFX6-NEXT: s_lshr_b32 s6, s4, 30 +; GFX6-NEXT: s_lshr_b32 s8, s4, 31 +; GFX6-NEXT: s_lshr_b32 s10, 
s4, 28 +; GFX6-NEXT: s_lshr_b32 s12, s4, 29 +; GFX6-NEXT: s_lshr_b32 s14, s4, 26 +; GFX6-NEXT: s_lshr_b32 s16, s4, 27 +; GFX6-NEXT: s_lshr_b32 s18, s4, 24 +; GFX6-NEXT: s_lshr_b32 s20, s4, 25 +; GFX6-NEXT: s_lshr_b32 s22, s4, 22 +; GFX6-NEXT: s_lshr_b32 s24, s4, 23 +; GFX6-NEXT: s_lshr_b32 s26, s4, 20 +; GFX6-NEXT: s_lshr_b32 s28, s4, 21 +; GFX6-NEXT: s_lshr_b32 s30, s4, 18 +; GFX6-NEXT: s_lshr_b32 s34, s4, 19 +; GFX6-NEXT: s_lshr_b32 s36, s4, 16 +; GFX6-NEXT: s_lshr_b32 s38, s4, 17 +; GFX6-NEXT: s_lshr_b32 s40, s4, 14 +; GFX6-NEXT: s_lshr_b32 s42, s4, 15 +; GFX6-NEXT: s_lshr_b32 s44, s4, 12 +; GFX6-NEXT: s_lshr_b32 s46, s4, 13 +; GFX6-NEXT: s_lshr_b32 s48, s4, 10 +; GFX6-NEXT: s_lshr_b32 s50, s4, 11 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s52 +; GFX6-NEXT: v_mov_b32_e32 v1, s53 +; GFX6-NEXT: s_lshr_b32 s52, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: s_lshr_b32 s6, s4, 9 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 6 +; GFX6-NEXT: v_mov_b32_e32 v6, s10 +; GFX6-NEXT: v_mov_b32_e32 v7, s11 +; GFX6-NEXT: s_lshr_b32 s10, s4, 7 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s12 +; GFX6-NEXT: v_mov_b32_e32 v9, s13 +; GFX6-NEXT: s_lshr_b32 s12, s4, 4 +; GFX6-NEXT: v_mov_b32_e32 v10, s14 +; GFX6-NEXT: v_mov_b32_e32 v11, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 5 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s16 +; GFX6-NEXT: v_mov_b32_e32 v13, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 2 +; GFX6-NEXT: v_mov_b32_e32 v14, s18 +; GFX6-NEXT: v_mov_b32_e32 v15, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 3 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; 
GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v16, s4 -; GFX6-NEXT: v_mov_b32_e32 v17, s5 +; GFX6-NEXT: v_mov_b32_e32 v16, s20 +; GFX6-NEXT: v_mov_b32_e32 v17, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: v_mov_b32_e32 v5, s11 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: v_mov_b32_e32 v4, s24 +; GFX6-NEXT: v_mov_b32_e32 v5, s25 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: v_mov_b32_e32 v4, s14 -; GFX6-NEXT: v_mov_b32_e32 v5, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NEXT: v_mov_b32_e32 v5, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NEXT: v_mov_b32_e32 v4, s34 +; GFX6-NEXT: v_mov_b32_e32 v5, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: v_mov_b32_e32 v4, s22 -; GFX6-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v4, s38 +; GFX6-NEXT: v_mov_b32_e32 v5, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s24 -; GFX6-NEXT: v_mov_b32_e32 v3, s25 -; GFX6-NEXT: v_mov_b32_e32 v4, s30 -; GFX6-NEXT: v_mov_b32_e32 v5, s31 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: v_mov_b32_e32 v4, s42 +; GFX6-NEXT: v_mov_b32_e32 v5, s43 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v4, s46 +; GFX6-NEXT: v_mov_b32_e32 v5, s47 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: v_mov_b32_e32 v4, s44 -; GFX6-NEXT: v_mov_b32_e32 v5, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s48 +; GFX6-NEXT: v_mov_b32_e32 v3, s49 +; GFX6-NEXT: v_mov_b32_e32 v4, s50 +; GFX6-NEXT: v_mov_b32_e32 v5, s51 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 
v2, s50 -; GFX6-NEXT: v_mov_b32_e32 v3, s51 -; GFX6-NEXT: v_mov_b32_e32 v4, s52 -; GFX6-NEXT: v_mov_b32_e32 v5, s53 +; GFX6-NEXT: v_mov_b32_e32 v2, s52 +; GFX6-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: v_mov_b32_e32 v3, s47 -; GFX6-NEXT: v_mov_b32_e32 v4, s48 -; GFX6-NEXT: v_mov_b32_e32 v5, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: v_mov_b32_e32 v4, s38 -; GFX6-NEXT: v_mov_b32_e32 v5, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NEXT: v_mov_b32_e32 v5, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NEXT: v_mov_b32_e32 v5, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5659,113 +5660,114 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v11, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_add_u32 s10, s0, 0xb0 -; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s11 -; GFX8-NEXT: v_mov_b32_e32 v15, s10 +; GFX8-NEXT: v_mov_b32_e32 v15, s11 +; GFX8-NEXT: v_mov_b32_e32 v14, s10 ; GFX8-NEXT: s_add_u32 s10, s0, 0xa0 -; GFX8-NEXT: v_mov_b32_e32 v13, s12 -; GFX8-NEXT: v_mov_b32_e32 v14, s13 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s11 -; GFX8-NEXT: v_mov_b32_e32 v15, s10 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v15, s11 +; GFX8-NEXT: v_mov_b32_e32 v14, s10 ; GFX8-NEXT: s_add_u32 s10, s0, 0x90 -; GFX8-NEXT: v_mov_b32_e32 v11, s14 -; GFX8-NEXT: v_mov_b32_e32 v12, s15 -; GFX8-NEXT: v_mov_b32_e32 v13, s16 -; GFX8-NEXT: v_mov_b32_e32 v14, s17 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NEXT: v_mov_b32_e32 v3, s17 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s11 -; GFX8-NEXT: v_mov_b32_e32 v15, s10 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v15, s11 +; GFX8-NEXT: v_mov_b32_e32 v14, s10 ; GFX8-NEXT: s_add_u32 s10, s0, 0x80 -; GFX8-NEXT: v_mov_b32_e32 v11, s18 -; GFX8-NEXT: v_mov_b32_e32 
v12, s19 -; GFX8-NEXT: v_mov_b32_e32 v13, s20 -; GFX8-NEXT: v_mov_b32_e32 v14, s21 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s21 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s11 -; GFX8-NEXT: v_mov_b32_e32 v15, s10 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v15, s11 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 14, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 15, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NEXT: v_mov_b32_e32 v3, s25 +; GFX8-NEXT: v_mov_b32_e32 v14, s10 ; GFX8-NEXT: s_add_u32 s10, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s8 -; GFX8-NEXT: v_mov_b32_e32 v11, s22 -; GFX8-NEXT: v_mov_b32_e32 v12, s23 -; GFX8-NEXT: v_mov_b32_e32 v13, s24 -; GFX8-NEXT: v_mov_b32_e32 v14, s25 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s8 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 5, s8 +; GFX8-NEXT: v_bfe_i32 v2, v5, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v4, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 12, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 13, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 10, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 11, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 8, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 9, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 5, s8 ; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s8 -; GFX8-NEXT: v_bfe_i32 v11, v10, 0, 1 -; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v14, s11 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s8 ; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s8 ; GFX8-NEXT: s_add_u32 s8, s0, 0x60 -; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX8-NEXT: v_mov_b32_e32 v13, s10 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v12, s9 -; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v11, s8 +; GFX8-NEXT: v_bfe_i32 v2, v7, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v6, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: s_add_u32 s8, s0, 0x50 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s9 -; GFX8-NEXT: v_bfe_i32 v7, v6, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, s8 +; 
GFX8-NEXT: v_bfe_i32 v2, v9, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v8, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: s_add_u32 s8, s0, 64 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] -; GFX8-NEXT: v_mov_b32_e32 v11, s9 -; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s8 +; GFX8-NEXT: v_bfe_i32 v2, v11, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v10, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: s_add_u32 s8, s0, 48 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[3:6] -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v11, s9 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: v_mov_b32_e32 v10, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s6 +; GFX8-NEXT: v_bfe_i32 v2, v13, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v12, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 7, s6 ; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s6 ; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 2, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 3, s6 -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[1:4] -; GFX8-NEXT: v_bfe_i32 v18, v16, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s6 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GFX8-NEXT: v_bfe_i32 v12, v18, 0, 1 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s6 ; GFX8-NEXT: s_add_u32 s6, s0, 32 -; GFX8-NEXT: v_bfe_i32 v2, v1, 0, 1 -; GFX8-NEXT: v_bfe_i32 v16, v0, 0, 1 +; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 1 +; GFX8-NEXT: v_bfe_i32 v18, v17, 0, 1 +; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 @@ -5788,8 +5790,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v17, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX8-NEXT: v_bfe_i32 v14, v13, 0, 1 -; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1 +; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -5807,8 +5808,8 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX8-NEXT: v_bfe_i32 v4, v5, 0, 1 +; GFX8-NEXT: v_bfe_i32 v6, v5, 0, 1 +; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 @@ -6029,50 +6030,50 @@ define 
amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10003 ; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10005 -; GFX6-NEXT: s_bfe_u32 s8, s2, 0x10007 -; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10009 -; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000b -; GFX6-NEXT: s_bfe_u32 s15, s2, 0x1000d -; GFX6-NEXT: s_bfe_u32 s17, s2, 0x1000f -; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10011 -; GFX6-NEXT: s_bfe_u32 s21, s2, 0x10013 -; GFX6-NEXT: s_bfe_u32 s23, s2, 0x10015 -; GFX6-NEXT: s_bfe_u32 s25, s2, 0x10017 -; GFX6-NEXT: s_bfe_u32 s27, s2, 0x10019 -; GFX6-NEXT: s_bfe_u32 s29, s2, 0x1001b -; GFX6-NEXT: s_bfe_u32 s31, s2, 0x1001d -; GFX6-NEXT: s_lshr_b32 s34, s2, 31 -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s39, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000b -; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s42, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s47, s3, 0x10019 -; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s49, s3, 0x1001d -; GFX6-NEXT: s_lshr_b32 s50, s3, 31 -; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001 -; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10001 -; GFX6-NEXT: s_and_b32 s7, s2, 1 -; GFX6-NEXT: s_and_b32 s10, s3, 1 -; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10002 -; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10004 -; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10006 -; GFX6-NEXT: s_bfe_u32 s18, s2, 0x10008 -; GFX6-NEXT: s_bfe_u32 s20, s2, 0x1000a -; GFX6-NEXT: s_bfe_u32 s22, s2, 0x1000c -; GFX6-NEXT: s_bfe_u32 s24, s2, 0x1000e -; GFX6-NEXT: s_bfe_u32 s26, s2, 0x10010 -; GFX6-NEXT: s_bfe_u32 s28, s2, 0x10012 -; GFX6-NEXT: s_bfe_u32 s30, s2, 0x10014 -; GFX6-NEXT: s_bfe_u32 s33, s2, 0x10016 -; GFX6-NEXT: s_bfe_u32 s35, s2, 0x10018 +; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10007 +; GFX6-NEXT: s_bfe_u32 s7, s2, 0x10009 +; GFX6-NEXT: s_bfe_u32 s8, s2, 0x1000b +; GFX6-NEXT: s_bfe_u32 s9, s2, 0x1000d +; GFX6-NEXT: s_bfe_u32 s10, s2, 0x1000f +; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10011 +; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10013 +; GFX6-NEXT: s_bfe_u32 s13, s2, 0x10015 +; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10017 +; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10019 +; GFX6-NEXT: s_bfe_u32 s16, s2, 0x1001b +; GFX6-NEXT: s_bfe_u32 s17, s2, 0x1001d +; GFX6-NEXT: s_lshr_b32 s18, s2, 31 +; GFX6-NEXT: s_bfe_u32 s19, s3, 0x10003 +; GFX6-NEXT: s_bfe_u32 s20, s3, 0x10005 +; GFX6-NEXT: s_bfe_u32 s21, s3, 0x10007 +; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10009 +; GFX6-NEXT: s_bfe_u32 s23, s3, 0x1000b +; GFX6-NEXT: s_bfe_u32 s24, s3, 0x1000d +; GFX6-NEXT: s_bfe_u32 s25, s3, 0x1000f +; GFX6-NEXT: s_bfe_u32 s26, s3, 0x10011 +; GFX6-NEXT: s_bfe_u32 s27, s3, 0x10013 +; GFX6-NEXT: s_bfe_u32 s28, s3, 0x10015 +; GFX6-NEXT: s_bfe_u32 s29, s3, 0x10017 +; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10019 +; GFX6-NEXT: s_bfe_u32 s31, s3, 0x1001b +; GFX6-NEXT: s_bfe_u32 s33, s3, 0x1001d +; GFX6-NEXT: s_lshr_b32 s34, s3, 31 +; GFX6-NEXT: s_bfe_u32 s35, s3, 0x10001 +; GFX6-NEXT: s_bfe_u32 s36, s2, 0x10001 +; GFX6-NEXT: s_and_b32 s37, s2, 1 +; GFX6-NEXT: s_and_b32 s38, s3, 1 +; GFX6-NEXT: s_bfe_u32 s39, s2, 0x10002 +; GFX6-NEXT: s_bfe_u32 s40, s2, 0x10004 +; GFX6-NEXT: s_bfe_u32 s41, s2, 0x10006 +; GFX6-NEXT: s_bfe_u32 s42, s2, 0x10008 +; GFX6-NEXT: s_bfe_u32 s43, s2, 0x1000a +; GFX6-NEXT: s_bfe_u32 s44, s2, 0x1000c +; GFX6-NEXT: s_bfe_u32 s45, s2, 0x1000e +; GFX6-NEXT: 
s_bfe_u32 s46, s2, 0x10010 +; GFX6-NEXT: s_bfe_u32 s47, s2, 0x10012 +; GFX6-NEXT: s_bfe_u32 s48, s2, 0x10014 +; GFX6-NEXT: s_bfe_u32 s49, s2, 0x10016 +; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018 ; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001a ; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c ; GFX6-NEXT: s_bfe_u32 s53, s2, 0x1001e @@ -6096,486 +6097,482 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v0, s67 -; GFX6-NEXT: v_mov_b32_e32 v2, s50 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s68 -; GFX6-NEXT: v_mov_b32_e32 v2, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s33 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v2, s48 +; GFX6-NEXT: v_mov_b32_e32 v2, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NEXT: v_mov_b32_e32 v2, s47 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NEXT: v_mov_b32_e32 v2, s46 +; GFX6-NEXT: v_mov_b32_e32 v2, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NEXT: v_mov_b32_e32 v2, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v2, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v2, s25 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NEXT: v_mov_b32_e32 v2, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s58 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v2, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s56 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s55 -; GFX6-NEXT: v_mov_b32_e32 v2, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s54 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v2, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s53 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NEXT: v_mov_b32_e32 v2, s31 +; GFX6-NEXT: v_mov_b32_e32 v2, s17 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s35 -; GFX6-NEXT: v_mov_b32_e32 v2, s27 +; GFX6-NEXT: v_mov_b32_e32 v0, s50 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s33 -; GFX6-NEXT: v_mov_b32_e32 v2, s25 +; GFX6-NEXT: v_mov_b32_e32 v0, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s30 -; GFX6-NEXT: v_mov_b32_e32 v2, s23 +; GFX6-NEXT: v_mov_b32_e32 v0, s48 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NEXT: v_mov_b32_e32 v2, s21 +; GFX6-NEXT: v_mov_b32_e32 v0, s47 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s26 -; GFX6-NEXT: v_mov_b32_e32 v2, s19 +; GFX6-NEXT: v_mov_b32_e32 v0, s46 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NEXT: v_mov_b32_e32 v2, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s22 -; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mov_b32_e32 v2, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s39 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: 
v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v28, 0 +; GFX8-NEXT: v_mov_b32_e32 v32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s18 ; GFX8-NEXT: v_and_b32_e32 v18, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 11, s2 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2 -; GFX8-NEXT: s_lshr_b32 s31, s3, 24 -; GFX8-NEXT: s_lshr_b32 s24, s2, 24 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2 -; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10018 -; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10018 -; GFX8-NEXT: s_and_b32 s22, s3, 1 -; GFX8-NEXT: s_and_b32 s23, s2, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s2 -; GFX8-NEXT: s_bfe_u32 s25, s2, 0x10011 -; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10010 -; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10012 -; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10013 -; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10014 -; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10015 -; GFX8-NEXT: s_bfe_u32 s33, s2, 0x10016 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10017 -; GFX8-NEXT: s_bfe_u32 s34, s3, 0x10011 -; GFX8-NEXT: s_bfe_u32 s35, s3, 0x10010 -; GFX8-NEXT: s_bfe_u32 s36, s3, 0x10012 -; GFX8-NEXT: s_bfe_u32 s37, s3, 0x10013 -; GFX8-NEXT: s_bfe_u32 s38, s3, 0x10016 -; GFX8-NEXT: s_bfe_u32 s39, s3, 0x10017 -; GFX8-NEXT: s_bfe_u32 s40, s3, 0x10015 -; GFX8-NEXT: s_bfe_u32 s41, s3, 0x10014 -; GFX8-NEXT: s_add_u32 s4, s0, 0x1a0 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: s_add_u32 s6, s0, 0x1b0 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: s_add_u32 s8, s0, 0x190 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 11, s18 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s18 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s18 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s18 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s18 +; GFX8-NEXT: v_mov_b32_e32 v14, s18 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 14, s18 +; GFX8-NEXT: s_lshr_b32 s41, s19, 24 +; GFX8-NEXT: s_lshr_b32 s40, s18, 24 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s18 +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s18 +; GFX8-NEXT: v_lshrrev_b16_e64 v24, 6, s18 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 4, s18 +; GFX8-NEXT: v_and_b32_e32 v3, 
1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 2, s18 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s18 +; GFX8-NEXT: s_bfe_u32 s20, s18, 0x10018 +; GFX8-NEXT: s_bfe_u32 s21, s19, 0x10018 +; GFX8-NEXT: s_and_b32 s22, s19, 1 +; GFX8-NEXT: s_and_b32 s23, s18, 1 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s18 +; GFX8-NEXT: s_bfe_u32 s24, s18, 0x10011 +; GFX8-NEXT: s_bfe_u32 s25, s18, 0x10010 +; GFX8-NEXT: s_bfe_u32 s26, s18, 0x10012 +; GFX8-NEXT: s_bfe_u32 s27, s18, 0x10013 +; GFX8-NEXT: s_bfe_u32 s28, s18, 0x10014 +; GFX8-NEXT: s_bfe_u32 s29, s18, 0x10015 +; GFX8-NEXT: s_bfe_u32 s30, s18, 0x10016 +; GFX8-NEXT: s_bfe_u32 s18, s18, 0x10017 +; GFX8-NEXT: s_bfe_u32 s31, s19, 0x10011 +; GFX8-NEXT: s_bfe_u32 s33, s19, 0x10010 +; GFX8-NEXT: s_bfe_u32 s34, s19, 0x10012 +; GFX8-NEXT: s_bfe_u32 s35, s19, 0x10013 +; GFX8-NEXT: s_bfe_u32 s36, s19, 0x10016 +; GFX8-NEXT: s_bfe_u32 s37, s19, 0x10017 +; GFX8-NEXT: s_bfe_u32 s38, s19, 0x10015 +; GFX8-NEXT: s_bfe_u32 s39, s19, 0x10014 +; GFX8-NEXT: s_add_u32 s16, s0, 0x1a0 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: s_add_u32 s14, s0, 0x1b0 +; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: s_add_u32 s12, s0, 0x190 +; GFX8-NEXT: s_addc_u32 s13, s1, 0 ; GFX8-NEXT: s_add_u32 s10, s0, 0x180 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: s_add_u32 s12, s0, 0xb0 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: s_add_u32 s14, s0, 0xa0 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: s_add_u32 s16, s0, 0x90 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: s_add_u32 s18, s0, 0x80 -; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s3 +; GFX8-NEXT: s_add_u32 s8, s0, 0xb0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_add_u32 s6, s0, 0xa0 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: s_add_u32 s4, s0, 0x90 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x80 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s19 ; GFX8-NEXT: s_add_u32 s42, s0, 0x70 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v23, s42 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v24, s43 +; GFX8-NEXT: v_mov_b32_e32 v12, s42 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v11, v1 +; GFX8-NEXT: v_mov_b32_e32 v13, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x170 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[2:5] +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 14, s19 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v24, s42 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v22 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s3 -; GFX8-NEXT: v_mov_b32_e32 v25, s43 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v12, s42 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v15 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s19 +; GFX8-NEXT: v_mov_b32_e32 v13, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s31 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s41 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v21 -; 
GFX8-NEXT: v_lshrrev_b16_e64 v26, 7, s31 -; GFX8-NEXT: v_mov_b32_e32 v25, v1 -; GFX8-NEXT: v_mov_b32_e32 v27, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 11, s19 +; GFX8-NEXT: v_mov_b32_e32 v12, s42 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v16 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 7, s41 +; GFX8-NEXT: v_mov_b32_e32 v13, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s24 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[24:27] +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 6, s40 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v20 -; GFX8-NEXT: v_lshrrev_b16_e64 v26, 7, s24 -; GFX8-NEXT: v_mov_b32_e32 v25, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NEXT: v_mov_b32_e32 v12, s42 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v17 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 7, s40 +; GFX8-NEXT: v_mov_b32_e32 v13, s43 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GFX8-NEXT: s_add_u32 s42, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[24:27] +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 9, s19 +; GFX8-NEXT: v_and_b32_e32 v17, 1, v8 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s19 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v19 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, s42 -; GFX8-NEXT: v_mov_b32_e32 v27, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s43 +; GFX8-NEXT: v_mov_b32_e32 v12, s42 +; GFX8-NEXT: v_and_b32_e32 v26, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v18 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x50 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[24:27] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v17 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v16 -; GFX8-NEXT: v_mov_b32_e32 v16, s42 -; GFX8-NEXT: v_mov_b32_e32 v27, 0 -; GFX8-NEXT: v_mov_b32_e32 v17, s43 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 5, s19 +; GFX8-NEXT: v_mov_b32_e32 v12, s42 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v21 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 64 -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[24:27] -; GFX8-NEXT: v_mov_b32_e32 v19, 1 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v15 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v21, 1 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s42 -; GFX8-NEXT: v_and_b32_sdwa v24, v12, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v27, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s43 +; GFX8-NEXT: v_mov_b32_e32 v12, s42 +; GFX8-NEXT: v_and_b32_sdwa v8, v14, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v22 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[24:27] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v13 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s19 ; GFX8-NEXT: v_mov_b32_e32 v12, s42 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v14 -; GFX8-NEXT: v_mov_b32_e32 v27, 0 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v8 +; 
GFX8-NEXT: v_and_b32_e32 v8, 1, v24 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: v_mov_b32_e32 v13, s43 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GFX8-NEXT: s_add_u32 s42, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[24:27] +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s19 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v8 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 5, s41 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v10 -; GFX8-NEXT: v_mov_b32_e32 v10, s42 -; GFX8-NEXT: v_mov_b32_e32 v27, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s43 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 5, s31 -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[24:27] -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 3, s31 +; GFX8-NEXT: v_mov_b32_e32 v11, s42 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v6 +; GFX8-NEXT: v_mov_b32_e32 v10, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v12, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 16 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v12 -; GFX8-NEXT: v_and_b32_e32 v25, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 3, s41 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, s42 -; GFX8-NEXT: v_mov_b32_e32 v13, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, s43 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[10:13] -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s31 +; GFX8-NEXT: v_mov_b32_e32 v9, s42 +; GFX8-NEXT: v_and_b32_e32 v28, 1, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v10, s43 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 1, s41 ; GFX8-NEXT: s_add_u32 s42, s0, 0x160 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 12, s3 -; GFX8-NEXT: v_and_b32_e32 v27, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s42 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v23 -; GFX8-NEXT: v_mov_b32_e32 v13, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, s43 -; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[10:13] -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 5, s24 -; GFX8-NEXT: s_add_u32 s42, s0, 0x150 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s3 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v6 -; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s42 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v21 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v4 -; GFX8-NEXT: v_mov_b32_e32 v13, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, s43 -; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[10:13] -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 1, s24 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: s_add_u32 s42, s0, 0x140 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3 -; GFX8-NEXT: v_and_b32_sdwa v19, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v20 -; GFX8-NEXT: v_mov_b32_e32 v22, 0 -; GFX8-NEXT: v_mov_b32_e32 v20, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x130 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: 
v_lshrrev_b16_e64 v18, 6, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s3 -; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[19:22] -; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s42 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v2 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x120 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 4, s3 -; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[18:21] +; GFX8-NEXT: v_lshrrev_b16_e64 v25, 12, s19 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GFX8-NEXT: v_and_b32_e32 v30, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v2 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s3 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX8-NEXT: v_mov_b32_e32 v20, 0 -; GFX8-NEXT: v_mov_b32_e32 v18, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v25 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x110 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v5 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 2, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s24 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[5:8] +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 5, s40 +; GFX8-NEXT: s_add_u32 s42, s0, 0x150 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s19 +; GFX8-NEXT: v_and_b32_e32 v25, 1, v2 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s31 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v15 -; GFX8-NEXT: v_mov_b32_e32 v19, 0 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v16 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v15 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v23 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v0, s41 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v24 -; GFX8-NEXT: v_mov_b32_e32 v24, s5 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[5:8] +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s40 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v2 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 1, s40 +; GFX8-NEXT: v_lshrrev_b16_e64 v31, 4, s40 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 2, s40 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: s_add_u32 s40, s0, 0x140 +; GFX8-NEXT: v_mov_b32_e32 v0, s19 +; GFX8-NEXT: v_lshrrev_b16_e64 v27, 4, s41 +; GFX8-NEXT: v_lshrrev_b16_e64 v29, 2, s41 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v2 +; GFX8-NEXT: s_addc_u32 s41, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s40 +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NEXT: s_add_u32 s40, s0, 0x130 +; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s19 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[6:9] +; GFX8-NEXT: s_addc_u32 s41, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s40 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v18 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NEXT: s_add_u32 s40, s0, 0x120 +; GFX8-NEXT: v_lshrrev_b16_e64 v20, 4, s19 +; GFX8-NEXT: flat_store_dwordx4 
v[2:3], v[6:9] +; GFX8-NEXT: s_addc_u32 s41, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s40 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v11 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v20 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NEXT: s_add_u32 s40, s0, 0x110 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s19 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[10:13] +; GFX8-NEXT: s_addc_u32 s41, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s40 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GFX8-NEXT: v_mov_b32_e32 v17, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17] +; GFX8-NEXT: v_mov_b32_e32 v0, s39 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX8-NEXT: v_mov_b32_e32 v23, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v23, s4 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v24, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s38 -; GFX8-NEXT: v_mov_b32_e32 v2, s39 -; GFX8-NEXT: v_mov_b32_e32 v23, s6 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v24, s9 +; GFX8-NEXT: v_mov_b32_e32 v22, s16 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v23, s15 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v2, s37 -; GFX8-NEXT: v_mov_b32_e32 v23, s8 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v24, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s35 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v23, s10 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v24, s13 +; GFX8-NEXT: v_mov_b32_e32 v22, s14 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v23, s13 +; GFX8-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_mov_b32_e32 v22, s12 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v23, s11 ; GFX8-NEXT: v_mov_b32_e32 v0, s33 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v23, s12 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v24, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s29 -; GFX8-NEXT: v_mov_b32_e32 v2, s30 -; GFX8-NEXT: v_mov_b32_e32 v23, s14 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v24, s17 -; GFX8-NEXT: v_mov_b32_e32 v0, s27 -; GFX8-NEXT: v_mov_b32_e32 v2, s28 -; GFX8-NEXT: v_mov_b32_e32 v23, s16 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v24, s19 +; GFX8-NEXT: v_mov_b32_e32 v2, s31 +; GFX8-NEXT: v_mov_b32_e32 v22, s10 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v23, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v22, s8 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v23, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NEXT: v_mov_b32_e32 v2, s29 +; GFX8-NEXT: v_mov_b32_e32 v22, s6 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v23, s5 ; GFX8-NEXT: v_mov_b32_e32 v0, s26 -; GFX8-NEXT: v_mov_b32_e32 v2, s25 -; GFX8-NEXT: v_mov_b32_e32 v23, s18 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 1, s3 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v24, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s27 +; 
GFX8-NEXT: v_mov_b32_e32 v22, s4 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v23, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s25 +; GFX8-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NEXT: v_mov_b32_e32 v22, s2 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v23, s1 ; GFX8-NEXT: s_add_u32 s2, s0, 0x100 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 -; GFX8-NEXT: v_mov_b32_e32 v23, s0 +; GFX8-NEXT: v_mov_b32_e32 v22, s0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX8-NEXT: v_mov_b32_e32 v14, 0 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v23, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: v_mov_b32_e32 v23, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, v15 +; GFX8-NEXT: v_mov_b32_e32 v3, v19 +; GFX8-NEXT: v_mov_b32_e32 v22, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v22, 0 -; GFX8-NEXT: v_mov_b32_e32 v20, v1 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v27 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; GFX8-NEXT: v_mov_b32_e32 v21, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[19:22] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[18:21] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s31 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v26 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v25 -; GFX8-NEXT: v_mov_b32_e32 v18, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, v1 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v29 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v28 +; GFX8-NEXT: v_mov_b32_e32 v17, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, v1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18] +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v30 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17] ; GFX8-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, v27 -; GFX8-NEXT: v_mov_b32_e32 v3, v28 +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v11 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s24 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v31 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v25 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, v1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s24 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[10:13] ; GFX8-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v13, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v2, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, v12 -; GFX8-NEXT: v_mov_b32_e32 v3, v13 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, v32 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -6587,12 +6584,12 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: ALU 95, @41, KC0[], KC1[] ; EG-NEXT: ALU 99, @137, KC0[CB0:0-32], KC1[] ; EG-NEXT: ALU 60, @237, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T82.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T81.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T80.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T79.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T78.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T77.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T82.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T81.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T80.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T79.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T78.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T77.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T76.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T75.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T74.X, 0 @@ -6618,149 +6615,149 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T54.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T53.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T52.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T51.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T51.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 38: -; EG-NEXT: VTX_READ_64 T25.XY, T19.X, 0, #1 +; EG-NEXT: VTX_READ_64 T19.XY, T19.X, 0, #1 ; EG-NEXT: ALU clause starting at 40: ; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 41: -; EG-NEXT: LSHR * T19.Z, T25.Y, literal.x, +; EG-NEXT: LSHR * T20.Z, T19.Y, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T19.X, T25.Y, literal.x, 1, -; EG-NEXT: MOV T19.Y, 0.0, -; EG-NEXT: BFE_UINT * T20.Z, T25.Y, literal.y, 1, -; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44) -; EG-NEXT: BFE_UINT T20.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T20.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T20.Y, 0.0, -; EG-NEXT: BFE_UINT * T21.Z, T25.Y, literal.y, 1, -; EG-NEXT: 28(3.923636e-44), 27(3.783506e-44) -; EG-NEXT: BFE_UINT T21.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT * T21.Z, T19.Y, literal.y, 1, +; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44) +; EG-NEXT: BFE_UINT T21.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T21.Y, 0.0, -; EG-NEXT: BFE_UINT * T22.Z, T25.Y, literal.y, 1, -; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44) -; EG-NEXT: BFE_UINT T22.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT * T22.Z, T19.Y, literal.y, 1, +; EG-NEXT: 28(3.923636e-44), 
27(3.783506e-44) +; EG-NEXT: BFE_UINT T22.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T22.Y, 0.0, -; EG-NEXT: BFE_UINT * T23.Z, T25.Y, literal.y, 1, -; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44) -; EG-NEXT: BFE_UINT T23.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT * T23.Z, T19.Y, literal.y, 1, +; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44) +; EG-NEXT: BFE_UINT T23.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T23.Y, 0.0, -; EG-NEXT: BFE_UINT * T24.Z, T25.Y, literal.y, 1, -; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44) -; EG-NEXT: BFE_UINT T24.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT * T24.Z, T19.Y, literal.y, 1, +; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44) +; EG-NEXT: BFE_UINT T24.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T24.Y, 0.0, -; EG-NEXT: BFE_UINT * T26.Z, T25.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T25.Z, T19.Y, literal.y, 1, +; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44) +; EG-NEXT: BFE_UINT T25.X, T19.Y, literal.x, 1, +; EG-NEXT: MOV T25.Y, 0.0, +; EG-NEXT: BFE_UINT * T26.Z, T19.Y, literal.y, 1, ; EG-NEXT: 20(2.802597e-44), 19(2.662467e-44) -; EG-NEXT: BFE_UINT T26.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T26.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T26.Y, 0.0, -; EG-NEXT: BFE_UINT * T27.Z, T25.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T27.Z, T19.Y, literal.y, 1, ; EG-NEXT: 18(2.522337e-44), 17(2.382207e-44) -; EG-NEXT: BFE_UINT T27.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T27.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T27.Y, 0.0, -; EG-NEXT: BFE_UINT * T28.Z, T25.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T28.Z, T19.Y, literal.y, 1, ; EG-NEXT: 16(2.242078e-44), 15(2.101948e-44) -; EG-NEXT: BFE_UINT T28.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T28.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T28.Y, 0.0, -; EG-NEXT: BFE_UINT * T29.Z, T25.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T29.Z, T19.Y, literal.y, 1, ; EG-NEXT: 14(1.961818e-44), 13(1.821688e-44) -; EG-NEXT: BFE_UINT T29.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T29.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T29.Y, 0.0, -; EG-NEXT: BFE_UINT * T30.Z, T25.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T30.Z, T19.Y, literal.y, 1, ; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44) -; EG-NEXT: BFE_UINT T30.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T30.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T30.Y, 0.0, -; EG-NEXT: BFE_UINT * T31.Z, T25.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T31.Z, T19.Y, literal.y, 1, ; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44) -; EG-NEXT: BFE_UINT T31.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T31.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T31.Y, 0.0, -; EG-NEXT: BFE_UINT * T32.Z, T25.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T32.Z, T19.Y, literal.y, 1, ; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45) -; EG-NEXT: BFE_UINT T32.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T32.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T32.Y, 0.0, -; EG-NEXT: BFE_UINT * T33.Z, T25.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T33.Z, T19.Y, literal.y, 1, ; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45) -; EG-NEXT: BFE_UINT T33.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T33.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T33.Y, 0.0, -; EG-NEXT: BFE_UINT * T34.Z, T25.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T34.Z, T19.Y, literal.y, 1, ; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) -; EG-NEXT: BFE_UINT T34.X, T25.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T34.X, T19.Y, literal.x, 1, ; EG-NEXT: MOV T34.Y, 0.0, -; EG-NEXT: BFE_UINT T35.Z, T25.Y, 1, 1, -; EG-NEXT: AND_INT * T35.X, T25.Y, 1, +; EG-NEXT: BFE_UINT T35.Z, T19.Y, 1, 1, +; EG-NEXT: AND_INT * T35.X, T19.Y, 1, ; EG-NEXT: 2(2.802597e-45), 
0(0.000000e+00) ; EG-NEXT: MOV T35.Y, 0.0, -; EG-NEXT: LSHR * T36.Z, T25.X, literal.x, +; EG-NEXT: LSHR * T36.Z, T19.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T36.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T36.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T36.Y, 0.0, -; EG-NEXT: BFE_UINT * T37.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T37.Z, T19.X, literal.y, 1, ; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44) -; EG-NEXT: BFE_UINT T37.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T37.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T37.Y, 0.0, -; EG-NEXT: BFE_UINT * T38.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T38.Z, T19.X, literal.y, 1, ; EG-NEXT: 28(3.923636e-44), 27(3.783506e-44) -; EG-NEXT: BFE_UINT T38.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T38.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T38.Y, 0.0, -; EG-NEXT: BFE_UINT * T39.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T39.Z, T19.X, literal.y, 1, ; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44) -; EG-NEXT: BFE_UINT T39.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T39.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T39.Y, 0.0, -; EG-NEXT: BFE_UINT * T40.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T40.Z, T19.X, literal.y, 1, ; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44) -; EG-NEXT: BFE_UINT T40.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T40.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T40.Y, 0.0, -; EG-NEXT: BFE_UINT * T41.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T41.Z, T19.X, literal.y, 1, ; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44) -; EG-NEXT: BFE_UINT T41.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T41.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T41.Y, 0.0, -; EG-NEXT: BFE_UINT * T42.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T42.Z, T19.X, literal.y, 1, ; EG-NEXT: 20(2.802597e-44), 19(2.662467e-44) -; EG-NEXT: BFE_UINT T42.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T42.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T42.Y, 0.0, -; EG-NEXT: BFE_UINT * T43.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T43.Z, T19.X, literal.y, 1, ; EG-NEXT: 18(2.522337e-44), 17(2.382207e-44) -; EG-NEXT: BFE_UINT * T43.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T43.X, T19.X, literal.x, 1, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 137: ; EG-NEXT: MOV T43.Y, 0.0, -; EG-NEXT: BFE_UINT * T44.Z, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T44.Z, T19.X, literal.x, 1, ; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T44.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T44.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T44.Y, 0.0, -; EG-NEXT: BFE_UINT * T45.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T45.Z, T19.X, literal.y, 1, ; EG-NEXT: 14(1.961818e-44), 13(1.821688e-44) -; EG-NEXT: BFE_UINT T45.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T45.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T45.Y, 0.0, -; EG-NEXT: BFE_UINT * T46.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T46.Z, T19.X, literal.y, 1, ; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44) -; EG-NEXT: BFE_UINT T46.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T46.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T46.Y, 0.0, -; EG-NEXT: BFE_UINT * T47.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T47.Z, T19.X, literal.y, 1, ; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44) -; EG-NEXT: BFE_UINT T47.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T47.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T47.Y, 0.0, -; EG-NEXT: BFE_UINT * T48.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T48.Z, T19.X, literal.y, 1, ; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45) -; EG-NEXT: 
BFE_UINT T48.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T48.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T48.Y, 0.0, -; EG-NEXT: BFE_UINT * T49.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T49.Z, T19.X, literal.y, 1, ; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45) -; EG-NEXT: BFE_UINT T49.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T49.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T49.Y, 0.0, -; EG-NEXT: BFE_UINT * T50.Z, T25.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T50.Z, T19.X, literal.y, 1, ; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) -; EG-NEXT: BFE_UINT T50.X, T25.X, literal.x, 1, +; EG-NEXT: BFE_UINT T50.X, T19.X, literal.x, 1, ; EG-NEXT: MOV T50.Y, 0.0, -; EG-NEXT: BFE_UINT T25.Z, T25.X, 1, 1, -; EG-NEXT: AND_INT * T25.X, T25.X, 1, +; EG-NEXT: BFE_UINT T19.Z, T19.X, 1, 1, +; EG-NEXT: AND_INT * T19.X, T19.X, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T25.Y, 0.0, -; EG-NEXT: MOV T19.W, 0.0, -; EG-NEXT: MOV * T20.W, 0.0, -; EG-NEXT: MOV T21.W, 0.0, -; EG-NEXT: MOV * T22.W, 0.0, -; EG-NEXT: MOV T23.W, 0.0, -; EG-NEXT: MOV * T24.W, 0.0, +; EG-NEXT: MOV T19.Y, 0.0, +; EG-NEXT: MOV T20.W, 0.0, +; EG-NEXT: MOV * T21.W, 0.0, +; EG-NEXT: MOV T22.W, 0.0, +; EG-NEXT: MOV * T23.W, 0.0, +; EG-NEXT: MOV T24.W, 0.0, +; EG-NEXT: MOV * T25.W, 0.0, ; EG-NEXT: MOV T26.W, 0.0, ; EG-NEXT: MOV * T27.W, 0.0, ; EG-NEXT: MOV T28.W, 0.0, @@ -6786,7 +6783,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: MOV T48.W, 0.0, ; EG-NEXT: MOV * T49.W, 0.0, ; EG-NEXT: MOV T50.W, 0.0, -; EG-NEXT: MOV * T25.W, 0.0, +; EG-NEXT: MOV * T19.W, 0.0, ; EG-NEXT: LSHR T51.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -6899,352 +6896,353 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s48, s5, 30 -; GFX6-NEXT: s_lshr_b32 s46, s5, 28 -; GFX6-NEXT: s_lshr_b32 s44, s5, 29 -; GFX6-NEXT: s_lshr_b32 s40, s5, 26 -; GFX6-NEXT: s_lshr_b32 s42, s5, 27 -; GFX6-NEXT: s_lshr_b32 s36, s5, 24 -; GFX6-NEXT: s_lshr_b32 s38, s5, 25 -; GFX6-NEXT: s_lshr_b32 s30, s5, 22 -; GFX6-NEXT: s_lshr_b32 s34, s5, 23 -; GFX6-NEXT: s_lshr_b32 s26, s5, 20 -; GFX6-NEXT: s_lshr_b32 s28, s5, 21 -; GFX6-NEXT: s_lshr_b32 s22, s5, 18 -; GFX6-NEXT: s_lshr_b32 s24, s5, 19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 16 -; GFX6-NEXT: s_lshr_b32 s20, s5, 17 -; GFX6-NEXT: s_lshr_b32 s14, s5, 14 -; GFX6-NEXT: s_lshr_b32 s16, s5, 15 -; GFX6-NEXT: s_lshr_b32 s10, s5, 12 -; GFX6-NEXT: s_lshr_b32 s12, s5, 13 -; GFX6-NEXT: s_lshr_b32 s6, s5, 10 -; GFX6-NEXT: s_lshr_b32 s8, s5, 11 -; GFX6-NEXT: s_mov_b32 s50, s5 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[4:5], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s5, 8 -; GFX6-NEXT: v_mov_b32_e32 v4, s52 -; GFX6-NEXT: v_mov_b32_e32 v5, s53 -; GFX6-NEXT: s_lshr_b32 s52, s5, 9 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[46:47], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s46, s5, 6 -; GFX6-NEXT: v_mov_b32_e32 v10, s54 -; GFX6-NEXT: v_mov_b32_e32 v11, s55 -; GFX6-NEXT: s_lshr_b32 s48, s5, 7 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: v_mov_b32_e32 v12, s44 -; GFX6-NEXT: v_mov_b32_e32 v13, s45 -; 
GFX6-NEXT: s_lshr_b32 s44, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s40 -; GFX6-NEXT: v_mov_b32_e32 v15, s41 -; GFX6-NEXT: s_lshr_b32 s42, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s54 -; GFX6-NEXT: v_mov_b32_e32 v17, s55 -; GFX6-NEXT: s_lshr_b32 s40, s5, 2 -; GFX6-NEXT: v_mov_b32_e32 v8, s7 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v9, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:496 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s36 -; GFX6-NEXT: v_mov_b32_e32 v7, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 3 -; GFX6-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 1 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:480 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s30 -; GFX6-NEXT: v_mov_b32_e32 v11, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 30 -; GFX6-NEXT: v_mov_b32_e32 v12, s34 -; GFX6-NEXT: v_mov_b32_e32 v13, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 31 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:464 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s26 -; GFX6-NEXT: v_mov_b32_e32 v15, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 28 -; GFX6-NEXT: v_mov_b32_e32 v16, s28 -; GFX6-NEXT: v_mov_b32_e32 v17, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 29 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:448 +; GFX6-NEXT: s_lshr_b32 s6, s5, 30 +; GFX6-NEXT: s_lshr_b32 s8, s5, 28 +; GFX6-NEXT: s_lshr_b32 s10, s5, 29 +; GFX6-NEXT: s_lshr_b32 s12, s5, 26 +; GFX6-NEXT: s_lshr_b32 s16, s5, 27 +; GFX6-NEXT: s_lshr_b32 s18, s5, 24 +; GFX6-NEXT: s_lshr_b32 s24, s5, 25 +; GFX6-NEXT: s_lshr_b32 s26, s5, 22 +; GFX6-NEXT: s_lshr_b32 s28, s5, 23 +; GFX6-NEXT: s_lshr_b32 s34, s5, 20 +; GFX6-NEXT: s_lshr_b32 s36, s5, 21 +; GFX6-NEXT: s_lshr_b32 s42, s5, 18 +; GFX6-NEXT: s_lshr_b32 s44, s5, 19 +; GFX6-NEXT: s_lshr_b32 s46, s5, 16 +; GFX6-NEXT: s_lshr_b32 s48, s5, 17 +; GFX6-NEXT: s_lshr_b32 s50, s5, 14 +; GFX6-NEXT: s_lshr_b32 s52, s5, 15 +; GFX6-NEXT: s_lshr_b32 s54, s5, 12 +; GFX6-NEXT: s_lshr_b32 s56, s5, 13 +; GFX6-NEXT: s_lshr_b32 s30, s5, 10 +; GFX6-NEXT: s_lshr_b32 s58, s5, 11 +; GFX6-NEXT: s_mov_b32 s14, s5 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: s_lshr_b32 s22, s5, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: s_lshr_b32 s60, s5, 9 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: s_lshr_b32 s20, s5, 6 +; GFX6-NEXT: v_mov_b32_e32 v8, s8 +; GFX6-NEXT: v_mov_b32_e32 v9, s9 +; GFX6-NEXT: s_lshr_b32 s38, s5, 7 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x10000 +; GFX6-NEXT: s_ashr_i32 s10, s5, 31 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v11, s7 +; GFX6-NEXT: s_lshr_b32 s14, s5, 4 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[12:13], 
0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s6 +; GFX6-NEXT: v_mov_b32_e32 v13, s7 +; GFX6-NEXT: s_lshr_b32 s62, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v14, s8 +; GFX6-NEXT: v_mov_b32_e32 v15, s9 +; GFX6-NEXT: s_lshr_b32 s16, s5, 2 +; GFX6-NEXT: v_mov_b32_e32 v6, s10 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v7, s10 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s22 -; GFX6-NEXT: v_mov_b32_e32 v7, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 26 -; GFX6-NEXT: v_mov_b32_e32 v8, s24 -; GFX6-NEXT: v_mov_b32_e32 v9, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 27 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:432 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: s_lshr_b32 s24, s5, 3 +; GFX6-NEXT: v_mov_b32_e32 v6, s8 +; GFX6-NEXT: v_mov_b32_e32 v7, s9 +; GFX6-NEXT: s_lshr_b32 s40, s5, 1 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s18 -; GFX6-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NEXT: s_lshr_b32 s20, s4, 24 -; GFX6-NEXT: v_mov_b32_e32 v12, s54 -; GFX6-NEXT: v_mov_b32_e32 v13, s55 -; GFX6-NEXT: s_lshr_b32 s18, s4, 25 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:416 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_mov_b32_e32 v9, s7 +; GFX6-NEXT: s_lshr_b32 s26, s4, 30 +; GFX6-NEXT: v_mov_b32_e32 v10, s8 +; GFX6-NEXT: v_mov_b32_e32 v11, s9 +; GFX6-NEXT: s_lshr_b32 s64, s4, 31 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[34:35], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s14 -; GFX6-NEXT: v_mov_b32_e32 v15, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 22 -; GFX6-NEXT: v_mov_b32_e32 v16, s16 -; GFX6-NEXT: v_mov_b32_e32 v17, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 23 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:400 +; GFX6-NEXT: v_mov_b32_e32 v12, s8 +; GFX6-NEXT: v_mov_b32_e32 v13, s9 +; GFX6-NEXT: s_lshr_b32 s34, s4, 28 +; GFX6-NEXT: v_mov_b32_e32 v14, s6 +; GFX6-NEXT: v_mov_b32_e32 v15, s7 +; GFX6-NEXT: s_lshr_b32 s66, s4, 29 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[42:43], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_mov_b32_e32 v7, s11 -; GFX6-NEXT: s_lshr_b32 s10, s4, 20 -; GFX6-NEXT: v_mov_b32_e32 v8, s12 -; GFX6-NEXT: v_mov_b32_e32 v9, s13 -; GFX6-NEXT: s_lshr_b32 s12, s4, 21 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:384 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NEXT: s_lshr_b32 s28, s4, 26 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v7, s7 +; 
GFX6-NEXT: s_lshr_b32 s42, s4, 27 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[46:47], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, s8 +; GFX6-NEXT: v_mov_b32_e32 v9, s9 +; GFX6-NEXT: s_lshr_b32 s36, s4, 24 ; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: v_mov_b32_e32 v11, s7 -; GFX6-NEXT: s_lshr_b32 s6, s4, 18 +; GFX6-NEXT: s_lshr_b32 s6, s4, 25 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[50:51], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s8 ; GFX6-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 19 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:368 +; GFX6-NEXT: s_lshr_b32 s8, s4, 22 +; GFX6-NEXT: v_mov_b32_e32 v14, s10 +; GFX6-NEXT: v_mov_b32_e32 v15, s11 +; GFX6-NEXT: s_lshr_b32 s10, s4, 23 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[56:57], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[54:55], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s50 -; GFX6-NEXT: v_mov_b32_e32 v15, s51 -; GFX6-NEXT: s_lshr_b32 s50, s4, 16 -; GFX6-NEXT: v_mov_b32_e32 v16, s52 -; GFX6-NEXT: v_mov_b32_e32 v17, s53 -; GFX6-NEXT: s_lshr_b32 s52, s4, 17 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:352 +; GFX6-NEXT: v_mov_b32_e32 v4, s12 +; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: s_lshr_b32 s12, s4, 20 +; GFX6-NEXT: v_mov_b32_e32 v6, s18 +; GFX6-NEXT: v_mov_b32_e32 v7, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 21 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[58:59], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, s30 +; GFX6-NEXT: v_mov_b32_e32 v9, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 18 +; GFX6-NEXT: v_mov_b32_e32 v10, s44 +; GFX6-NEXT: v_mov_b32_e32 v11, s45 +; GFX6-NEXT: s_lshr_b32 s44, s4, 19 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[60:61], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v12, s22 +; GFX6-NEXT: v_mov_b32_e32 v13, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 16 +; GFX6-NEXT: v_mov_b32_e32 v14, s46 +; GFX6-NEXT: v_mov_b32_e32 v15, s47 +; GFX6-NEXT: s_lshr_b32 s46, s4, 17 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:352 +; GFX6-NEXT: v_mov_b32_e32 v16, s20 +; GFX6-NEXT: v_mov_b32_e32 v17, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 14 +; GFX6-NEXT: v_mov_b32_e32 v18, s38 +; GFX6-NEXT: v_mov_b32_e32 v19, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 15 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[62:63], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336 +; GFX6-NEXT: s_waitcnt expcnt(1) +; GFX6-NEXT: v_mov_b32_e32 v6, s14 +; GFX6-NEXT: v_mov_b32_e32 v7, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 12 ; GFX6-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NEXT: v_mov_b32_e32 v6, s46 -; GFX6-NEXT: v_mov_b32_e32 v7, s47 -; GFX6-NEXT: s_lshr_b32 s46, s4, 14 ; GFX6-NEXT: v_mov_b32_e32 v8, s48 ; GFX6-NEXT: v_mov_b32_e32 v9, s49 -; GFX6-NEXT: s_lshr_b32 s48, s4, 15 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[44:45], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:336 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 12 -; GFX6-NEXT: v_mov_b32_e32 v12, s54 -; GFX6-NEXT: v_mov_b32_e32 v13, s55 -; GFX6-NEXT: s_lshr_b32 s44, s4, 13 +; GFX6-NEXT: s_lshr_b32 s48, s4, 13 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320 +; GFX6-NEXT: v_mov_b32_e32 v10, s16 +; GFX6-NEXT: v_mov_b32_e32 v11, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 10 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s40 -; GFX6-NEXT: v_mov_b32_e32 v15, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 10 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v16, s36 -; GFX6-NEXT: v_mov_b32_e32 v17, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v12, s24 +; GFX6-NEXT: v_mov_b32_e32 v13, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 11 +; GFX6-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 8 +; GFX6-NEXT: s_bfe_i64 s[50:51], s[64:65], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304 +; GFX6-NEXT: v_mov_b32_e32 v14, s26 +; GFX6-NEXT: v_mov_b32_e32 v15, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 9 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v16, s50 +; GFX6-NEXT: v_mov_b32_e32 v17, s51 +; GFX6-NEXT: s_lshr_b32 s50, s4, 6 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:304 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[66:67], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s30 -; GFX6-NEXT: v_mov_b32_e32 v7, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v8, s34 -; GFX6-NEXT: v_mov_b32_e32 v9, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s34 +; GFX6-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v8, s52 +; GFX6-NEXT: v_mov_b32_e32 v9, s53 +; GFX6-NEXT: s_lshr_b32 s52, s4, 4 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:288 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v12, s28 -; GFX6-NEXT: v_mov_b32_e32 v13, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; 
GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:272 +; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s22 -; GFX6-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s24 -; GFX6-NEXT: v_mov_b32_e32 v17, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 +; GFX6-NEXT: v_mov_b32_e32 v10, s28 +; GFX6-NEXT: v_mov_b32_e32 v11, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v12, s42 +; GFX6-NEXT: v_mov_b32_e32 v13, s43 +; GFX6-NEXT: s_lshr_b32 s42, s4, 2 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 3 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 3 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:224 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 offset:176 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 +; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NEXT: v_mov_b32_e32 v1, s47 -; GFX6-NEXT: v_mov_b32_e32 v2, s48 -; GFX6-NEXT: v_mov_b32_e32 v3, s49 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: v_mov_b32_e32 v5, s11 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s38 -; GFX6-NEXT: v_mov_b32_e32 v1, s39 ; GFX6-NEXT: v_mov_b32_e32 v2, s30 ; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s34 -; GFX6-NEXT: v_mov_b32_e32 v1, s35 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: v_mov_b32_e32 v4, s44 +; GFX6-NEXT: v_mov_b32_e32 v5, s45 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NEXT: v_mov_b32_e32 v1, s29 ; GFX6-NEXT: v_mov_b32_e32 v2, s22 ; GFX6-NEXT: v_mov_b32_e32 v3, s23 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NEXT: v_mov_b32_e32 v4, s46 +; GFX6-NEXT: v_mov_b32_e32 v5, s47 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NEXT: v_mov_b32_e32 v1, s25 ; GFX6-NEXT: 
v_mov_b32_e32 v2, s20 ; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: v_mov_b32_e32 v7, s5 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v4, s38 +; GFX6-NEXT: v_mov_b32_e32 v5, s39 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: v_mov_b32_e32 v4, s48 +; GFX6-NEXT: v_mov_b32_e32 v5, s49 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v4, s24 +; GFX6-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: v_mov_b32_e32 v4, s26 +; GFX6-NEXT: v_mov_b32_e32 v5, s27 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s50 +; GFX6-NEXT: v_mov_b32_e32 v3, s51 +; GFX6-NEXT: v_mov_b32_e32 v4, s34 +; GFX6-NEXT: v_mov_b32_e32 v5, s35 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s52 +; GFX6-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: v_mov_b32_e32 v4, s36 +; GFX6-NEXT: v_mov_b32_e32 v5, s37 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b32 s13, 0 -; GFX8-NEXT: s_mov_b32 s11, s13 +; GFX8-NEXT: s_mov_b32 s9, 0 +; GFX8-NEXT: s_mov_b32 s11, s9 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s16, s9, 22 -; GFX8-NEXT: s_lshr_b32 s18, s9, 23 -; GFX8-NEXT: s_lshr_b32 s20, s9, 20 -; GFX8-NEXT: s_lshr_b32 s22, s9, 21 -; GFX8-NEXT: s_lshr_b32 s24, s9, 18 -; GFX8-NEXT: s_lshr_b32 s26, s9, 19 -; GFX8-NEXT: s_lshr_b32 s28, s9, 16 -; GFX8-NEXT: s_lshr_b32 s30, s9, 17 -; GFX8-NEXT: s_lshr_b32 s34, s8, 22 -; GFX8-NEXT: s_lshr_b32 s36, s8, 23 -; GFX8-NEXT: s_lshr_b32 s38, s8, 20 -; GFX8-NEXT: s_lshr_b32 s40, s8, 21 -; GFX8-NEXT: s_lshr_b32 s42, s8, 18 -; GFX8-NEXT: s_lshr_b32 s44, s8, 19 -; GFX8-NEXT: s_lshr_b32 s46, s8, 16 -; GFX8-NEXT: s_lshr_b32 s48, s8, 17 -; GFX8-NEXT: s_mov_b32 s12, s9 -; GFX8-NEXT: s_lshr_b32 s10, s9, 24 -; GFX8-NEXT: s_lshr_b32 s6, s8, 24 +; GFX8-NEXT: s_lshr_b32 s16, s13, 22 +; GFX8-NEXT: s_lshr_b32 s18, s13, 23 +; GFX8-NEXT: s_lshr_b32 s20, s13, 20 +; GFX8-NEXT: s_lshr_b32 s22, s13, 21 +; GFX8-NEXT: s_lshr_b32 s24, s13, 18 +; GFX8-NEXT: s_lshr_b32 s26, s13, 19 +; GFX8-NEXT: s_lshr_b32 s28, s13, 16 +; GFX8-NEXT: s_lshr_b32 s30, s13, 17 +; GFX8-NEXT: s_lshr_b32 s34, s12, 22 +; GFX8-NEXT: s_lshr_b32 s36, s12, 23 +; 
GFX8-NEXT: s_lshr_b32 s38, s12, 20 +; GFX8-NEXT: s_lshr_b32 s40, s12, 21 +; GFX8-NEXT: s_lshr_b32 s42, s12, 18 +; GFX8-NEXT: s_lshr_b32 s44, s12, 19 +; GFX8-NEXT: s_lshr_b32 s46, s12, 16 +; GFX8-NEXT: s_lshr_b32 s48, s12, 17 +; GFX8-NEXT: s_mov_b32 s8, s13 +; GFX8-NEXT: s_lshr_b32 s10, s13, 24 +; GFX8-NEXT: s_lshr_b32 s6, s12, 24 ; GFX8-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[14:15], s[8:9], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[12:13], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 @@ -7261,265 +7259,268 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v11, s16 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: s_add_u32 s16, s0, 0x1b0 -; GFX8-NEXT: v_mov_b32_e32 v12, s17 +; GFX8-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_mov_b32_e32 v13, s18 -; GFX8-NEXT: v_mov_b32_e32 v14, s19 +; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NEXT: v_mov_b32_e32 v16, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0x1a0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_mov_b32_e32 v11, s20 -; GFX8-NEXT: v_mov_b32_e32 v12, s21 -; GFX8-NEXT: v_mov_b32_e32 v13, s22 -; GFX8-NEXT: v_mov_b32_e32 v14, s23 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: v_mov_b32_e32 v7, s23 ; GFX8-NEXT: v_mov_b32_e32 v16, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0x190 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_mov_b32_e32 v11, s24 -; GFX8-NEXT: v_mov_b32_e32 v12, s25 -; GFX8-NEXT: v_mov_b32_e32 v13, s26 -; GFX8-NEXT: v_mov_b32_e32 v14, s27 +; GFX8-NEXT: v_mov_b32_e32 v4, s24 +; GFX8-NEXT: v_mov_b32_e32 v5, s25 +; GFX8-NEXT: v_mov_b32_e32 v6, s26 +; GFX8-NEXT: v_mov_b32_e32 v7, s27 ; GFX8-NEXT: v_mov_b32_e32 v16, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0x180 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_mov_b32_e32 v11, s28 -; GFX8-NEXT: v_mov_b32_e32 v12, s29 -; GFX8-NEXT: v_mov_b32_e32 v13, s30 -; GFX8-NEXT: v_mov_b32_e32 v14, s31 +; GFX8-NEXT: v_mov_b32_e32 v4, s28 +; GFX8-NEXT: v_mov_b32_e32 v5, s29 +; GFX8-NEXT: v_mov_b32_e32 v6, s30 +; GFX8-NEXT: v_mov_b32_e32 v7, s31 ; GFX8-NEXT: v_mov_b32_e32 v16, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0xb0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_mov_b32_e32 v11, s34 -; GFX8-NEXT: v_mov_b32_e32 v12, s35 -; GFX8-NEXT: v_mov_b32_e32 v13, s36 -; GFX8-NEXT: v_mov_b32_e32 v14, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_mov_b32_e32 
v6, s36 +; GFX8-NEXT: v_mov_b32_e32 v7, s37 ; GFX8-NEXT: v_mov_b32_e32 v16, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0xa0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_mov_b32_e32 v11, s38 -; GFX8-NEXT: v_mov_b32_e32 v12, s39 -; GFX8-NEXT: v_mov_b32_e32 v13, s40 -; GFX8-NEXT: v_mov_b32_e32 v14, s41 +; GFX8-NEXT: v_mov_b32_e32 v4, s38 +; GFX8-NEXT: v_mov_b32_e32 v5, s39 +; GFX8-NEXT: v_mov_b32_e32 v6, s40 +; GFX8-NEXT: v_mov_b32_e32 v7, s41 ; GFX8-NEXT: v_mov_b32_e32 v16, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_mov_b32_e32 v11, s42 -; GFX8-NEXT: v_mov_b32_e32 v12, s43 -; GFX8-NEXT: v_mov_b32_e32 v13, s44 -; GFX8-NEXT: v_mov_b32_e32 v14, s45 +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NEXT: v_mov_b32_e32 v6, s44 +; GFX8-NEXT: v_mov_b32_e32 v7, s45 ; GFX8-NEXT: v_mov_b32_e32 v16, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0x80 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s8 -; GFX8-NEXT: v_mov_b32_e32 v11, s46 -; GFX8-NEXT: v_mov_b32_e32 v12, s47 -; GFX8-NEXT: v_mov_b32_e32 v13, s48 -; GFX8-NEXT: v_mov_b32_e32 v14, s49 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 14, s12 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 15, s12 +; GFX8-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NEXT: v_mov_b32_e32 v6, s48 +; GFX8-NEXT: v_mov_b32_e32 v7, s49 ; GFX8-NEXT: v_mov_b32_e32 v16, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0x70 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX8-NEXT: v_bfe_i32 v11, v10, 0, 1 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s8 -; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: v_bfe_i32 v6, v9, 0, 1 +; GFX8-NEXT: v_bfe_i32 v4, v8, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v8, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 12, s12 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 13, s12 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX8-NEXT: v_mov_b32_e32 v9, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s8 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: v_bfe_i32 v6, v11, 0, 1 +; GFX8-NEXT: v_bfe_i32 v4, v10, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v10, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 10, s12 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 11, s12 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX8-NEXT: v_mov_b32_e32 v11, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 0x50 -; 
GFX8-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_bfe_i32 v7, v6, 0, 1 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s8 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: v_bfe_i32 v6, v13, 0, 1 +; GFX8-NEXT: v_bfe_i32 v4, v12, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v12, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s12 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 9, s12 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX8-NEXT: v_mov_b32_e32 v13, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 64 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[5:8] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 +; GFX8-NEXT: v_bfe_i32 v5, v14, 0, 1 ; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s16 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s8 +; GFX8-NEXT: v_mov_b32_e32 v13, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s12 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s12 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: v_mov_b32_e32 v14, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[3:6] +; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[3:6] ; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 ; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_mov_b32_e32 v13, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s12 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 5, s12 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: v_mov_b32_e32 v14, s17 ; GFX8-NEXT: s_add_u32 s16, s0, 32 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s8 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[1:4] -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s16 -; GFX8-NEXT: v_bfe_i32 v2, v13, 0, 1 +; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[1:4] ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v17, s17 -; GFX8-NEXT: s_add_u32 s16, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s8 +; GFX8-NEXT: v_bfe_i32 v2, v15, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v14, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 2, s12 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s12 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_mov_b32_e32 v15, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 16 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v18, s17 -; GFX8-NEXT: v_bfe_i32 v2, v11, 0, 1 -; GFX8-NEXT: v_bfe_i32 v0, v14, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8 +; GFX8-NEXT: v_bfe_i32 v2, v8, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v16, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v16, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 1, s12 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v17, s16 -; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v18, s1 -; 
GFX8-NEXT: v_bfe_i32 v2, v12, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s9 +; GFX8-NEXT: v_mov_b32_e32 v17, s17 +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_bfe_i32 v2, v9, 0, 1 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 14, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 15, s13 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NEXT: v_mov_b32_e32 v17, s0 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_add_u32 s14, s0, 0x170 -; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_bfe_i32 v2, v10, 0, 1 -; GFX8-NEXT: v_bfe_i32 v0, v9, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, s14 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s9 +; GFX8-NEXT: v_bfe_i32 v2, v11, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v10, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v10, s14 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v10, s15 -; GFX8-NEXT: s_add_u32 s8, s0, 0x160 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 7, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 5, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s9 -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 1, s9 -; GFX8-NEXT: v_bfe_i32 v2, v8, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v11, s15 +; GFX8-NEXT: s_add_u32 s12, s0, 0x160 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 13, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 9, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 2, s13 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 3, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 1, s13 +; GFX8-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NEXT: v_bfe_i32 v2, v12, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v7, 0, 1 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: v_mov_b32_e32 v18, s13 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_add_u32 s8, s0, 0x150 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, s12 +; GFX8-NEXT: s_add_u32 s12, s0, 0x150 +; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[0:3] +; GFX8-NEXT: s_addc_u32 s13, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v6, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v5, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s12 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, s9 -; GFX8-NEXT: s_add_u32 s8, s0, 0x140 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 +; GFX8-NEXT: s_add_u32 s12, s0, 0x140 ; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3] -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_bfe_i32 v2, 
v15, 0, 1 +; GFX8-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NEXT: v_bfe_i32 v2, v13, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v4, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: s_add_u32 s8, s0, 0x130 +; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: s_add_u32 s12, s0, 0x130 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_bfe_i32 v2, v16, 0, 1 -; GFX8-NEXT: v_bfe_i32 v0, v13, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NEXT: v_bfe_i32 v2, v15, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v14, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 6, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 4, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s10 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: s_add_u32 s8, s0, 0x120 +; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0x120 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_bfe_i32 v2, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v0, v11, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_bfe_i32 v2, v9, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v8, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: s_add_u32 s8, s0, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: s_add_u32 s10, s0, 0x110 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_bfe_i32 v2, v9, 0, 1 -; GFX8-NEXT: v_bfe_i32 v0, v14, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_bfe_i32 v2, v10, 0, 1 +; GFX8-NEXT: v_bfe_i32 v0, v16, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: s_add_u32 s8, s0, 0x100 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 0x100 +; GFX8-NEXT: v_bfe_i32 v2, v11, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_bfe_i32 v2, v10, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 6, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s10 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: s_add_u32 s8, s0, 0x1f0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_bfe_i32 v2, v8, 0, 1 +; GFX8-NEXT: v_bfe_i32 v2, v12, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v7, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 4, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s6 -; GFX8-NEXT: 
v_lshrrev_b16_e64 v15, 4, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 7, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s6 ; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 ; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s6 ; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s6 @@ -7531,9 +7532,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v16, v17, 0, 1 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s10 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -7556,12 +7554,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX8-NEXT: v_bfe_i32 v14, v12, 0, 1 +; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1 ; GFX8-NEXT: v_bfe_i32 v12, v13, 0, 1 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_bfe_i32 v8, v15, 0, 1 ; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -7570,6 +7567,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1 +; GFX8-NEXT: v_bfe_i32 v8, v8, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xd0 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index bee3d455187ca7..85782a98cfda0c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1685,109 +1685,109 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s6, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-SI-NEXT: 
s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s13, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s10, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s11, s0, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s13, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s15, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s16, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s17, s6, 16 +; GCN-HSA-NEXT: s_and_b32 s18, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s19, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 
s7, s7, 0xffff -; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_and_b32 s0, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s1, s6, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s8, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s8, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s8, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32: @@ -1909,109 +1909,109 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s4, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s1, 16 +; 
GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s0, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s1, s1 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s14, s3, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s2, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s16, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s17, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s14, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s6, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s16, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s17, s8, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s10, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s12, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s13, s4, 16 -; GCN-HSA-NEXT: s_ashr_i32 s14, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s15, s6, 16 -; GCN-HSA-NEXT: s_ashr_i32 s16, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s17, s8, 16 -; GCN-HSA-NEXT: 
s_ashr_i32 s2, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s3, s10, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 -; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_ashr_i32 s10, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s11, s0, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s13, s0 +; GCN-HSA-NEXT: s_ashr_i32 s14, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s15, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s16, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s17, s4, 16 +; GCN-HSA-NEXT: s_ashr_i32 s0, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s1, s6, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s8, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s8, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s8, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GCN-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i32: @@ -2969,57 +2969,25 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI: ; %bb.0: ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 
s[36:39], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s51, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s53, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s39, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s40, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s43, s3, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s44, s2, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s55, s17, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s16, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s57, s19, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s59, s21, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s20, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s61, s23, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s22, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s63, s25, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s24, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s65, s27, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s26, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s67, s29, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s68, s28, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s69, s31, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s70, s30, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s17, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s19, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s39, s21, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s20, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s23, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s22, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s43, s25, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s24, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s27, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s26, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s29, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s28, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s31, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s30, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff @@ -3027,109 +2995,141 @@ define amdgpu_kernel void 
@constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s25, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s27, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s51, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s53, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s55, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s57, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s59, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s61, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s63, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s65, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s67, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s68, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s69, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s70, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s11 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v11, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s69 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s67 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 
v0, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -3145,28 +3145,28 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s22, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s23, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s24, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s39, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s43, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s45, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s47, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff -; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff -; GCN-HSA-NEXT: s_and_b32 s31, s2, 0xffff -; GCN-HSA-NEXT: s_and_b32 s34, s5, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s38, s7, 0xffff -; GCN-HSA-NEXT: s_and_b32 s40, s6, 0xffff -; GCN-HSA-NEXT: 
s_and_b32 s42, s9, 0xffff -; GCN-HSA-NEXT: s_and_b32 s44, s8, 0xffff -; GCN-HSA-NEXT: s_and_b32 s46, s11, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s25, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s35, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-HSA-NEXT: s_and_b32 s37, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s38, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s39, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s40, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s41, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s42, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s43, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s44, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s45, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s46, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s47, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s48, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s49, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s50, s12, 0xffff @@ -3284,67 +3284,67 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; 
GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3354,57 +3354,41 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s0, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s17, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s16, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s19, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s18, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s21, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s20, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s23, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s22, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s25, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s24, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s27, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s26, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s29, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s28, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s31, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s30, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff -; 
GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s11, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s10, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s18, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s21, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s20, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s23, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s22, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s25, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s24, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s27, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s26, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s29, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s28, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s31, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s30, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s0, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s2, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s15, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s14, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff @@ -3419,151 +3403,163 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s31, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s30, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s14, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, 
s36, 0xf0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xe0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xd0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xc0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xb0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xa0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x90 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x80 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x70 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x60 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x50 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: 
s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s36, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s36, 32 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s36, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3585,95 +3581,95 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T58.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T52.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T53.XYZW, T55.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T39.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, 
T37.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T48.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T40.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T38.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T46.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T38.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T39.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T43.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T36.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T38.XYZW, T37.X, 0, #1 -; EG-NEXT: VTX_READ_128 T39.XYZW, T37.X, 48, #1 -; EG-NEXT: VTX_READ_128 T40.XYZW, T37.X, 32, #1 -; EG-NEXT: VTX_READ_128 T41.XYZW, T37.X, 16, #1 +; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1 +; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 48, #1 +; EG-NEXT: VTX_READ_128 T38.XYZW, T35.X, 32, #1 +; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 16, #1 ; EG-NEXT: Fetch clause starting at 30: -; EG-NEXT: VTX_READ_128 T49.XYZW, T37.X, 112, #1 -; EG-NEXT: VTX_READ_128 T50.XYZW, T37.X, 96, #1 -; EG-NEXT: VTX_READ_128 T51.XYZW, T37.X, 80, #1 -; EG-NEXT: VTX_READ_128 T52.XYZW, T37.X, 64, #1 +; EG-NEXT: VTX_READ_128 T49.XYZW, T35.X, 112, #1 +; EG-NEXT: VTX_READ_128 T50.XYZW, T35.X, 96, #1 +; EG-NEXT: VTX_READ_128 T51.XYZW, T35.X, 80, #1 +; EG-NEXT: VTX_READ_128 T52.XYZW, T35.X, 64, #1 ; EG-NEXT: ALU clause starting at 38: -; EG-NEXT: MOV * T37.X, KC0[2].Z, +; EG-NEXT: MOV * T35.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 39: -; EG-NEXT: LSHR * T35.W, T38.Y, literal.x, +; EG-NEXT: LSHR * T40.W, T36.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T35.Z, T38.Y, literal.x, +; EG-NEXT: AND_INT * T40.Z, T36.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR T35.Y, T38.X, literal.x, -; EG-NEXT: LSHR * T36.W, T38.W, literal.x, +; EG-NEXT: LSHR T40.Y, T36.X, literal.x, +; EG-NEXT: LSHR * T41.W, T36.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T35.X, T38.X, literal.x, -; EG-NEXT: AND_INT T36.Z, T38.W, literal.x, -; EG-NEXT: LSHR * T38.X, KC0[2].Y, literal.y, +; EG-NEXT: AND_INT T40.X, T36.X, literal.x, +; EG-NEXT: AND_INT T41.Z, T36.W, literal.x, +; EG-NEXT: LSHR * T36.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) -; EG-NEXT: LSHR T36.Y, T38.Z, literal.x, -; EG-NEXT: LSHR * T42.W, T41.Y, literal.x, +; EG-NEXT: LSHR T41.Y, T36.Z, literal.x, +; EG-NEXT: LSHR * T42.W, T39.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T36.X, T38.Z, literal.x, -; EG-NEXT: AND_INT T42.Z, T41.Y, literal.x, +; EG-NEXT: AND_INT T41.X, T36.Z, literal.x, +; EG-NEXT: AND_INT T42.Z, T39.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LSHR T43.X, PV.W, literal.x, -; EG-NEXT: LSHR T42.Y, T41.X, literal.y, -; EG-NEXT: LSHR T44.W, T41.W, literal.y, -; EG-NEXT: AND_INT * T42.X, T41.X, literal.z, +; EG-NEXT: LSHR T42.Y, T39.X, literal.y, +; EG-NEXT: LSHR T44.W, T39.W, literal.y, +; EG-NEXT: AND_INT * T42.X, T39.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T44.Z, T41.W, literal.x, +; EG-NEXT: AND_INT T44.Z, T39.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) -; EG-NEXT: LSHR T41.X, PV.W, 
literal.x, -; EG-NEXT: LSHR T44.Y, T41.Z, literal.y, -; EG-NEXT: LSHR T45.W, T40.Y, literal.y, -; EG-NEXT: AND_INT * T44.X, T41.Z, literal.z, +; EG-NEXT: LSHR T39.X, PV.W, literal.x, +; EG-NEXT: LSHR T44.Y, T39.Z, literal.y, +; EG-NEXT: LSHR T45.W, T38.Y, literal.y, +; EG-NEXT: AND_INT * T44.X, T39.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T45.Z, T40.Y, literal.x, +; EG-NEXT: AND_INT T45.Z, T38.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) ; EG-NEXT: LSHR T46.X, PV.W, literal.x, -; EG-NEXT: LSHR T45.Y, T40.X, literal.y, -; EG-NEXT: LSHR T47.W, T40.W, literal.y, -; EG-NEXT: AND_INT * T45.X, T40.X, literal.z, +; EG-NEXT: LSHR T45.Y, T38.X, literal.y, +; EG-NEXT: LSHR T47.W, T38.W, literal.y, +; EG-NEXT: AND_INT * T45.X, T38.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T47.Z, T40.W, literal.x, +; EG-NEXT: AND_INT T47.Z, T38.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) -; EG-NEXT: LSHR T40.X, PV.W, literal.x, -; EG-NEXT: LSHR T47.Y, T40.Z, literal.y, -; EG-NEXT: AND_INT * T47.X, T40.Z, literal.z, +; EG-NEXT: LSHR T38.X, PV.W, literal.x, +; EG-NEXT: LSHR T47.Y, T38.Z, literal.y, +; EG-NEXT: AND_INT * T47.X, T38.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: LSHR * T37.W, T39.Y, literal.y, +; EG-NEXT: LSHR * T35.W, T37.Y, literal.y, ; EG-NEXT: 80(1.121039e-43), 16(2.242078e-44) ; EG-NEXT: LSHR T48.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T37.Z, T39.Y, literal.y, +; EG-NEXT: AND_INT * T35.Z, T37.Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: ALU clause starting at 95: -; EG-NEXT: LSHR T37.Y, T39.X, literal.x, -; EG-NEXT: LSHR * T53.W, T39.W, literal.x, +; EG-NEXT: LSHR T35.Y, T37.X, literal.x, +; EG-NEXT: LSHR * T53.W, T37.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T37.X, T39.X, literal.x, -; EG-NEXT: AND_INT T53.Z, T39.W, literal.x, +; EG-NEXT: AND_INT T35.X, T37.X, literal.x, +; EG-NEXT: AND_INT T53.Z, T37.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) -; EG-NEXT: LSHR T39.X, PV.W, literal.x, -; EG-NEXT: LSHR T53.Y, T39.Z, literal.y, +; EG-NEXT: LSHR T37.X, PV.W, literal.x, +; EG-NEXT: LSHR T53.Y, T37.Z, literal.y, ; EG-NEXT: LSHR T54.W, T52.Y, literal.y, -; EG-NEXT: AND_INT * T53.X, T39.Z, literal.z, +; EG-NEXT: AND_INT * T53.X, T37.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: AND_INT T54.Z, T52.Y, literal.x, @@ -5640,61 +5636,61 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s4, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: @@ -6029,212 +6025,208 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s16, s15 -; GCN-HSA-NEXT: s_mov_b32 s18, s13 -; GCN-HSA-NEXT: s_mov_b32 s20, s11 -; GCN-HSA-NEXT: s_mov_b32 s22, s9 -; GCN-HSA-NEXT: s_lshr_b32 s24, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s2, s11 +; GCN-HSA-NEXT: s_mov_b32 s12, s9 +; GCN-HSA-NEXT: s_mov_b32 s14, s7 +; GCN-HSA-NEXT: s_mov_b32 s16, s5 +; GCN-HSA-NEXT: s_lshr_b32 s18, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s4, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[4:5], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; 
GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v3, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s1, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[18:19], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s11 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s9 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: 
s_lshr_b32 s8, s9, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s0, 0x50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s0, 64 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s0, 0x70 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s0, 0x60 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s8, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s8, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s9, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v5, s7 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s8, 16 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s9, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -6446,146 +6438,146 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s19, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s21, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s23, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s25, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16 -; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-HSA-NEXT: s_and_b32 s35, s2, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-HSA-NEXT: s_and_b32 s35, s0, 
0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-HSA-NEXT: s_and_b32 s2, s15, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GCN-HSA-NEXT: s_and_b32 s0, s15, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; 
GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -6598,147 +6590,146 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s0, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s15, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s2, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s0, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s4, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s1, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff -; 
GCN-NOHSA-VI-NEXT: s_and_b32 s36, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s15, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xf0 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xd0 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s15, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0xf0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xb0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x90 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xe0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0xe0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xc0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0xa0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xa0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s17 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -6920,142 +6911,142 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s6, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s4, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s0, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 
0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[60:61], s[0:1], 48 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[0:1], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[2:3], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[8:9], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[12:13], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[14:15], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[12:13], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[14:15], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s67 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[52:53], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 
s[8:9], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[48:49], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[36:37], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s74 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s75 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s72 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s73 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s51 -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s67 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s65 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -7065,153 +7056,151 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s42, s15 -; GCN-HSA-NEXT: s_mov_b32 s48, s13 -; 
GCN-HSA-NEXT: s_mov_b32 s50, s11 -; GCN-HSA-NEXT: s_mov_b32 s52, s9 -; GCN-HSA-NEXT: s_mov_b32 s54, s7 -; GCN-HSA-NEXT: s_mov_b32 s56, s5 +; GCN-HSA-NEXT: s_mov_b32 s34, s15 +; GCN-HSA-NEXT: s_mov_b32 s36, s13 +; GCN-HSA-NEXT: s_mov_b32 s38, s11 +; GCN-HSA-NEXT: s_mov_b32 s40, s9 +; GCN-HSA-NEXT: s_mov_b32 s42, s7 +; GCN-HSA-NEXT: s_mov_b32 s44, s5 ; GCN-HSA-NEXT: s_mov_b32 s46, s3 -; GCN-HSA-NEXT: s_mov_b32 s58, s1 -; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s70, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s72, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s74, s0, 16 +; GCN-HSA-NEXT: s_mov_b32 s48, s1 +; GCN-HSA-NEXT: s_lshr_b32 s50, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s52, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s54, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s58, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[38:39], s[2:3], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[42:43], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[34:35], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[44:45], s[6:7], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[74:75], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[72:73], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[70:71], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 
s[14:15], s[54:55], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s58, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s59, s17, 0 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s52, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s53, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s12 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 -; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xd0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s49 -; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: 
s_add_u32 s10, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s45 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 -; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s44 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 -; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s72 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s73 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s70 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s71 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s69 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xc0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: s_add_u32 s10, 
s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -7269,22 +7258,22 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s15 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s13 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s11 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s52, s9 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s56, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s3 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s72, s0, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s15 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s13 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s11 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s9 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s3 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s14, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s2, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s0, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 @@ -7293,123 +7282,119 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[38:39], s[2:3], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[42:43], s[4:5], 48 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[82:83], s[14:15], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[72:73], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 -; 
GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[64:65], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[62:63], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[56:57], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s82 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s83 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0xd0 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xf0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s81 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0xb0 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xd0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s78 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s79 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0x90 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; 
GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0x90 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s76 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s77 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0x70 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s74 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s75 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0x50 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 -; GCN-NOHSA-VI-NEXT: s_add_u32 s42, s16, 0x50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s43, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s43 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s73 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 48 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 16 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-VI-NEXT: s_add_u32 s36, s16, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s37, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xe0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xe0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xc0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 87913841012184..1e6665a75ad4e7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -746,29 +746,29 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8 -; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 +; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16 +; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s9 +; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; 
GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 @@ -779,29 +779,29 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v11i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20 -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 16 +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 @@ -1975,37 +1975,39 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 +; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX7-HSA-NEXT: s_add_u32 s10, s8, 48 +; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s8, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s8, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -2315,41 +2317,41 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-HSA-NEXT: s_ashr_i32 s16, s11, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s17, s10, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s14, s9, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s15, s8, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-HSA-NEXT: s_ashr_i32 s16, s7, 31 +; GFX9-HSA-NEXT: s_ashr_i32 s17, s6, 31 +; GFX9-HSA-NEXT: s_ashr_i32 s14, s5, 31 +; GFX9-HSA-NEXT: s_ashr_i32 s15, s4, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-HSA-NEXT: s_ashr_i32 s12, s7, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s13, s6, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX9-HSA-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-HSA-NEXT: s_ashr_i32 s12, s3, 31 +; GFX9-HSA-NEXT: s_ashr_i32 s13, s2, 31 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:48 +; GFX9-HSA-NEXT: s_ashr_i32 s10, s1, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-HSA-NEXT: s_ashr_i32 s3, s4, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX9-HSA-NEXT: s_ashr_i32 s11, s0, 31 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:32 ; GFX9-HSA-NEXT: s_nop 0 -; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s12 -; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 ; GFX9-HSA-NEXT: s_nop 0 -; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX9-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-HSA-NEXT: s_endpgm %ld = load <8 x i32>, ptr addrspace(4) %in %ext = sext <8 x i32> %ld to <8 x i64> @@ -3783,7 +3785,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX9-HSA: ; %bb.0: ; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 -; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -3792,116 +3794,116 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: s_ashr_i32 s66, s30, 31 ; GFX9-HSA-NEXT: s_ashr_i32 s63, s29, 31 ; GFX9-HSA-NEXT: s_ashr_i32 s64, s28, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s66 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s65 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s66 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s65 ; GFX9-HSA-NEXT: s_ashr_i32 s61, s27, 31 ; GFX9-HSA-NEXT: s_ashr_i32 s62, s26, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:240 ; GFX9-HSA-NEXT: s_ashr_i32 s59, s25, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s28 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s64 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s63 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s64 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s63 ; GFX9-HSA-NEXT: s_ashr_i32 s60, s24, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:224 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:224 ; GFX9-HSA-NEXT: s_ashr_i32 s57, s23, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s62 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s61 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s62 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s61 ; GFX9-HSA-NEXT: s_ashr_i32 s58, s22, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:208 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:208 ; GFX9-HSA-NEXT: s_ashr_i32 s55, s21, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s60 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s59 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s60 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s59 ; GFX9-HSA-NEXT: s_ashr_i32 s56, s20, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:192 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:192 ; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s57 +; GFX9-HSA-NEXT: 
v_mov_b32_e32 v0, s22 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s58 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s57 ; GFX9-HSA-NEXT: s_ashr_i32 s54, s18, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:176 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:176 ; GFX9-HSA-NEXT: s_ashr_i32 s51, s17, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s20 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s56 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s55 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-HSA-NEXT: s_ashr_i32 s52, s16, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:160 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:160 ; GFX9-HSA-NEXT: s_ashr_i32 s49, s15, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s54 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s53 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s54 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-HSA-NEXT: s_ashr_i32 s50, s14, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:144 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:144 ; GFX9-HSA-NEXT: s_ashr_i32 s47, s13, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s52 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s51 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s52 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX9-HSA-NEXT: s_ashr_i32 s48, s12, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:128 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:128 ; GFX9-HSA-NEXT: s_ashr_i32 s45, s11, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s50 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s49 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s49 ; GFX9-HSA-NEXT: s_ashr_i32 s46, s10, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:112 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 ; GFX9-HSA-NEXT: s_ashr_i32 s43, s9, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s47 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s48 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-HSA-NEXT: s_ashr_i32 s44, s8, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:96 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 ; GFX9-HSA-NEXT: s_ashr_i32 s41, s7, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s45 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s46 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-HSA-NEXT: s_ashr_i32 s42, s6, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:80 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, 
v[0:3], s[36:37] offset:80 ; GFX9-HSA-NEXT: s_ashr_i32 s39, s5, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s43 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s44 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX9-HSA-NEXT: s_ashr_i32 s40, s4, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:64 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 ; GFX9-HSA-NEXT: s_ashr_i32 s35, s3, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s41 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX9-HSA-NEXT: s_ashr_i32 s38, s2, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:48 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 ; GFX9-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s39 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s40 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GFX9-HSA-NEXT: s_ashr_i32 s34, s0, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:32 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 ; GFX9-HSA-NEXT: s_nop 0 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s35 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:16 +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s38 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 ; GFX9-HSA-NEXT: s_nop 0 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s33 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] +; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] ; GFX9-HSA-NEXT: s_endpgm %ld = load <32 x i32>, ptr addrspace(4) %in %ext = sext <32 x i32> %ld to <32 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 66fc322e5e04b5..019bfa70290ff4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -2561,111 +2561,111 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0 -; GFX8-NOHSA-NEXT: 
s_ashr_i32 s10, s0, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s11, s0, 0x80010 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s0 -; GFX8-NOHSA-NEXT: s_ashr_i32 s13, s1, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s14, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s1 -; GFX8-NOHSA-NEXT: s_ashr_i32 s16, s2, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s17, s2, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s19, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s20, s4, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s21, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s22, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s23, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s24, s6, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s25, s6, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s7, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s7, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s1 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s7 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 0x60 +; GFX8-NOHSA-NEXT: s_ashr_i32 s12, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s14, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s15, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s16, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s17, s6, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s19, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s20, s8, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s21, s8, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s22, s9, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s23, s9, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s24, s10, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s25, s10, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s11, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s11, 0x80010 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s11 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: v_bfe_i32 v11, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s11 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s10 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NOHSA-NEXT: v_bfe_i32 v11, v2, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s24 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s5 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 64 +; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s9 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_bfe_i32 v11, v4, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s22 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s3 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_bfe_i32 v9, v6, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s20 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v7, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s18 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s16 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s5 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s4 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], 
v[2:5] ; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -2808,33 +2808,33 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s2, 24 ; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s2, 0x80008 ; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s26, s3, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s4, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s28, s4, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s30, s5, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s31, s6, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s33, s6, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s35, s7, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s8, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s37, s8, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s9, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s39, s9, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s10, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s41, s10, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s11, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s43, s11, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s12, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s45, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s13, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s47, s13, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s14, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s49, s14, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s15, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s51, s15, 0x80008 -; GFX6-NOHSA-NEXT: s_and_b32 s52, s0, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s0, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s3, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s4, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s5, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s31, s6, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s33, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s34, s7, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s35, s8, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s36, s8, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s37, s9, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s38, s9, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s39, s10, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s40, s10, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s41, s11, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s42, s11, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s43, s12, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s44, s12, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s45, s13, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s46, s13, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s47, s14, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s48, s14, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s49, s15, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s50, s15, 0x80008 +; GFX6-NOHSA-NEXT: s_and_b32 s51, s0, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s52, s0, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s53, s1, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s54, s1, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s55, s2, 0xff @@ -2870,76 +2870,76 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s51 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s50 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s50 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s49 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s46 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s44 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s43 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s38 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 
v0, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 @@ -2956,9 +2956,9 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s51 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s52 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm @@ -2975,48 +2975,48 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s21, s1, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s23, s2, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s25, s3, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s26, s3, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s29, s4, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s31, s5, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s33, s5, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s35, s6, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s36, s6, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s38, s7, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s39, s7, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s41, s8, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s42, s8, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s43, s9, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s44, s9, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s45, s10, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s46, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s47, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s48, s11, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s49, s12, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s50, s12, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s51, s13, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s52, s13, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s53, s14, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s54, s14, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s55, s15, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s56, s15, 0x80008 -; GFX7-HSA-NEXT: s_and_b32 s24, s0, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s27, s1, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s30, s2, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s24, s3, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s25, s3, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s27, s4, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s5, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s29, s5, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s30, s6, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s31, s6, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s33, s7, 24 +; GFX7-HSA-NEXT: 
s_bfe_u32 s34, s7, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s35, s8, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s36, s8, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s37, s9, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s38, s9, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s39, s10, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s40, s10, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s41, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s42, s11, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s43, s12, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s44, s12, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s45, s13, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s46, s13, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s47, s14, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s48, s14, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s49, s15, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s50, s15, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s51, s0, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s52, s0, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s53, s1, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s54, s1, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s55, s2, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s34, s3, 0xff +; GFX7-HSA-NEXT: s_and_b32 s56, s3, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s37, s4, 0xff +; GFX7-HSA-NEXT: s_and_b32 s57, s4, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s40, s5, 0xff +; GFX7-HSA-NEXT: s_and_b32 s58, s5, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s57, s6, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s58, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s59, s7, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s60, s7, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s59, s6, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s60, s7, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s61, s8, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s62, s9, 0xff @@ -3033,147 +3033,145 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s68, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v26, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s53 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s49 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s45 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s43 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s42 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s41 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s35 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s40 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s38 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; 
GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s16, 32 -; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s55 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 -; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s53 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s51 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3187,196 +3185,191 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s0, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s1, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s21, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s23, s3, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s25, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s27, s5, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s29, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s33, s7, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s8, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s35, s9, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s10, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s37, s11, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s12, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s39, s13, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s14, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s15, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s20, s0, 0xff +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s21, s3, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s23, s5, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s25, s7, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s8, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s27, s9, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s10, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s29, s11, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s12, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s31, s13, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s33, s14, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s15, 24 +; GFX8-NOHSA-NEXT: s_and_b32 s35, s0, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0 -; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s22, s1, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s36, s0, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s37, s1, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1 -; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s24, s2, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s38, s1, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s39, s2, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s2 ; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s26, s3, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s40, s3, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s3 ; GFX8-NOHSA-NEXT: 
s_bfe_u32 s3, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s28, s4, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s41, s4, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s4 ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s41, s5, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s42, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s43, s6, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s44, s6, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s45, s7, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s46, s7, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s47, s8, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s48, s8, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s49, s9, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s50, s9, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s51, s10, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s52, s10, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s53, s11, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s54, s11, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s55, s12, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s56, s12, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s57, s13, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s58, s13, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s59, s14, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s60, s14, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s31, s15, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s42, s5, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s43, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s44, s6, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s45, s6, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s46, s7, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s47, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s48, s8, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s49, s8, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s50, s9, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s51, s9, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s52, s10, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s53, s10, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s54, s11, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s55, s11, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s56, s12, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s57, s12, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s58, s13, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s59, s13, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s60, s14, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s61, s14, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s15, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s15, 0x80010 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s1 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xe0 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s15 -; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s30 -; GFX8-NOHSA-NEXT: s_add_u32 s30, s16, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s31 -; GFX8-NOHSA-NEXT: s_addc_u32 s31, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s30 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s34 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xd0 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s14 -; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s59 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s40 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s15 -; 
GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xd0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s61 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s33 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xc0 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s57 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s58 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s39 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s58 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s59 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xb0 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xb0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s55 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s56 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s38 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s56 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s30 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xa0 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s53 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s54 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s37 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s54 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s55 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s29 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x90 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s10 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s16, 0xa0 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s10 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s16, 0x90 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s51 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s52 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s36 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s52 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s53 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s28 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x80 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s50 -; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v13, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s27 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x70 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0x80 -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s47 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s34 -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s26 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x60 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s45 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s33 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s25 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x50 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s43 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s24 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 64 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s41 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s42 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s23 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s41 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s5 -; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s25 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s22 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s40 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s21 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] -; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 32 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s21 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s20 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s37 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s35 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s36 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3398,96 +3391,96 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T33.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T35.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T35.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T22.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T20.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T21.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T22.XYZW, T21.X, 16, #1 -; EG-NEXT: VTX_READ_128 T23.XYZW, T21.X, 0, #1 +; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1 +; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 0, #1 ; EG-NEXT: Fetch clause starting at 26: -; EG-NEXT: VTX_READ_128 T32.XYZW, T21.X, 48, #1 -; EG-NEXT: VTX_READ_128 T33.XYZW, T21.X, 32, #1 +; EG-NEXT: VTX_READ_128 
T32.XYZW, T19.X, 48, #1 +; EG-NEXT: VTX_READ_128 T33.XYZW, T19.X, 32, #1 ; EG-NEXT: ALU clause starting at 30: -; EG-NEXT: MOV * T21.X, KC0[2].Z, +; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 31: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T19.Z, T23.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT * T22.Z, T21.X, literal.x, PV.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T19.Y, T23.X, literal.x, T0.W, -; EG-NEXT: BFE_UINT T20.Z, T23.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T19.W, T23.X, literal.z, +; EG-NEXT: BFE_UINT T22.Y, T21.X, literal.x, T0.W, +; EG-NEXT: BFE_UINT T23.Z, T21.Y, literal.y, T0.W, +; EG-NEXT: LSHR * T22.W, T21.X, literal.z, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T19.X, T23.X, literal.x, -; EG-NEXT: BFE_UINT T20.Y, T23.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T23.X, KC0[2].Y, literal.z, +; EG-NEXT: AND_INT T22.X, T21.X, literal.x, +; EG-NEXT: BFE_UINT T23.Y, T21.Y, literal.y, T0.W, +; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.z, ; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T24.Z, T23.Z, literal.x, T0.W, -; EG-NEXT: LSHR * T20.W, T23.Y, literal.y, +; EG-NEXT: BFE_UINT T24.Z, T21.Z, literal.x, T0.W, +; EG-NEXT: LSHR * T23.W, T21.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T20.X, T23.Y, literal.x, -; EG-NEXT: BFE_UINT T24.Y, T23.Z, literal.y, T0.W, +; EG-NEXT: AND_INT T23.X, T21.Y, literal.x, +; EG-NEXT: BFE_UINT T24.Y, T21.Z, literal.y, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T25.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T26.Z, T23.W, literal.y, T0.W, -; EG-NEXT: LSHR T24.W, T23.Z, literal.z, -; EG-NEXT: AND_INT * T24.X, T23.Z, literal.w, +; EG-NEXT: BFE_UINT T26.Z, T21.W, literal.y, T0.W, +; EG-NEXT: LSHR T24.W, T21.Z, literal.z, +; EG-NEXT: AND_INT * T24.X, T21.Z, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T26.Y, T23.W, literal.x, T0.W, +; EG-NEXT: BFE_UINT T26.Y, T21.W, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) ; EG-NEXT: LSHR T27.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T28.Z, T22.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T26.W, T23.W, literal.z, -; EG-NEXT: AND_INT * T26.X, T23.W, literal.w, +; EG-NEXT: BFE_UINT T28.Z, T20.X, literal.y, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHR T26.W, T21.W, literal.z, +; EG-NEXT: AND_INT * T26.X, T21.W, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T28.Y, T22.X, literal.x, T0.W, +; EG-NEXT: BFE_UINT T28.Y, T20.X, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) ; EG-NEXT: LSHR T29.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T30.Z, T22.Y, literal.y, T0.W, -; EG-NEXT: LSHR T28.W, T22.X, literal.z, -; EG-NEXT: AND_INT * T28.X, T22.X, literal.w, +; EG-NEXT: BFE_UINT T30.Z, T20.Y, literal.y, T0.W, +; EG-NEXT: LSHR T28.W, T20.X, literal.z, +; EG-NEXT: AND_INT * T28.X, T20.X, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T30.Y, T22.Y, literal.x, T0.W, +; EG-NEXT: BFE_UINT T30.Y, T20.Y, 
literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44) -; EG-NEXT: LSHR T22.X, PV.W, literal.x, -; EG-NEXT: LSHR T30.W, T22.Y, literal.y, -; EG-NEXT: AND_INT * T30.X, T22.Y, literal.z, +; EG-NEXT: LSHR T20.X, PV.W, literal.x, +; EG-NEXT: LSHR T30.W, T20.Y, literal.y, +; EG-NEXT: AND_INT * T30.X, T20.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T21.Z, T22.Z, literal.x, T0.W, +; EG-NEXT: BFE_UINT T19.Z, T20.Z, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) ; EG-NEXT: LSHR T31.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT * T21.Y, T22.Z, literal.y, T0.W, +; EG-NEXT: BFE_UINT * T19.Y, T20.Z, literal.y, T0.W, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: ALU clause starting at 91: -; EG-NEXT: BFE_UINT T34.Z, T22.W, literal.x, T0.W, -; EG-NEXT: LSHR * T21.W, T22.Z, literal.y, +; EG-NEXT: BFE_UINT T34.Z, T20.W, literal.x, T0.W, +; EG-NEXT: LSHR * T19.W, T20.Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T21.X, T22.Z, literal.x, -; EG-NEXT: BFE_UINT T34.Y, T22.W, literal.y, T0.W, +; EG-NEXT: AND_INT T19.X, T20.Z, literal.x, +; EG-NEXT: BFE_UINT T34.Y, T20.W, literal.y, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T35.X, PV.W, literal.x, ; EG-NEXT: BFE_UINT T36.Z, T33.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T34.W, T22.W, literal.z, -; EG-NEXT: AND_INT * T34.X, T22.W, literal.w, +; EG-NEXT: LSHR T34.W, T20.W, literal.z, +; EG-NEXT: AND_INT * T34.X, T20.W, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) ; EG-NEXT: BFE_UINT T36.Y, T33.X, literal.x, T0.W, @@ -3749,33 +3742,33 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s18, s0, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s19, s0, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s20, s0, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s21, s1, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s22, s1, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s23, s1, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s24, s2, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s25, s2, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s26, s2, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s3, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s28, s3, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s29, s3, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s30, s4, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s31, s4, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s33, s4, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s34, s5, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s35, s5, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s36, s5, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s37, s6, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s38, s6, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s39, s6, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s40, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s41, s7, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s43, s8, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s44, s8, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s45, s8, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s47, s9, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s48, s9, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s49, s9, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s22, s1, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s23, s1, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s24, s1, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s26, s2, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s27, s2, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s28, 
s2, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s3, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s30, s3, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s31, s3, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s4, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s34, s4, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s35, s4, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s36, s5, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s37, s5, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s38, s5, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s39, s6, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s40, s6, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s41, s6, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s42, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s43, s7, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s44, s7, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s45, s8, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s46, s8, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s47, s8, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s48, s9, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s49, s9, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s50, s9, 0x80008 ; GFX7-HSA-NEXT: s_ashr_i32 s51, s10, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s52, s10, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s53, s10, 0x80008 @@ -3794,49 +3787,53 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s66, s15, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s67, s15, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s68, s15, 0x80008 -; GFX7-HSA-NEXT: s_sext_i32_i8 s46, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s50, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s9 +; GFX7-HSA-NEXT: s_sext_i32_i8 s21, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s25, s1 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s1 ; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 ; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 ; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 ; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 @@ -3854,99 +3851,95 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s55 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 -; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s45 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s53 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s49 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s42 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s36 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_sext_i32_i8 s3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_sext_i32_i8 s1, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s16, 32 -; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_sext_i32_i8 s0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 @@ -3960,208 +3953,208 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v18, 8, s14 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0 ; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s0, 24 ; GFX8-NOHSA-NEXT: s_bfe_i32 s19, s0, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s20, s1, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s21, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s22, s2, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s23, s2, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s24, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s25, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s26, s4, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s27, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s29, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s30, s6, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s31, s6, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s7, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s34, s7, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s35, s8, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s36, s8, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s37, s9, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s38, s9, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s39, s10, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s40, s10, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s41, s11, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s42, s11, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s43, s12, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s44, s12, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s13, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s46, s13, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s47, s14, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s48, s14, 0x80010 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s49, s14 -; GFX8-NOHSA-NEXT: s_ashr_i32 s14, s15, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s50, s15, 0x80010 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s15 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s20, s0 +; GFX8-NOHSA-NEXT: s_ashr_i32 s21, s1, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s22, s1, 0x80010 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s23, s1 +; GFX8-NOHSA-NEXT: s_ashr_i32 s24, s2, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s25, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s26, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s27, s3, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s29, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s30, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s31, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s34, s6, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s35, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s36, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s37, s8, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s38, s8, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s39, s9, 24 +; 
GFX8-NOHSA-NEXT: s_bfe_i32 s40, s9, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s41, s10, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s42, s10, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s43, s11, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s44, s11, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s12, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s46, s12, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s47, s13, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s48, s13, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s49, s14, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s50, s14, 0x80010 +; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s15, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s15, 0x80010 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s15 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s15 -; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 -; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v5, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s50 -; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v20, 8, s12 -; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v18, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s47 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s14, s12 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s13 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v19, 8, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xd0 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v19, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s45 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xb0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v20, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s43 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v4, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v10, 8, s14 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s14, s14 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v10, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s13 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s13 +; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v11, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s47 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v12, 8, s12 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v12, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s45 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v13, 8, s11 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s8 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v11, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s42 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s41 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s12 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s4 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s8 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s10 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0xa0 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s9 -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s9 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v9, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s40 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s39 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v8, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s37 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s9 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0x80 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v7, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s36 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s35 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v13, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s10 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v14, 8, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x90 +; 
GFX8-NOHSA-NEXT: v_bfe_i32 v9, v14, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s41 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v15, 8, s9 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x80 +; GFX8-NOHSA-NEXT: v_bfe_i32 v9, v15, 0, 8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s7 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[11:14] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 0x70 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 0x60 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v4, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s34 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s33 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s40 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s39 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v16, 8, s8 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NOHSA-NEXT: v_bfe_i32 v9, v16, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s37 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v17, 8, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x70 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x60 +; GFX8-NOHSA-NEXT: v_bfe_i32 v11, v17, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s36 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s35 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 0x50 +; GFX8-NOHSA-NEXT: v_bfe_i32 v11, v2, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s33 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s5 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 64 +; GFX8-NOHSA-NEXT: v_bfe_i32 v11, v0, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s30 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v18, 8, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 48 +; GFX8-NOHSA-NEXT: v_bfe_i32 v9, v18, 0, 8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 0x50 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v2, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s31 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s30 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s28 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v21, 8, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v6, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s28 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v19, 8, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 32 +; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v19, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s27 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s26 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v10, 8, s3 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[11:12], v[6:9] +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v20, 8, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s16, 16 +; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v20, 0, 8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 32 -; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v10, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[9:12] -; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v21, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s22 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s24 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: 
v_lshrrev_b16_e64 v1, 8, s0 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] ; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 @@ -4178,11 +4171,11 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: ALU 72, @151, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T49.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T19.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T35.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T34.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T33.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T32.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T32.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T30.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T29.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T28.X, 0 @@ -4195,13 +4188,13 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 24: -; EG-NEXT: VTX_READ_128 T20.XYZW, T21.X, 32, #1 -; EG-NEXT: VTX_READ_128 T19.XYZW, T21.X, 48, #1 +; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1 +; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 48, #1 ; EG-NEXT: Fetch clause starting at 28: -; EG-NEXT: VTX_READ_128 T31.XYZW, T21.X, 0, #1 -; EG-NEXT: VTX_READ_128 T21.XYZW, T21.X, 16, #1 +; EG-NEXT: VTX_READ_128 T31.XYZW, T19.X, 0, #1 +; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 16, #1 ; EG-NEXT: ALU clause starting at 32: -; EG-NEXT: MOV * T21.X, KC0[2].Z, +; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 33: ; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, @@ -4222,26 +4215,26 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43) ; EG-NEXT: LSHR T28.X, PV.W, literal.x, -; EG-NEXT: LSHR T0.Y, T19.W, literal.y, -; EG-NEXT: LSHR T0.Z, T19.Z, literal.z, -; EG-NEXT: LSHR * T0.W, T19.W, literal.w, +; EG-NEXT: LSHR T0.Y, T21.W, literal.y, +; EG-NEXT: LSHR T0.Z, T21.Z, literal.z, +; EG-NEXT: LSHR * T0.W, T21.W, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T29.X, PV.W, literal.x, -; EG-NEXT: LSHR T1.Y, T19.Z, literal.y, -; EG-NEXT: LSHR T1.Z, T19.Y, literal.z, -; EG-NEXT: LSHR * T1.W, T19.Z, literal.w, +; EG-NEXT: LSHR T1.Y, T21.Z, literal.y, +; EG-NEXT: LSHR T1.Z, T21.Y, literal.z, +; EG-NEXT: LSHR * T1.W, T21.Z, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, ; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T30.X, PV.W, literal.x, -; EG-NEXT: LSHR T2.Y, T19.Y, literal.y, -; EG-NEXT: 
LSHR T2.Z, T19.Y, literal.z, -; EG-NEXT: LSHR T2.W, T19.X, literal.y, -; EG-NEXT: LSHR * T3.W, T19.X, literal.z, +; EG-NEXT: LSHR T2.Y, T21.Y, literal.y, +; EG-NEXT: LSHR T2.Z, T21.Y, literal.z, +; EG-NEXT: LSHR T2.W, T21.X, literal.y, +; EG-NEXT: LSHR * T3.W, T21.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 74: @@ -4261,22 +4254,22 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: LSHR T33.X, PS, literal.x, ; EG-NEXT: LSHR T5.Y, T20.X, literal.y, ; EG-NEXT: LSHR T5.Z, T20.X, literal.z, -; EG-NEXT: LSHR T6.W, T21.W, literal.y, +; EG-NEXT: LSHR T6.W, T19.W, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 176(2.466285e-43) ; EG-NEXT: LSHR T34.X, PS, literal.x, -; EG-NEXT: LSHR T6.Y, T21.W, literal.y, -; EG-NEXT: LSHR T6.Z, T21.Z, literal.z, -; EG-NEXT: LSHR T7.W, T21.Z, literal.y, +; EG-NEXT: LSHR T6.Y, T19.W, literal.y, +; EG-NEXT: LSHR T6.Z, T19.Z, literal.z, +; EG-NEXT: LSHR T7.W, T19.Z, literal.y, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) ; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43) ; EG-NEXT: LSHR T35.X, PS, literal.x, -; EG-NEXT: LSHR T7.Y, T21.Y, literal.y, -; EG-NEXT: LSHR T7.Z, T21.Y, literal.z, -; EG-NEXT: LSHR T8.W, T21.X, literal.y, -; EG-NEXT: LSHR * T9.W, T21.X, literal.z, +; EG-NEXT: LSHR T7.Y, T19.Y, literal.y, +; EG-NEXT: LSHR T7.Z, T19.Y, literal.z, +; EG-NEXT: LSHR T8.W, T19.X, literal.y, +; EG-NEXT: LSHR * T9.W, T19.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T36.X, T31.X, 0.0, literal.x, @@ -4305,81 +4298,81 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: BFE_INT T38.W, T9.Y, 0.0, literal.x, ; EG-NEXT: LSHR * T11.W, T31.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T40.X, T21.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T40.X, T19.X, 0.0, literal.x, ; EG-NEXT: BFE_INT T37.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T38.Z, T10.W, 0.0, literal.x, ; EG-NEXT: BFE_INT T39.W, T8.Z, 0.0, literal.x, ; EG-NEXT: LSHR * T10.W, T31.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T31.X, T21.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T31.X, T19.Y, 0.0, literal.x, ; EG-NEXT: BFE_INT T38.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T39.Z, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: BFE_INT T40.W, T9.W, 0.0, literal.x, ; EG-NEXT: LSHR * T9.W, T31.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T41.X, T21.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T41.X, T19.Z, 0.0, literal.x, ; EG-NEXT: BFE_INT T39.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T40.Z, T8.W, 0.0, literal.x, ; EG-NEXT: BFE_INT * T31.W, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 151: -; EG-NEXT: LSHR * T8.W, T21.X, literal.x, +; EG-NEXT: LSHR * T8.W, T19.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T42.X, T21.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T42.X, T19.W, 0.0, literal.x, ; EG-NEXT: BFE_INT T40.Y, PV.W, 0.0, literal.x, ; EG-NEXT: BFE_INT T31.Z, T7.Y, 0.0, literal.x, ; EG-NEXT: BFE_INT T41.W, T7.W, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T7.W, T21.Y, literal.x, +; EG-NEXT: LSHR * T7.W, T19.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 
0(0.000000e+00) ; EG-NEXT: BFE_INT T43.X, T20.X, 0.0, literal.x, ; EG-NEXT: BFE_INT T31.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T41.Z, T6.Z, 0.0, literal.x, ; EG-NEXT: BFE_INT T42.W, T6.Y, 0.0, literal.x, -; EG-NEXT: LSHR * T7.W, T21.Z, literal.x, +; EG-NEXT: LSHR * T7.W, T19.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T21.X, T20.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T19.X, T20.Y, 0.0, literal.x, ; EG-NEXT: BFE_INT T41.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T42.Z, T6.W, 0.0, literal.x, ; EG-NEXT: BFE_INT T43.W, T5.Z, 0.0, literal.x, -; EG-NEXT: LSHR * T6.W, T21.W, literal.x, +; EG-NEXT: LSHR * T6.W, T19.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T44.X, T20.Z, 0.0, literal.x, ; EG-NEXT: BFE_INT T42.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T43.Z, T5.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T21.W, T5.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T19.W, T5.W, 0.0, literal.x, ; EG-NEXT: LSHR * T5.W, T20.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T45.X, T20.W, 0.0, literal.x, ; EG-NEXT: BFE_INT T43.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T21.Z, T4.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T19.Z, T4.Z, 0.0, literal.x, ; EG-NEXT: BFE_INT T44.W, T4.Y, 0.0, literal.x, ; EG-NEXT: LSHR * T5.W, T20.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T46.X, T19.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T21.Y, PS, 0.0, literal.x, +; EG-NEXT: BFE_INT T46.X, T21.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T19.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T44.Z, T4.W, 0.0, literal.x, ; EG-NEXT: BFE_INT T45.W, T3.Z, 0.0, literal.x, ; EG-NEXT: LSHR * T4.W, T20.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T20.X, T19.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T20.X, T21.Y, 0.0, literal.x, ; EG-NEXT: BFE_INT T44.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T45.Z, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: BFE_INT T46.W, T3.W, 0.0, literal.x, ; EG-NEXT: LSHR * T3.W, T20.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T47.X, T19.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T47.X, T21.Z, 0.0, literal.x, ; EG-NEXT: BFE_INT T45.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T46.Z, T2.W, 0.0, literal.x, ; EG-NEXT: BFE_INT T20.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T2.W, T19.X, literal.x, +; EG-NEXT: LSHR * T2.W, T21.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T48.X, T19.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T48.X, T21.W, 0.0, literal.x, ; EG-NEXT: BFE_INT T46.Y, PS, 0.0, literal.x, ; EG-NEXT: BFE_INT T20.Z, T2.Y, 0.0, literal.x, ; EG-NEXT: BFE_INT T47.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 208(2.914701e-43) -; EG-NEXT: LSHR T19.X, PS, literal.x, +; EG-NEXT: LSHR T21.X, PS, literal.x, ; EG-NEXT: BFE_INT T20.Y, T1.Z, 0.0, literal.y, ; EG-NEXT: BFE_INT T47.Z, T1.Y, 0.0, literal.y, ; EG-NEXT: BFE_INT T48.W, T0.W, 0.0, literal.y, @@ -4389,7 +4382,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: LSHR T49.X, PS, literal.x, ; EG-NEXT: BFE_INT T47.Y, T0.Z, 0.0, literal.y, ; EG-NEXT: BFE_INT T48.Z, T0.Y, 0.0, literal.y, -; EG-NEXT: LSHR T0.W, T19.W, literal.y, +; EG-NEXT: LSHR T0.W, T21.W, literal.y, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00) @@ -5848,13 +5841,13 @@ define 
amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 ; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[40:41], s[6:7], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -5863,14 +5856,14 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s5 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s24 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 @@ -5899,11 +5892,11 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -5913,25 +5906,23 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s14, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s26, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s8, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 
-; GFX7-HSA-NEXT: s_ashr_i64 s[34:35], s[4:5], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[38:39], s[6:7], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s2, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s10, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s22, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 8 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 @@ -5941,75 +5932,76 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -6018,100 +6010,100 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s11, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s9, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 24 -; GFX8-NOHSA-NEXT: s_mov_b32 s24, s11 -; GFX8-NOHSA-NEXT: s_mov_b32 s4, s9 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s11 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s10 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s9 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s8 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 56 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s4, 24 +; GFX8-NOHSA-NEXT: s_mov_b32 s18, s7 +; GFX8-NOHSA-NEXT: s_mov_b32 s20, s5 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s7 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s6 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s5 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s4 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s10 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v1, 0, 8 ; GFX8-NOHSA-NEXT: v_bfe_i32 v14, v0, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s11 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s4 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v19, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s9 -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v4, 0, 8 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v4, 0, 8 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6223,109 +6215,109 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s0, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s1, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s3, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s5, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s20, s7, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s21, s6, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s22, s5, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s4, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s24, s3, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s2, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s26, s1, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s0, 0x80008 -; GFX6-NOHSA-NEXT: s_and_b32 s28, s0, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s29, s1, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s30, s2, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s31, s3, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s33, s4, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s34, s5, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s35, s6, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s36, s7, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s5, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s8, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s9, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s10, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s11, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s20, s11, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s21, s10, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s22, s9, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s8, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s24, s7, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s6, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s26, s5, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s4, 0x80008 +; GFX6-NOHSA-NEXT: s_and_b32 s28, s4, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s29, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s30, s6, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s31, s7, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s33, s8, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s34, s9, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s35, s10, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s36, s11, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, 
s7 +; GFX6-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:240 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:208 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:176 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:144 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:224 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s35 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:192 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:160 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s33 ; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:128 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64: @@ -6343,139 +6335,139 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s18, s10, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s19, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s20, s11, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s21, s10, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s22, s9, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s23, s8, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s24, s7, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s25, s6, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s26, s5, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s2, s4, 0x80008 -; GFX7-HSA-NEXT: s_and_b32 s3, s4, 0xff -; GFX7-HSA-NEXT: s_and_b32 s27, s5, 0xff -; GFX7-HSA-NEXT: s_and_b32 s28, s6, 0xff -; GFX7-HSA-NEXT: s_and_b32 s29, s7, 0xff -; GFX7-HSA-NEXT: s_and_b32 s30, s8, 0xff -; GFX7-HSA-NEXT: s_and_b32 s31, s9, 0xff -; GFX7-HSA-NEXT: s_and_b32 s33, s10, 0xff -; GFX7-HSA-NEXT: s_and_b32 s34, s11, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s35, s4, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s36, s5, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s2, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s19, s11, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s20, s10, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s21, s9, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s22, s8, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s23, s7, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s24, s6, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s25, s5, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s26, s4, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s27, s4, 0xff +; GFX7-HSA-NEXT: s_and_b32 s28, s5, 0xff +; GFX7-HSA-NEXT: s_and_b32 s29, s6, 0xff +; GFX7-HSA-NEXT: s_and_b32 s30, s7, 0xff +; GFX7-HSA-NEXT: s_and_b32 s31, s8, 0xff +; GFX7-HSA-NEXT: s_and_b32 s33, s9, 0xff +; GFX7-HSA-NEXT: s_and_b32 s34, s10, 0xff +; GFX7-HSA-NEXT: s_and_b32 s35, s11, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s4, s11, 0x80010 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, 
s0, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GFX7-HSA-NEXT: s_bfe_u32 s3, s11, 0x80010 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 48 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 16 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0xe0 
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 64 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -6491,149 +6483,144 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s5, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s7, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s9, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s15, s11, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s8, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s4, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s4, 0xff +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s11, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s15, s10, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s8, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s4, 24 +; GFX8-NOHSA-NEXT: s_and_b32 s19, s4, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4 -; GFX8-NOHSA-NEXT: s_and_b32 s3, s5, 0xff -; GFX8-NOHSA-NEXT: s_and_b32 s20, s6, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s6 -; GFX8-NOHSA-NEXT: s_and_b32 s21, s7, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7 -; GFX8-NOHSA-NEXT: s_and_b32 s22, s8, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s8 -; GFX8-NOHSA-NEXT: s_and_b32 s23, s9, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s9 -; GFX8-NOHSA-NEXT: s_and_b32 s24, s10, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s20, s5, 0xff +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s5 +; GFX8-NOHSA-NEXT: s_and_b32 s21, s6, 0xff +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s6 +; GFX8-NOHSA-NEXT: s_and_b32 s22, s7, 0xff +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s7 +; GFX8-NOHSA-NEXT: s_and_b32 s23, s8, 0xff +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v10, 8, s8 +; GFX8-NOHSA-NEXT: s_and_b32 s24, s9, 0xff +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s9 +; GFX8-NOHSA-NEXT: s_and_b32 s25, s10, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v12, 8, s10 -; GFX8-NOHSA-NEXT: s_and_b32 s25, s11, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s26, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s26, s11, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 ; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX8-NOHSA-NEXT: s_bfe_u32 s27, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 ; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010 -; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s11, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xb0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x70 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s11, 0x80010 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: 
s_add_u32 s2, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xd0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x90 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], 
v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v13, 8, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xc0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xa0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v12 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x80 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x80 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v11 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s23 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v10 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] 
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6820,412 +6807,409 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s50, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s44, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s3, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s14, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s26, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s3, 8 ; GFX6-NOHSA-NEXT: s_mov_b32 s40, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s1, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s62, s1 -; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s64, s0, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s66, s0, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[50:51], s[0:1], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[2:3], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s1, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_mov_b32 s52, s1 +; GFX6-NOHSA-NEXT: s_lshr_b32 s54, s0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s58, s0, 8 +; GFX6-NOHSA-NEXT: 
s_bfe_i64 s[60:61], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[62:63], s[0:1], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[64:65], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[66:67], s[4:5], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[68:69], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[54:55], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[70:71], s[2:3], 56 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s71 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s67 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s5 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[62:63], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[40:41], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[52:53], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[58:59], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[56:57], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[54:55], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: 
s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s49 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s70 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s71 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s65 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160 +; 
GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s62 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s63 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s39 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; 
GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s50, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s62, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s34, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 8 -; GFX7-HSA-NEXT: s_mov_b32 s16, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s70, s0, 8 -; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[20:21], s[2:3], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 56 -; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[6:7], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s4, s15, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s15, 8 +; GFX7-HSA-NEXT: s_mov_b32 s24, s15 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s14, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s14, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s30, s14, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s13, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s36, s13, 8 +; GFX7-HSA-NEXT: s_mov_b32 s38, s13 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s12, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s42, s12, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s12, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s46, s11, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s11, 8 +; GFX7-HSA-NEXT: s_mov_b32 s50, s11 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s10, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s10, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s10, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s9, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s9, 8 +; GFX7-HSA-NEXT: s_mov_b32 s58, s9 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s8, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s6, s8, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s62, s8, 8 +; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[64:65], s[10:11], 56 +; GFX7-HSA-NEXT: s_ashr_i64 s[10:11], s[14:15], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[66:67], s[12:13], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[68:69], s[12:13], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[14:15], 0x80000 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 
s[6:7], s[66:67], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s48 -; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xe0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s49 -; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s49 -; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 -; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xc0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 -; GFX7-HSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s42 -; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s43 -; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s43 -; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65 -; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s46 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s42 +; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; 
GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX7-HSA-NEXT: s_add_u32 s56, s0, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s57, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s23 +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0x90 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s40 -; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s41 +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s69 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s66 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s67 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s41 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s45 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x50 +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 +; GFX7-HSA-NEXT: s_add_u32 s22, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 +; GFX7-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s8, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: s_add_u32 s16, s0, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: s_addc_u32 s17, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 32 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s6, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s5, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s4, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s42, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s1, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s0, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s0, 24 -; GFX8-NOHSA-NEXT: s_mov_b32 s48, s7 -; GFX8-NOHSA-NEXT: s_mov_b32 s50, s5 -; GFX8-NOHSA-NEXT: s_mov_b32 s52, s3 -; GFX8-NOHSA-NEXT: s_mov_b32 s54, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s7 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s6 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s5 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s3 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s2 
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s0 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[18:19], s[0:1], 56 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[24:25], s[2:3], 56 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[56:57], s[4:5], 56 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[58:59], s[6:7], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s23, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s22, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s22, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s21, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s20, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s20, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s19, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s18, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s42, s18, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s17, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s16, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s16, 24 +; GFX8-NOHSA-NEXT: s_mov_b32 s50, s23 +; GFX8-NOHSA-NEXT: s_mov_b32 s14, s21 +; GFX8-NOHSA-NEXT: s_mov_b32 s10, s19 +; GFX8-NOHSA-NEXT: s_mov_b32 s6, s17 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s23 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s22 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s21 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s20 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s19 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s18 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s17 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s16 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[16:17], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[18:19], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[20:21], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[22:23], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[16:17], s[16:17], 56 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[18:19], s[18:19], 56 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[20:21], s[20:21], 56 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[22:23], s[22:23], 56 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 @@ -7235,147 +7219,149 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s28 -; GFX8-NOHSA-NEXT: s_add_u32 s28, s8, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s58 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s59 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s29 -; GFX8-NOHSA-NEXT: s_add_u32 
s28, s8, 0xd0 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s22 +; GFX8-NOHSA-NEXT: s_add_u32 s22, s0, 0xf0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s23 +; GFX8-NOHSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s23 +; GFX8-NOHSA-NEXT: s_add_u32 s22, s0, 0xd0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NOHSA-NEXT: s_addc_u32 s23, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s23 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v20, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s20 +; GFX8-NOHSA-NEXT: s_add_u32 s20, s0, 0xb0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s21 +; GFX8-NOHSA-NEXT: s_addc_u32 s21, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s31 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s34 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s29 -; GFX8-NOHSA-NEXT: s_add_u32 s28, s8, 0xb0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s36 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s37 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s56 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s57 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s29 -; GFX8-NOHSA-NEXT: s_add_u32 s28, s8, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s39 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s40 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s41 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s29 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v9, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s24 -; GFX8-NOHSA-NEXT: s_add_u32 s24, s8, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s25 -; GFX8-NOHSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s42 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s43 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s25 -; GFX8-NOHSA-NEXT: s_add_u32 s24, s8, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s21 +; GFX8-NOHSA-NEXT: s_add_u32 s20, s0, 0x90 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s45 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s47 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX8-NOHSA-NEXT: s_addc_u32 s21, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s36 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s37 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s21 ; GFX8-NOHSA-NEXT: 
flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_bfe_i32 v18, v5, 0, 8 +; GFX8-NOHSA-NEXT: v_bfe_i32 v18, v0, 0, 8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s18 -; GFX8-NOHSA-NEXT: s_add_u32 s18, s8, 48 +; GFX8-NOHSA-NEXT: s_add_u32 s18, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s19 -; GFX8-NOHSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s19, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s39 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s19 -; GFX8-NOHSA-NEXT: s_add_u32 s18, s8, 16 +; GFX8-NOHSA-NEXT: s_add_u32 s18, s0, 0x50 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s19, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s40 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s41 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s43 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s19 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v14, v8, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] ; GFX8-NOHSA-NEXT: v_bfe_i32 v22, v1, 0, 8 -; GFX8-NOHSA-NEXT: v_bfe_i32 v18, v4, 0, 8 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s16 +; GFX8-NOHSA-NEXT: s_add_u32 s16, s0, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s17 +; GFX8-NOHSA-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s17 +; GFX8-NOHSA-NEXT: s_add_u32 s16, s0, 16 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NOHSA-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s17 +; GFX8-NOHSA-NEXT: s_add_u32 s16, s0, 0xe0 +; GFX8-NOHSA-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 0xc0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v21, s13 +; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v9, 0, 8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v20, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v21, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23] +; GFX8-NOHSA-NEXT: v_bfe_i32 v14, v8, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 0xa0 ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v20, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v22, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v21, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s17 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: v_bfe_i32 v22, v4, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0x80 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v20, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v21, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s8 +; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x80 +; GFX8-NOHSA-NEXT: v_bfe_i32 v18, v5, 0, 8 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s9 +; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 64 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32 ; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v6, 0, 8 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v1, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -9343,44 +9329,44 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 +; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2 -; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5 +; GFX6-NOHSA-NEXT: s_and_b32 s12, s10, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s10, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s11, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s11, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s8, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s8, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s18, s9, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s9, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s20, s6, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s6, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s22, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s7, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s24, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s4, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s26, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s5, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NOHSA-NEXT: s_and_b32 s9, s9, 0xff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s8 +; GFX6-NOHSA-NEXT: s_and_b32 s8, s8, 0xff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s11 +; 
GFX6-NOHSA-NEXT: s_and_b32 s11, s11, 0xff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s10 +; GFX6-NOHSA-NEXT: s_and_b32 s10, s10, 0xff ; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16 ; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8 ; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16 @@ -9398,33 +9384,33 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16 ; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 ; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s26 ; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s24 ; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 -; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s22 ; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8 -; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20 +; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s20 ; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v4 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18 +; GFX6-NOHSA-NEXT: s_or_b32 s9, s9, s18 ; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16 +; GFX6-NOHSA-NEXT: s_or_b32 s8, s8, s16 ; GFX6-NOHSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v6 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14 -; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12 +; GFX6-NOHSA-NEXT: s_or_b32 s11, s11, s14 +; GFX6-NOHSA-NEXT: s_or_b32 s10, s10, s12 ; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s11 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: @@ -9630,25 +9616,25 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: ALU 103, @16, KC0[], KC1[] ; EG-NEXT: ALU 104, @120, KC0[], KC1[] ; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T42.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T41.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 +; 
EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1 ; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: MOV * T0.Y, T16.X, ; EG-NEXT: MOV * T35.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: AND_INT T0.W, T37.X, literal.x, +; EG-NEXT: AND_INT T0.W, T36.X, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, ; EG-NEXT: 255(3.573311e-43), -65536(nan) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, ; EG-NEXT: MOV * T16.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T37.X, literal.x, +; EG-NEXT: LSHL * T0.W, T36.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, @@ -9658,27 +9644,27 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: MOV T0.Y, T17.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T36.X, literal.x, PV.W, ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), -65536(nan) ; EG-NEXT: OR_INT * T1.W, PS, PV.W, ; EG-NEXT: MOV * T17.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.X, literal.x, +; EG-NEXT: LSHR * T1.W, T36.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.Y, PV.W, PS, +; EG-NEXT: OR_INT * T37.Y, PV.W, PS, ; EG-NEXT: MOV T17.X, PV.Y, ; EG-NEXT: MOV * T0.Y, T12.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y, +; EG-NEXT: AND_INT * T2.W, T36.Y, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV * T12.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T37.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T36.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, @@ -9686,28 +9672,28 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV T12.X, PV.W, ; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W, +; EG-NEXT: BFE_UINT * T1.W, T36.Y, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, ; EG-NEXT: MOV * T13.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.Y, literal.x, +; EG-NEXT: LSHR * T1.W, T36.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.W, PV.W, PS, +; EG-NEXT: OR_INT * T37.W, PV.W, PS, ; EG-NEXT: MOV T13.X, PV.W, ; EG-NEXT: MOV * T0.Y, T8.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y, +; EG-NEXT: AND_INT * T2.W, T36.Z, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV * T8.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T37.Z, literal.x, +; EG-NEXT: LSHL * T1.W, T36.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, @@ -9715,28 +9701,28 @@ define 
amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV T8.X, PV.W, ; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W, +; EG-NEXT: BFE_UINT * T1.W, T36.Z, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, ; EG-NEXT: MOV * T9.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.Z, literal.x, +; EG-NEXT: LSHR * T1.W, T36.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.Y, PV.W, PS, +; EG-NEXT: OR_INT * T36.Y, PV.W, PS, ; EG-NEXT: MOV T9.X, PV.Y, ; EG-NEXT: MOV * T0.Y, T4.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T37.W, literal.y, +; EG-NEXT: AND_INT * T2.W, T36.W, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV * T4.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T37.W, literal.x, +; EG-NEXT: LSHL * T1.W, T36.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, @@ -9744,7 +9730,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV T4.X, PV.W, ; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W, +; EG-NEXT: BFE_UINT * T1.W, T36.W, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 120: ; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x, @@ -9752,12 +9738,12 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, ; EG-NEXT: MOV * T5.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.W, literal.x, +; EG-NEXT: LSHR * T1.W, T36.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.W, PV.W, PS, +; EG-NEXT: OR_INT * T36.W, PV.W, PS, ; EG-NEXT: MOV T5.X, PV.W, ; EG-NEXT: MOV * T0.Y, T32.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, @@ -9887,10 +9873,10 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: MOV T21.X, PV.W, -; EG-NEXT: MOV * T36.X, T16.X, -; EG-NEXT: MOV * T36.Z, T12.X, -; EG-NEXT: MOV T37.X, T8.X, -; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T37.X, T16.X, +; EG-NEXT: MOV * T37.Z, T12.X, +; EG-NEXT: MOV T36.X, T8.X, +; EG-NEXT: MOV T36.Z, T4.X, BS:VEC_120/SCL_212 ; EG-NEXT: MOV * T38.X, T32.X, ; EG-NEXT: MOV * T38.Z, T28.X, ; EG-NEXT: MOV T35.X, T24.X, @@ -9904,115 +9890,115 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; 
GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 +; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s1, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s13, s1, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s14, s1, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s1, s1 -; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s0, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s16, s0, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s17, s0, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s0, s0 -; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s3, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s19, s3, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s20, s3, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s3, s3 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s2, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s22, s2, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s23, s2, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s2, s2 -; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s25, s5, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s26, s5, 0x80008 +; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s13, s5, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s14, s5, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s4, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s28, s4, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s29, s4, 0x80008 +; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s4, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s16, s4, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s17, s4, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s4, s4 -; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s31, s7, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s33, s7, 0x80008 +; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s19, s7, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s20, s7, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s6, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s35, s6, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s36, s6, 0x80008 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s6, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s22, s6, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s23, s6, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s9, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s25, s9, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s26, s9, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s8, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s28, s8, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s29, s8, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s11, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s31, s11, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s33, s11, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s11, s11 +; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s10, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s35, s10, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s36, s10, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 16 ; GFX6-NOHSA-NEXT: s_and_b32 s13, s13, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 16 -; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s15, 16 ; GFX6-NOHSA-NEXT: s_and_b32 s16, s16, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 16 -; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 16 ; GFX6-NOHSA-NEXT: s_and_b32 s19, s19, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 16 -; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 
0xffff +; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s21, s21, 16 ; GFX6-NOHSA-NEXT: s_and_b32 s22, s22, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s23, s23, 16 -; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 16 ; GFX6-NOHSA-NEXT: s_and_b32 s25, s25, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 16 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NOHSA-NEXT: s_and_b32 s9, s9, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s27, s27, 16 ; GFX6-NOHSA-NEXT: s_and_b32 s28, s28, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s29, s29, 16 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NOHSA-NEXT: s_and_b32 s8, s8, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s30, s30, 16 ; GFX6-NOHSA-NEXT: s_and_b32 s31, s31, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s33, s33, 16 -; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NOHSA-NEXT: s_and_b32 s11, s11, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s34, s34, 16 ; GFX6-NOHSA-NEXT: s_and_b32 s35, s35, 0xffff ; GFX6-NOHSA-NEXT: s_lshl_b32 s36, s36, 16 -; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NOHSA-NEXT: s_and_b32 s10, s10, 0xffff ; GFX6-NOHSA-NEXT: s_or_b32 s12, s13, s12 -; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s14 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14 ; GFX6-NOHSA-NEXT: s_or_b32 s13, s16, s15 -; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s17 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s17 ; GFX6-NOHSA-NEXT: s_or_b32 s14, s19, s18 -; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s20 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s20 ; GFX6-NOHSA-NEXT: s_or_b32 s15, s22, s21 -; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s23 +; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s23 ; GFX6-NOHSA-NEXT: s_or_b32 s16, s25, s24 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s26 +; GFX6-NOHSA-NEXT: s_or_b32 s9, s9, s26 ; GFX6-NOHSA-NEXT: s_or_b32 s17, s28, s27 ; GFX6-NOHSA-NEXT: s_or_b32 s18, s31, s30 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s33 +; GFX6-NOHSA-NEXT: s_or_b32 s11, s11, s33 ; GFX6-NOHSA-NEXT: s_or_b32 s19, s35, s34 -; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s36 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NOHSA-NEXT: s_or_b32 s10, s10, s36 +; GFX6-NOHSA-NEXT: s_or_b32 s8, s8, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s16 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s14 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index e89c44d5b94a89..8d36eb392689be 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -622,32 +622,32 @@ entry: define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, s10 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, s11 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[8:11], 0 offset:4 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:6 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[8:11], 0 offset:8 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 offset:10 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[8:11], 0 offset:12 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v7, off, s[8:11], 0 offset:14 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[8:11], 0 offset:18 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[8:11], 0 offset:20 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[8:11], 0 offset:22 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[8:11], 0 offset:24 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v13, off, s[8:11], 0 offset:26 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v14, off, s[8:11], 0 offset:28 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[8:11], 0 offset:30 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:2 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:4 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:6 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:8 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 offset:10 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[4:7], 0 offset:12 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v7, off, s[4:7], 0 offset:14 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[4:7], 0 offset:18 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[4:7], 0 offset:20 +; GCN-NOHSA-SI-NEXT: 
buffer_load_ushort v11, off, s[4:7], 0 offset:22 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[4:7], 0 offset:24 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v13, off, s[4:7], 0 offset:26 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v14, off, s[4:7], 0 offset:28 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[4:7], 0 offset:30 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 @@ -666,8 +666,8 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v13, v12 ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v5, v11, v10 ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_load_v16i16_align2: @@ -2108,17 +2108,17 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 @@ -2138,10 +2138,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v7 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: @@ -2196,18 +2196,18 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) @@ -2226,10 +2226,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v16i16_to_v16i32: @@ -2344,17 +2344,17 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v1 @@ -2374,10 +2374,10 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v6 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v7, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16 -; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: @@ -2432,18 +2432,18 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) @@ -2462,10 +2462,10 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i16_to_v16i32: @@ -2660,27 +2660,27 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 
v9, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 @@ -2689,64 +2689,64 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; 
GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[16:19] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: @@ -3068,22 +3068,22 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 @@ -3101,63 +3101,63 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v0, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[15:18] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v18, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[15:18] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 
v4, v8, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v2, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v10, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[3:6] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v6, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v12, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v2, v15, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v14, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: @@ -3460,115 +3460,103 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; GCN-NOHSA-SI-NEXT: 
v_lshrrev_b32_e32 v37, 16, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xffff, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v12 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v25, 0xffff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v30 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v57, 0xffff, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 
v55, 0xffff, v27 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v29, 0xffff, v34 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v61, 0xffff, v32 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v59, 0xffff, v31 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v38 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v37 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v36 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v40, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v44, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v48, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v18 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v18 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v52, 0xffff, v16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v56, 0xffff, v20 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v24 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; 
GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v26 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v62, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v24 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v30 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded 
Reload @@ -3583,190 +3571,191 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s6 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[26:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v36, 0xffff, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: 
s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[34:37] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v22 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v2 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[32:35] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[19:22] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v3, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[15:18] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[16:19] -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v30 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v27 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v28 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v27 -; GCN-HSA-NEXT: v_and_b32_e32 v29, 0xffff, v25 -; GCN-HSA-NEXT: v_and_b32_e32 v27, 0xffff, v24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; 
GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[27:30] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v23 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v22 +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v33 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v32 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v26 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v35 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v34 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v34 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v28 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v31 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v30 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: @@ -3784,112 +3773,102 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v32, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, 
s[88:91], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v40 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v57, 0xffff, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v27 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v61, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v59, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v38 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v37 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v42 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v41 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v33, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v34, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v35, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: 
v_and_b32_e32 v0, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v18 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v17 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xffff, v16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, 0xffff, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v20 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v24 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v26 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, 0xffff, v25 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, 0xffff, v24 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v30 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 +; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload @@ -3913,97 +3892,97 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T59.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T58.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T56.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T55.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T53.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T48.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T55.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T53.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T48.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T46.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T46.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T44.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T41.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T41.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T36.XYZW, T37.X, 0, #1 -; EG-NEXT: VTX_READ_128 T38.XYZW, T37.X, 48, #1 -; EG-NEXT: VTX_READ_128 T39.XYZW, T37.X, 32, #1 -; EG-NEXT: VTX_READ_128 T40.XYZW, T37.X, 16, #1 +; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1 +; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 48, #1 +; EG-NEXT: VTX_READ_128 T38.XYZW, T35.X, 32, #1 +; EG-NEXT: VTX_READ_128 T39.XYZW, 
T35.X, 16, #1 ; EG-NEXT: Fetch clause starting at 30: -; EG-NEXT: VTX_READ_128 T49.XYZW, T37.X, 112, #1 -; EG-NEXT: VTX_READ_128 T50.XYZW, T37.X, 96, #1 -; EG-NEXT: VTX_READ_128 T51.XYZW, T37.X, 80, #1 -; EG-NEXT: VTX_READ_128 T52.XYZW, T37.X, 64, #1 +; EG-NEXT: VTX_READ_128 T49.XYZW, T35.X, 112, #1 +; EG-NEXT: VTX_READ_128 T50.XYZW, T35.X, 96, #1 +; EG-NEXT: VTX_READ_128 T51.XYZW, T35.X, 80, #1 +; EG-NEXT: VTX_READ_128 T52.XYZW, T35.X, 64, #1 ; EG-NEXT: ALU clause starting at 38: -; EG-NEXT: MOV * T37.X, KC0[2].Z, +; EG-NEXT: MOV * T35.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 39: -; EG-NEXT: LSHR * T35.W, T36.W, literal.x, +; EG-NEXT: LSHR * T40.W, T36.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T35.Z, T36.W, literal.x, +; EG-NEXT: AND_INT * T40.Z, T36.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR T35.Y, T36.Z, literal.x, +; EG-NEXT: LSHR T40.Y, T36.Z, literal.x, ; EG-NEXT: LSHR * T36.W, T36.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T35.X, T36.Z, literal.x, +; EG-NEXT: AND_INT T40.X, T36.Z, literal.x, ; EG-NEXT: AND_INT T36.Z, T36.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LSHR T41.X, PV.W, literal.x, ; EG-NEXT: LSHR T36.Y, T36.X, literal.y, -; EG-NEXT: LSHR T42.W, T40.W, literal.y, +; EG-NEXT: LSHR T42.W, T39.W, literal.y, ; EG-NEXT: AND_INT * T36.X, T36.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT * T42.Z, T40.W, literal.x, +; EG-NEXT: AND_INT * T42.Z, T39.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LSHR T43.X, KC0[2].Y, literal.x, -; EG-NEXT: LSHR T42.Y, T40.Z, literal.y, -; EG-NEXT: LSHR T40.W, T40.Y, literal.y, -; EG-NEXT: AND_INT * T42.X, T40.Z, literal.z, +; EG-NEXT: LSHR T42.Y, T39.Z, literal.y, +; EG-NEXT: LSHR T39.W, T39.Y, literal.y, +; EG-NEXT: AND_INT * T42.X, T39.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T40.Z, T40.Y, literal.x, +; EG-NEXT: AND_INT T39.Z, T39.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) ; EG-NEXT: LSHR T44.X, PV.W, literal.x, -; EG-NEXT: LSHR T40.Y, T40.X, literal.y, -; EG-NEXT: LSHR T45.W, T39.W, literal.y, -; EG-NEXT: AND_INT * T40.X, T40.X, literal.z, +; EG-NEXT: LSHR T39.Y, T39.X, literal.y, +; EG-NEXT: LSHR T45.W, T38.W, literal.y, +; EG-NEXT: AND_INT * T39.X, T39.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T45.Z, T39.W, literal.x, +; EG-NEXT: AND_INT T45.Z, T38.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) ; EG-NEXT: LSHR T46.X, PV.W, literal.x, -; EG-NEXT: LSHR T45.Y, T39.Z, literal.y, -; EG-NEXT: LSHR T39.W, T39.Y, literal.y, -; EG-NEXT: AND_INT * T45.X, T39.Z, literal.z, +; EG-NEXT: LSHR T45.Y, T38.Z, literal.y, +; EG-NEXT: LSHR T38.W, T38.Y, literal.y, +; EG-NEXT: AND_INT * T45.X, T38.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T39.Z, T39.Y, literal.x, +; EG-NEXT: AND_INT T38.Z, T38.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) ; EG-NEXT: LSHR T47.X, PV.W, literal.x, -; EG-NEXT: LSHR T39.Y, T39.X, literal.y, -; 
EG-NEXT: AND_INT * T39.X, T39.X, literal.z, +; EG-NEXT: LSHR T38.Y, T38.X, literal.y, +; EG-NEXT: AND_INT * T38.X, T38.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: LSHR * T37.W, T38.W, literal.y, +; EG-NEXT: LSHR * T35.W, T37.W, literal.y, ; EG-NEXT: 64(8.968310e-44), 16(2.242078e-44) ; EG-NEXT: LSHR T48.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T37.Z, T38.W, literal.y, +; EG-NEXT: AND_INT * T35.Z, T37.W, literal.y, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: ALU clause starting at 96: -; EG-NEXT: LSHR T37.Y, T38.Z, literal.x, -; EG-NEXT: LSHR * T38.W, T38.Y, literal.x, +; EG-NEXT: LSHR T35.Y, T37.Z, literal.x, +; EG-NEXT: LSHR * T37.W, T37.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T37.X, T38.Z, literal.x, -; EG-NEXT: AND_INT T38.Z, T38.Y, literal.x, +; EG-NEXT: AND_INT T35.X, T37.Z, literal.x, +; EG-NEXT: AND_INT T37.Z, T37.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) ; EG-NEXT: LSHR T53.X, PV.W, literal.x, -; EG-NEXT: LSHR T38.Y, T38.X, literal.y, +; EG-NEXT: LSHR T37.Y, T37.X, literal.y, ; EG-NEXT: LSHR T54.W, T52.W, literal.y, -; EG-NEXT: AND_INT * T38.X, T38.X, literal.z, +; EG-NEXT: AND_INT * T37.X, T37.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: AND_INT T54.Z, T52.W, literal.x, @@ -4275,100 +4254,100 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[40:43], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v19 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v32, off, s[8:11], 0 offset:4 ; 
4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v35 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v34 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v35, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v34, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v33 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v32 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v33, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v32, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v39, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v38, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v37 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v36 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v37, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v36, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v43 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v42 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v43, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v42, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v41 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v40 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v41, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v40, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v31 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v31, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v30, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v29 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v29, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v28, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 16, v27 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 16, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v26, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v25 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v25, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v24, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v23, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v21 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v33, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v34, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v35, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v17 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v16 +; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 16, v23 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v22 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v20 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v27 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 16, v26 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v26, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v25, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v31, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v30, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v29 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v28 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v29, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v28, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v15, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v13 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v9 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 16, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v7, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 16, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v3, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload @@ -4383,42 +4362,42 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s2, 64 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GCN-HSA-NEXT: s_add_u32 s8, s2, 32 +; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 -; 
GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[24:25] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 ; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 @@ -4426,146 +4405,149 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v21 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v20 -; GCN-HSA-NEXT: v_bfe_i32 v34, v21, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v32, v20, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v30, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v0, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v23 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v22 -; GCN-HSA-NEXT: v_bfe_i32 v34, v23, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v32, v22, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[32:35] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v30, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v17 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_bfe_i32 v22, v17, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v16, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[0:3] ; GCN-HSA-NEXT: 
v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v19 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v18 -; GCN-HSA-NEXT: v_bfe_i32 v21, v19, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v18, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[19:22] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v22, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v14, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v6, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v10, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[15:18] -; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[11:14] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v0, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[4:7] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v6, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v12, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v2, v15, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v14, 0, 16 +; 
GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v21 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v20 +; GCN-HSA-NEXT: v_bfe_i32 v10, v21, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v20, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v28 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v24 +; GCN-HSA-NEXT: v_bfe_i32 v14, v25, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v24, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 +; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v18, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v17, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v16, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v23 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v22 +; GCN-HSA-NEXT: v_bfe_i32 v18, v23, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v22, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_bfe_i32 v19, v28, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v27 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v26 -; GCN-HSA-NEXT: v_bfe_i32 v2, v27, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v26, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 16, v25 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v24 -; GCN-HSA-NEXT: v_bfe_i32 v27, v25, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v25, v24, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v33 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v32 +; GCN-HSA-NEXT: v_bfe_i32 v22, v33, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v20, v32, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[25:28] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v35 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v34 +; GCN-HSA-NEXT: v_bfe_i32 v18, v35, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v34, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v29 -; GCN-HSA-NEXT: v_bfe_i32 v21, v29, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; 
GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v27 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v31 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v30 -; GCN-HSA-NEXT: v_bfe_i32 v17, v31, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v30, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v26 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: v_bfe_i32 v10, v27, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v26, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: @@ -4583,112 +4565,102 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, 
off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v3 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v32, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v42, 16, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v40, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v41, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v50, 16, v1 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v48, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v49, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v36 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v35 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v35, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v46, 16, v5 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v44, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v45, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v43, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v54, 16, v26 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v52, 16, v25 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v53, v26, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v25, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v58, 16, v24 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v56, 16, v23 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v57, v24, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v23, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v30 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v29 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v30, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v29, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v62, 16, v28 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v60, 16, v27 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v61, v28, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v27, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v34 -; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v33 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v34, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v33, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v32 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v31 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v32, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v31, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 16, v38 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v37 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v38, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v37, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v33, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v34, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v35, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v1 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v29 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v28 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v29, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v28, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v5 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v47, 16, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v19 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v19, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v18, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v17, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v16, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v23 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v22 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v21, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v20, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v27 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v26 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v27, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v26, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v63, 16, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 16, v24 +; 
GCN-NOHSA-VI-NEXT: v_bfe_i32 v62, v25, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v31, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v30, 0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload @@ -4910,9 +4882,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: ALU 82, @57, KC0[CB0:0-32], KC1[] ; CM-NEXT: ALU 72, @140, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T65, T66.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T35.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T37.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD 
T64, T56.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T55.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T55.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T63, T54.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T53.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T62, T52.X @@ -4928,17 +4900,17 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 24: -; CM-NEXT: VTX_READ_128 T36.XYZW, T37.X, 16, #1 -; CM-NEXT: VTX_READ_128 T35.XYZW, T37.X, 0, #1 +; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1 +; CM-NEXT: VTX_READ_128 T37.XYZW, T35.X, 0, #1 ; CM-NEXT: Fetch clause starting at 28: -; CM-NEXT: VTX_READ_128 T41.XYZW, T37.X, 112, #1 -; CM-NEXT: VTX_READ_128 T42.XYZW, T37.X, 96, #1 -; CM-NEXT: VTX_READ_128 T43.XYZW, T37.X, 80, #1 -; CM-NEXT: VTX_READ_128 T44.XYZW, T37.X, 64, #1 -; CM-NEXT: VTX_READ_128 T45.XYZW, T37.X, 48, #1 -; CM-NEXT: VTX_READ_128 T37.XYZW, T37.X, 32, #1 +; CM-NEXT: VTX_READ_128 T41.XYZW, T35.X, 112, #1 +; CM-NEXT: VTX_READ_128 T42.XYZW, T35.X, 96, #1 +; CM-NEXT: VTX_READ_128 T43.XYZW, T35.X, 80, #1 +; CM-NEXT: VTX_READ_128 T44.XYZW, T35.X, 64, #1 +; CM-NEXT: VTX_READ_128 T45.XYZW, T35.X, 48, #1 +; CM-NEXT: VTX_READ_128 T35.XYZW, T35.X, 32, #1 ; CM-NEXT: ALU clause starting at 40: -; CM-NEXT: MOV * T37.X, KC0[2].Z, +; CM-NEXT: MOV * T35.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 41: ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00) @@ -4946,13 +4918,13 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43) ; CM-NEXT: LSHR T39.X, PV.W, literal.x, -; CM-NEXT: LSHR T0.Y, T35.Z, literal.y, -; CM-NEXT: LSHR T0.Z, T35.W, literal.y, +; CM-NEXT: LSHR T0.Y, T37.Z, literal.y, +; CM-NEXT: LSHR T0.Z, T37.W, literal.y, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 192(2.690493e-43), 0(0.000000e+00) ; CM-NEXT: LSHR T40.X, PV.W, literal.x, -; CM-NEXT: LSHR T1.Y, T35.Y, literal.y, +; CM-NEXT: LSHR T1.Y, T37.Y, literal.y, ; CM-NEXT: LSHR T1.Z, T36.Z, literal.y, ; CM-NEXT: LSHR * T0.W, T36.W, literal.y, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -4962,18 +4934,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: 16(2.242078e-44), 208(2.914701e-43) ; CM-NEXT: LSHR T46.X, PV.W, literal.x, ; CM-NEXT: LSHR T2.Y, T36.Y, literal.y, -; CM-NEXT: LSHR T3.Z, T37.Z, literal.y, +; CM-NEXT: LSHR T3.Z, T35.Z, literal.y, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 160(2.242078e-43), 0(0.000000e+00) ; CM-NEXT: LSHR T47.X, PV.W, literal.x, -; CM-NEXT: LSHR T3.Y, T37.W, literal.y, -; CM-NEXT: LSHR T4.Z, T37.X, literal.y, +; CM-NEXT: LSHR T3.Y, T35.W, literal.y, +; CM-NEXT: LSHR T4.Z, T35.X, literal.y, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 176(2.466285e-43), 0(0.000000e+00) ; CM-NEXT: LSHR T48.X, PV.W, literal.x, -; CM-NEXT: LSHR T4.Y, T37.Y, literal.y, +; CM-NEXT: LSHR T4.Y, T35.Y, literal.y, ; CM-NEXT: LSHR T5.Z, T45.Z, literal.y, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -5073,40 +5045,40 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; CM-NEXT: BFE_INT T62.X, T45.Z, 0.0, 
literal.x, ; CM-NEXT: BFE_INT T61.Y, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T45.Z, T37.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT T45.Z, T35.Y, 0.0, literal.x, ; CM-NEXT: BFE_INT * T44.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T45.X, T37.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T45.X, T35.X, 0.0, literal.x, ; CM-NEXT: BFE_INT T44.Y, T6.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T63.Z, T37.W, 0.0, literal.x, +; CM-NEXT: BFE_INT T63.Z, T35.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T62.W, T5.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T63.X, T37.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T63.X, T35.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T62.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T37.Z, T36.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT T35.Z, T36.Y, 0.0, literal.x, ; CM-NEXT: BFE_INT * T45.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T37.X, T36.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T35.X, T36.X, 0.0, literal.x, ; CM-NEXT: BFE_INT T45.Y, T4.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T64.Z, T36.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T63.W, T3.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; CM-NEXT: BFE_INT T64.X, T36.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T63.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T36.Z, T35.Y, 0.0, literal.x, -; CM-NEXT: BFE_INT * T37.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: BFE_INT T36.Z, T37.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT * T35.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T36.X, T35.X, 0.0, literal.x, -; CM-NEXT: BFE_INT T37.Y, T2.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T65.Z, T35.W, 0.0, literal.x, +; CM-NEXT: BFE_INT T36.X, T37.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T35.Y, T2.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T65.Z, T37.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T65.X, T35.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T65.X, T37.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T64.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: LSHR T1.Z, T35.X, literal.x, +; CM-NEXT: LSHR T1.Z, T37.X, literal.x, ; CM-NEXT: BFE_INT * T36.W, T1.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T35.X, KC0[2].Y, literal.x, +; CM-NEXT: LSHR T37.X, KC0[2].Y, literal.x, ; CM-NEXT: BFE_INT T36.Y, PV.Z, 0.0, literal.y, ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y, ; CM-NEXT: BFE_INT * T65.W, T0.Z, 0.0, literal.y, @@ -6019,14 +5991,14 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 +; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0 @@ -6036,8 +6008,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 @@ -6047,10 +6019,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: @@ -6099,19 +6071,19 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 @@ -6127,10 +6099,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v8i16_to_v8i64: @@ -6237,17 +6209,17 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 @@ -6266,10 +6238,10 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: @@ -6319,17 +6291,17 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; 
GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -6352,10 +6324,10 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v8i16_to_v8i64: @@ -6904,12 +6876,12 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 @@ -6941,48 +6913,48 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v16, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: v_mov_b32_e32 v16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; 
GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v16, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v12, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 -; GCN-HSA-NEXT: v_bfe_i32 v6, v16, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[6:7], 48 +; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v16, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; @@ -6996,62 +6968,62 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v1, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v13, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i16_to_v16i64: @@ -7245,139 +7217,168 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[19:22], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[25:28], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[2:5], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[6:9], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v62, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v27 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v26 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xffff, v28 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v31 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 16, v30 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v30 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v32 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GCN-NOHSA-SI-NEXT: 
v_and_b32_e32 v19, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v36 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v33 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v57, 0xffff, v33 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v35 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v35 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v50, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v52, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v42, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v44, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v38, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v40, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v58, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v46, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, v24 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(4) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v14 +; 
GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v1 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v63, 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v60, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v63, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 
v13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(4) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v60, 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 @@ -7392,124 +7393,119 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, v16 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s10 ; 
GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[17:20] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 
v0, 0xffff, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[17:20] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[7:10] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3] +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v11 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[23:26] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v27, 0xffff, v12 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[27:30] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[19:22] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v6 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[23:26] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[19:22] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[1:4] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[5:8] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 
v[0:1], v[5:8] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: @@ -7522,95 +7518,95 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[15:18], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v56, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v47, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v58, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v56 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v28 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v55 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v28, v55 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v55 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v55 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v50, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v55 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v56 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: 
v_lshrrev_b32_e32 v53, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v56 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v56 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v56 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v56 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i16_to_v32i64: @@ -7783,117 +7779,117 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; CM-NEXT: ALU 33, @31, KC0[], KC1[] ; CM-NEXT: TEX 0 @28 ; CM-NEXT: ALU 94, @65, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T50.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T50.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T49.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T48.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T47.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T46.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T46.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T45.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T44.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, 
T43.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T42.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T42.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T41.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T40.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T39.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T38.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T38.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T37.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T34, T36.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T23.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T22.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 22: -; CM-NEXT: VTX_READ_128 T21.XYZW, T20.X, 0, #1 -; CM-NEXT: VTX_READ_128 T22.XYZW, T20.X, 32, #1 -; CM-NEXT: VTX_READ_128 T23.XYZW, T20.X, 16, #1 +; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1 +; CM-NEXT: VTX_READ_128 T21.XYZW, T19.X, 32, #1 +; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 16, #1 ; CM-NEXT: Fetch clause starting at 28: -; CM-NEXT: VTX_READ_128 T23.XYZW, T20.X, 48, #1 +; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 48, #1 ; CM-NEXT: ALU clause starting at 30: -; CM-NEXT: MOV * T20.X, KC0[2].Z, +; CM-NEXT: MOV * T19.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 31: -; CM-NEXT: LSHR * T19.Z, T21.Y, literal.x, +; CM-NEXT: LSHR * T23.Z, T20.Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT T19.X, T21.Y, literal.x, -; CM-NEXT: MOV T19.Y, 0.0, -; CM-NEXT: LSHR * T24.Z, T21.X, literal.y, +; CM-NEXT: AND_INT T23.X, T20.Y, literal.x, +; CM-NEXT: MOV T23.Y, 0.0, +; CM-NEXT: LSHR * T24.Z, T20.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T24.X, T21.X, literal.x, +; CM-NEXT: AND_INT T24.X, T20.X, literal.x, ; CM-NEXT: MOV T24.Y, 0.0, -; CM-NEXT: LSHR * T25.Z, T21.W, literal.y, +; CM-NEXT: LSHR * T25.Z, T20.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T25.X, T21.W, literal.x, +; CM-NEXT: AND_INT T25.X, T20.W, literal.x, ; CM-NEXT: MOV T25.Y, 0.0, -; CM-NEXT: LSHR * T26.Z, T21.Z, literal.y, +; CM-NEXT: LSHR * T26.Z, T20.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T26.X, T21.Z, literal.x, +; CM-NEXT: AND_INT T26.X, T20.Z, literal.x, ; CM-NEXT: MOV T26.Y, 0.0, -; CM-NEXT: LSHR * T21.Z, T23.Y, literal.y, +; CM-NEXT: LSHR * T20.Z, T22.Y, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T21.X, T23.Y, literal.x, -; CM-NEXT: MOV T21.Y, 0.0, -; CM-NEXT: LSHR * T27.Z, T23.X, literal.y, +; CM-NEXT: AND_INT T20.X, T22.Y, literal.x, +; CM-NEXT: MOV T20.Y, 0.0, +; CM-NEXT: LSHR * T27.Z, T22.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T27.X, T23.X, literal.x, +; CM-NEXT: AND_INT T27.X, T22.X, literal.x, ; CM-NEXT: MOV T27.Y, 0.0, -; CM-NEXT: LSHR * T28.Z, T23.W, literal.y, +; CM-NEXT: LSHR * T28.Z, T22.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T28.X, T23.W, literal.x, +; CM-NEXT: AND_INT T28.X, T22.W, literal.x, ; CM-NEXT: MOV T28.Y, 0.0, -; CM-NEXT: LSHR * T29.Z, T23.Z, literal.y, +; CM-NEXT: LSHR * T29.Z, T22.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T29.X, T23.Z, literal.x, +; CM-NEXT: AND_INT T29.X, T22.Z, literal.x, ; CM-NEXT: MOV T29.Y, 0.0, -; CM-NEXT: LSHR * T20.Z, T22.Y, literal.y, +; CM-NEXT: LSHR * T19.Z, T21.Y, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; CM-NEXT: ALU clause starting at 65: -; CM-NEXT: AND_INT T20.X, T22.Y, literal.x, -; CM-NEXT: MOV T20.Y, 0.0, -; 
CM-NEXT: LSHR * T30.Z, T22.X, literal.y, +; CM-NEXT: AND_INT T19.X, T21.Y, literal.x, +; CM-NEXT: MOV T19.Y, 0.0, +; CM-NEXT: LSHR * T30.Z, T21.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T30.X, T22.X, literal.x, +; CM-NEXT: AND_INT T30.X, T21.X, literal.x, ; CM-NEXT: MOV T30.Y, 0.0, -; CM-NEXT: LSHR * T31.Z, T22.W, literal.y, +; CM-NEXT: LSHR * T31.Z, T21.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T31.X, T22.W, literal.x, +; CM-NEXT: AND_INT T31.X, T21.W, literal.x, ; CM-NEXT: MOV T31.Y, 0.0, -; CM-NEXT: LSHR * T32.Z, T22.Z, literal.y, +; CM-NEXT: LSHR * T32.Z, T21.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T32.X, T22.Z, literal.x, +; CM-NEXT: AND_INT T32.X, T21.Z, literal.x, ; CM-NEXT: MOV T32.Y, 0.0, -; CM-NEXT: LSHR * T22.Z, T23.Y, literal.y, +; CM-NEXT: LSHR * T21.Z, T22.Y, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T22.X, T23.Y, literal.x, -; CM-NEXT: MOV T22.Y, 0.0, -; CM-NEXT: LSHR * T33.Z, T23.X, literal.y, +; CM-NEXT: AND_INT T21.X, T22.Y, literal.x, +; CM-NEXT: MOV T21.Y, 0.0, +; CM-NEXT: LSHR * T33.Z, T22.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T33.X, T23.X, literal.x, +; CM-NEXT: AND_INT T33.X, T22.X, literal.x, ; CM-NEXT: MOV T33.Y, 0.0, -; CM-NEXT: LSHR * T34.Z, T23.W, literal.y, +; CM-NEXT: LSHR * T34.Z, T22.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T34.X, T23.W, literal.x, +; CM-NEXT: AND_INT T34.X, T22.W, literal.x, ; CM-NEXT: MOV T34.Y, 0.0, -; CM-NEXT: LSHR * T35.Z, T23.Z, literal.y, +; CM-NEXT: LSHR * T35.Z, T22.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T35.X, T23.Z, literal.x, +; CM-NEXT: AND_INT T35.X, T22.Z, literal.x, ; CM-NEXT: MOV T35.Y, 0.0, -; CM-NEXT: MOV * T19.W, 0.0, +; CM-NEXT: MOV * T23.W, 0.0, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; CM-NEXT: MOV * T24.W, 0.0, ; CM-NEXT: MOV * T25.W, 0.0, ; CM-NEXT: MOV * T26.W, 0.0, -; CM-NEXT: MOV * T21.W, 0.0, +; CM-NEXT: MOV * T20.W, 0.0, ; CM-NEXT: MOV * T27.W, 0.0, ; CM-NEXT: MOV * T28.W, 0.0, ; CM-NEXT: MOV * T29.W, 0.0, -; CM-NEXT: MOV * T20.W, 0.0, +; CM-NEXT: MOV * T19.W, 0.0, ; CM-NEXT: MOV * T30.W, 0.0, ; CM-NEXT: MOV * T31.W, 0.0, ; CM-NEXT: MOV * T32.W, 0.0, -; CM-NEXT: MOV * T22.W, 0.0, +; CM-NEXT: MOV * T21.W, 0.0, ; CM-NEXT: MOV * T33.W, 0.0, ; CM-NEXT: MOV * T34.W, 0.0, ; CM-NEXT: MOV * T35.W, 0.0, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00) -; CM-NEXT: LSHR T23.X, PV.W, literal.x, +; CM-NEXT: LSHR T22.X, PV.W, literal.x, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43) ; CM-NEXT: LSHR T36.X, PV.W, literal.x, @@ -7956,85 +7952,85 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 
offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[2:3], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[14:15], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[12:13], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[6:7], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[10:11], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[4:5], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[8:9], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v23, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[10:11], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[6:7], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[8:9], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: 
v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[4:5], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[14:15], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[12:13], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[0:1], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v4, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v24, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v0, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 @@ -8064,17 +8060,17 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 @@ -8097,139 +8093,139 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[8:9], 48 -; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v11 +; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[10:11], 48 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v10 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x50 -; GCN-HSA-NEXT: v_bfe_i32 v18, v9, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v16, v10, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; GCN-HSA-NEXT: s_add_u32 s10, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 
48 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, v7 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[6:7], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[8:9], 48 +; GCN-HSA-NEXT: v_bfe_i32 v0, v9, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, v11 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[10:11], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_bfe_i32 v7, v13, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[12:13], 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 -; GCN-HSA-NEXT: v_bfe_i32 v7, v3, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[14:15], 48 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[12:13], 48 +; GCN-HSA-NEXT: v_bfe_i32 v0, v13, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] -; GCN-HSA-NEXT: v_bfe_i32 v19, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v23, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v21, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v25, v25, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[14:15], 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_bfe_i32 v19, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v21, v25, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v25, v5, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GCN-HSA-NEXT: v_bfe_i32 v23, v6, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[23:26] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_bfe_i32 v15, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v17, v18, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[19:22] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v17, v18, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v3, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_bfe_i32 v13, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_bfe_i32 v11, v14, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_bfe_i32 v7, v12, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v11, v14, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[7:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v0, v12, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: 
v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: @@ -8249,44 +8245,46 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[0:1] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v3 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[0:1] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[4:5] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v19, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[4:5] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v7 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[8:9] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[8:9] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v11 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[12:13] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[21:22], 48, v[2:3] -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[20:21], 48, v[6:7] -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[19:20], 48, v[10:11] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v16, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[14:15] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[12:13] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v15 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v20, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[2:3] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v21, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[6:7] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[10:11] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[14:15] ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 @@ -8294,8 +8292,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 @@ -8312,7 +8308,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v8, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v10, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v21, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v20, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v6, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 3641bd4ef865dc..2961d7e9065e53 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -364,22 +364,22 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s3, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: ds_read_b32 v2, v1 +; GFX7-NEXT: ds_read_b32 v1, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 -; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB2_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 
v3, v2, v1 +; GFX7-NEXT: v_add_f32_e32 v3, v1, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB2_6 ; GFX7-NEXT: .LBB2_7: ; %Flow14 @@ -460,22 +460,22 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX8-NEXT: ; %bb.5: ; GFX8-NEXT: s_lshl_b32 s3, s3, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: ds_read_b32 v2, v1 +; GFX8-NEXT: ds_read_b32 v1, v1 ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX8-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB2_6: ; %atomicrmw.start2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v4, v1, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB2_6 ; GFX8-NEXT: .LBB2_7: ; %Flow16 @@ -730,21 +730,21 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s3, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: ds_read_b32 v2, v1 +; GFX7-NEXT: ds_read_b32 v1, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 -; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB3_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX7-NEXT: v_add_f32_e32 v3, v1, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, s3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB3_6 ; GFX7-NEXT: .LBB3_7: ; %Flow14 @@ -823,21 +823,21 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: ; %bb.5: ; GFX8-NEXT: s_lshl_b32 s3, s3, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: ds_read_b32 v2, v1 +; GFX8-NEXT: ds_read_b32 v1, v1 ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX8-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB3_6: ; %atomicrmw.start2 ; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v4, v1, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB3_6 ; GFX8-NEXT: .LBB3_7: ; %Flow16 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index e2683bba37f4bc..823e6bd5556968 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -770,15 +770,15 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[6:7], 0x10 ; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] +; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: s_add_u32 s42, s36, 40 +; CHECK-NEXT: s_add_u32 s42, s34, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] -; CHECK-NEXT: s_addc_u32 s43, s37, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_addc_u32 s43, s35, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_mov_b32 s40, s13 @@ -794,7 +794,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s41 ; CHECK-NEXT: s_mov_b32 s13, s40 ; CHECK-NEXT: s_mov_b32 s14, s33 @@ -807,7 +807,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s41 ; CHECK-NEXT: s_mov_b32 s13, s40 ; CHECK-NEXT: s_mov_b32 s14, s33 @@ -824,7 +824,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s41 ; CHECK-NEXT: s_mov_b32 s13, s40 ; CHECK-NEXT: global_load_dword v0, v0, s[46:47] @@ -916,10 +916,10 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 -; CHECK-NEXT: s_add_u32 s8, s36, 40 -; CHECK-NEXT: s_addc_u32 s9, s37, 0 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], 
s[34:35] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s41 ; CHECK-NEXT: s_mov_b32 s13, s40 ; CHECK-NEXT: s_mov_b32 s14, s33 @@ -949,10 +949,10 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_add_u32 s8, s36, 40 -; CHECK-NEXT: s_addc_u32 s9, s37, 0 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s41 ; CHECK-NEXT: s_mov_b32 s13, s40 ; CHECK-NEXT: s_mov_b32 s14, s33 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 5b094e647fde82..c195d0c5714433 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -29,8 +29,8 @@ bb: } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: -; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] -; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] +; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31] +; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31] ; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31] ; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31] diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index fcf100ee2de145..6ee523da900245 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -28,30 +28,29 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 +; GCN-NEXT: buffer_load_dword v5, v[1:2], s[8:11], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_max_i32_e32 v4, s2, v5 +; GCN-NEXT: v_mov_b32_e32 v3, v4 +; GCN-NEXT: v_mov_b32_e32 v4, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_max_i32_e32 v3, s2, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v4 -; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[8:11], 0 addr64 offset:400 glc +; GCN-NEXT: buffer_atomic_cmpswap v[3:4], v[1:2], s[8:11], 0 addr64 offset:400 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GCN-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: v_mov_b32_e32 v5, v3 ; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN-NEXT: s_cbranch_execnz .LBB0_2 ; GCN-NEXT: ; %bb.3: ; %atomicrmw.end ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: buffer_store_dword v5, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v3, off, s[4:7], 0 ; GCN-NEXT: .LBB0_4: ; %exit 
; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index c877740c1baa9f..73ceecf2c04b17 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -730,7 +730,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX9_W64-NEXT: ;;#ASMSTART ; GFX9_W64-NEXT: s_mov_b32 s4, 17 ; GFX9_W64-NEXT: ;;#ASMEND -; GFX9_W64-NEXT: v_mov_b32_e32 v8, s4 +; GFX9_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 @@ -742,9 +742,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX9_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX9_W64-NEXT: s_nop 0 -; GFX9_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9_W64-NEXT: ; implicit-def: $vgpr8 +; GFX9_W64-NEXT: ; implicit-def: $vgpr9 ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_1 ; GFX9_W64-NEXT: ; %bb.2: @@ -766,7 +766,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 -; GFX9_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX9_W64-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0 ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -776,7 +776,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) -; GFX9_W64-NEXT: global_store_dword v[11:12], v9, off +; GFX9_W64-NEXT: global_store_dword v[11:12], v8, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] ; @@ -786,7 +786,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W32-NEXT: ;;#ASMSTART ; GFX1010_W32-NEXT: s_mov_b32 s4, 17 ; GFX1010_W32-NEXT: ;;#ASMEND -; GFX1010_W32-NEXT: v_mov_b32_e32 v8, s4 +; GFX1010_W32-NEXT: v_mov_b32_e32 v9, s4 ; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo ; GFX1010_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 @@ -797,9 +797,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[2:3] ; GFX1010_W32-NEXT: s_and_b32 s5, vcc_lo, s5 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, s5 -; GFX1010_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W32-NEXT: ; implicit-def: $vgpr9 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s5 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_1 @@ -821,7 +821,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; 
GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[6:7] ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 -; GFX1010_W32-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W32-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 @@ -832,7 +832,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) -; GFX1010_W32-NEXT: global_store_dword v[11:12], v9, off +; GFX1010_W32-NEXT: global_store_dword v[11:12], v8, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] ; @@ -842,7 +842,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W64-NEXT: ;;#ASMSTART ; GFX1010_W64-NEXT: s_mov_b32 s4, 17 ; GFX1010_W64-NEXT: ;;#ASMEND -; GFX1010_W64-NEXT: v_mov_b32_e32 v8, s4 +; GFX1010_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX1010_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 @@ -853,9 +853,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3] ; GFX1010_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] -; GFX1010_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W64-NEXT: ; implicit-def: $vgpr9 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_1 @@ -877,7 +877,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7] ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX1010_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W64-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 @@ -888,7 +888,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) -; GFX1010_W64-NEXT: global_store_dword v[11:12], v9, off +; GFX1010_W64-NEXT: global_store_dword v[11:12], v8, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] ; @@ -898,7 +898,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: ;;#ASMSTART ; GFX1100_W32-NEXT: s_mov_b32 s4, 17 ; GFX1100_W32-NEXT: ;;#ASMEND -; GFX1100_W32-NEXT: v_mov_b32_e32 v8, s4 +; GFX1100_W32-NEXT: v_mov_b32_e32 v9, s4 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s8, v0 @@ -911,9 +911,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W32-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 +; GFX1100_W32-NEXT: ; implicit-def: $vgpr9 ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_1 ; GFX1100_W32-NEXT: ; %bb.2: @@ -937,7 +937,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W32-NEXT: buffer_load_format_x v8, v0, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0 ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -948,7 +948,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) -; GFX1100_W32-NEXT: global_store_b32 v[11:12], v9, off dlc +; GFX1100_W32-NEXT: global_store_b32 v[11:12], v8, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] ; @@ -958,7 +958,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: ;;#ASMSTART ; GFX1100_W64-NEXT: s_mov_b32 s4, 17 ; GFX1100_W64-NEXT: ;;#ASMEND -; GFX1100_W64-NEXT: v_mov_b32_e32 v8, s4 +; GFX1100_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s8, v0 @@ -971,9 +971,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 +; GFX1100_W64-NEXT: ; implicit-def: $vgpr9 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_1 ; GFX1100_W64-NEXT: ; %bb.2: @@ -997,7 +997,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W64-NEXT: buffer_load_format_x v8, v0, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -1008,7 +1008,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) 
-; GFX1100_W64-NEXT: global_store_b32 v[11:12], v9, off dlc +; GFX1100_W64-NEXT: global_store_b32 v[11:12], v8, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index ac46f8ce20d60b..25d8a28ed67352 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -769,7 +769,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX9_W64-NEXT: ;;#ASMSTART ; GFX9_W64-NEXT: s_mov_b32 s4, 17 ; GFX9_W64-NEXT: ;;#ASMEND -; GFX9_W64-NEXT: v_mov_b32_e32 v8, s4 +; GFX9_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 @@ -781,9 +781,9 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX9_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX9_W64-NEXT: s_nop 0 -; GFX9_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9_W64-NEXT: ; implicit-def: $vgpr8 +; GFX9_W64-NEXT: ; implicit-def: $vgpr9 ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_1 ; GFX9_W64-NEXT: ; %bb.2: @@ -805,7 +805,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 -; GFX9_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX9_W64-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0 ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -815,7 +815,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) -; GFX9_W64-NEXT: global_store_dword v[11:12], v9, off +; GFX9_W64-NEXT: global_store_dword v[11:12], v8, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] ; @@ -825,7 +825,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W32-NEXT: ;;#ASMSTART ; GFX1010_W32-NEXT: s_mov_b32 s4, 17 ; GFX1010_W32-NEXT: ;;#ASMEND -; GFX1010_W32-NEXT: v_mov_b32_e32 v8, s4 +; GFX1010_W32-NEXT: v_mov_b32_e32 v9, s4 ; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo ; GFX1010_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 @@ -836,9 +836,9 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[2:3] ; GFX1010_W32-NEXT: s_and_b32 s5, vcc_lo, s5 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, s5 -; GFX1010_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W32-NEXT: ; implicit-def: $vgpr9 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s5 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_1 @@ -860,7 +860,7 @@ define void 
@mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[6:7] ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 -; GFX1010_W32-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W32-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 @@ -871,7 +871,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) -; GFX1010_W32-NEXT: global_store_dword v[11:12], v9, off +; GFX1010_W32-NEXT: global_store_dword v[11:12], v8, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] ; @@ -881,7 +881,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W64-NEXT: ;;#ASMSTART ; GFX1010_W64-NEXT: s_mov_b32 s4, 17 ; GFX1010_W64-NEXT: ;;#ASMEND -; GFX1010_W64-NEXT: v_mov_b32_e32 v8, s4 +; GFX1010_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX1010_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 @@ -892,9 +892,9 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3] ; GFX1010_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] -; GFX1010_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W64-NEXT: ; implicit-def: $vgpr9 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_1 @@ -916,7 +916,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7] ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX1010_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W64-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 @@ -927,7 +927,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) -; GFX1010_W64-NEXT: global_store_dword v[11:12], v9, off +; GFX1010_W64-NEXT: global_store_dword v[11:12], v8, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] ; @@ -937,7 +937,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: ;;#ASMSTART ; GFX1100_W32-NEXT: s_mov_b32 s4, 17 ; GFX1100_W32-NEXT: ;;#ASMEND -; GFX1100_W32-NEXT: v_mov_b32_e32 v8, s4 +; GFX1100_W32-NEXT: v_mov_b32_e32 v9, s4 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s8, v0 @@ -950,9 +950,9 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr 
addrspace(8) %j, ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W32-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 +; GFX1100_W32-NEXT: ; implicit-def: $vgpr9 ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_1 ; GFX1100_W32-NEXT: ; %bb.2: @@ -976,7 +976,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W32-NEXT: buffer_load_format_x v8, v0, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0 ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -987,7 +987,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) -; GFX1100_W32-NEXT: global_store_b32 v[11:12], v9, off dlc +; GFX1100_W32-NEXT: global_store_b32 v[11:12], v8, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] ; @@ -997,7 +997,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: ;;#ASMSTART ; GFX1100_W64-NEXT: s_mov_b32 s4, 17 ; GFX1100_W64-NEXT: ;;#ASMEND -; GFX1100_W64-NEXT: v_mov_b32_e32 v8, s4 +; GFX1100_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s8, v0 @@ -1010,9 +1010,9 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 +; GFX1100_W64-NEXT: ; implicit-def: $vgpr9 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_1 ; GFX1100_W64-NEXT: ; %bb.2: @@ -1036,7 +1036,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W64-NEXT: buffer_load_format_x v8, v0, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -1047,7 +1047,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_or_b64 exec, 
exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) -; GFX1100_W64-NEXT: global_store_b32 v[11:12], v9, off dlc +; GFX1100_W64-NEXT: global_store_b32 v[11:12], v8, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index b4e9376d827773..21ed18cd95eaef 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -1787,32 +1787,32 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; EG-NEXT: Fetch clause starting at 12: ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X, -; EG-NEXT: MOV * T1.W, literal.x, +; EG-NEXT: OR_INT T1.W, KC0[2].W, KC0[3].X, +; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) -; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0, +; EG-NEXT: SETNE_INT * T1.W, PV.W, 0.0, ; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0, ; EG-NEXT: ALU clause starting at 19: -; EG-NEXT: MOV T0.W, KC0[2].W, -; EG-NEXT: MOV * T1.W, KC0[3].Z, +; EG-NEXT: MOV T1.W, KC0[2].W, +; EG-NEXT: MOV * T0.W, KC0[3].Z, ; EG-NEXT: MOV T2.W, KC0[3].Y, ; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, -; EG-NEXT: MOV T1.W, KC0[3].X, -; EG-NEXT: MULHI * T0.Y, T0.W, PV.W, +; EG-NEXT: MOV T0.W, KC0[3].X, +; EG-NEXT: MULHI * T0.Y, T1.W, PV.W, ; EG-NEXT: ADD_INT T3.W, PS, T0.X, ; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W, ; EG-NEXT: ADD_INT T0.Y, PV.W, PS, -; EG-NEXT: MOV T1.W, literal.x, -; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: MULLO_INT * T0.X, T1.W, T2.W, ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 31: -; EG-NEXT: MOV T0.W, KC0[2].Y, -; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, +; EG-NEXT: MOV T1.W, KC0[2].Y, +; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0, ; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, ; EG-NEXT: ALU clause starting at 34: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 35: -; EG-NEXT: LSHR * T1.X, T0.W, literal.x, +; EG-NEXT: LSHR * T1.X, T1.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp eq i64 %a, 0 @@ -1923,52 +1923,52 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s12, s11 -; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 -; GFX9-NEXT: s_mul_i32 s2, s14, s9 -; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 +; GFX9-NEXT: s_mul_i32 s0, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s6 +; GFX9-NEXT: s_mul_i32 s2, s10, s5 +; GFX9-NEXT: s_mul_hi_u32 s3, s10, s4 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s13, s10 +; GFX9-NEXT: s_mul_i32 s1, s9, s6 ; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: s_mul_i32 s3, s15, s8 +; GFX9-NEXT: s_mul_i32 s3, s11, s4 ; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s12, s10 +; GFX9-NEXT: s_mul_i32 s1, s8, s6 ; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_i32 
s3, s14, s8 +; GFX9-NEXT: s_mul_i32 s3, s10, s4 ; GFX9-NEXT: s_add_u32 s3, s3, s1 ; GFX9-NEXT: s_addc_u32 s2, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s9, s12 -; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 -; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s1, s8, s13 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 -; GFX9-NEXT: s_add_u32 s1, s1, s14 -; GFX9-NEXT: s_addc_u32 s10, s10, 0 -; GFX9-NEXT: s_add_u32 s10, s11, s10 -; GFX9-NEXT: s_addc_u32 s11, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 -; GFX9-NEXT: s_mul_i32 s9, s9, s13 -; GFX9-NEXT: s_add_u32 s9, s9, s10 -; GFX9-NEXT: s_addc_u32 s10, s14, s11 +; GFX9-NEXT: s_mul_i32 s10, s5, s8 +; GFX9-NEXT: s_mul_hi_u32 s11, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s8 +; GFX9-NEXT: s_add_u32 s10, s10, s11 +; GFX9-NEXT: s_mul_i32 s1, s4, s9 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s9 +; GFX9-NEXT: s_add_u32 s1, s1, s10 +; GFX9-NEXT: s_addc_u32 s6, s6, 0 +; GFX9-NEXT: s_add_u32 s6, s7, s6 +; GFX9-NEXT: s_addc_u32 s7, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s10, s5, s9 +; GFX9-NEXT: s_mul_i32 s5, s5, s9 +; GFX9-NEXT: s_add_u32 s5, s5, s6 +; GFX9-NEXT: s_addc_u32 s6, s10, s7 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 s9, s9, s3 -; GFX9-NEXT: s_addc_u32 s10, s10, s2 -; GFX9-NEXT: s_mul_i32 s2, s8, s12 +; GFX9-NEXT: s_add_u32 s5, s5, s3 +; GFX9-NEXT: s_addc_u32 s6, s6, s2 +; GFX9-NEXT: s_mul_i32 s2, s4, s8 ; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index b21285e83dc21d..cf2cb3b867c7e3 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -4,8 +4,8 @@ # is killed by that store. 
# GCN-LABEL: name: global_sextload_v32i32_to_v32i64 -# GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) -# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 +# GCN: renamable $vgpr31_vgpr32_vgpr33_vgpr34 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) +# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr24, killed renamable $vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr25 --- name: global_sextload_v32i32_to_v32i64 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 8f74132271ba68..a9f5bb441d622c 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -13,16 +13,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_mov_b32 s16, s33 ; GFX906-NEXT: s_mov_b32 s33, s32 ; GFX906-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GFX906-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, -1 ; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[18:19] -; GFX906-NEXT: s_mov_b32 s21, s15 ; GFX906-NEXT: ; implicit-def: $vgpr2 -; GFX906-NEXT: s_mov_b32 s22, s14 +; GFX906-NEXT: s_mov_b32 s21, s15 ; GFX906-NEXT: v_writelane_b32 v2, s21, 0 +; GFX906-NEXT: s_mov_b32 s22, s14 ; GFX906-NEXT: v_writelane_b32 v2, s22, 1 ; GFX906-NEXT: s_mov_b32 s23, s13 ; GFX906-NEXT: v_writelane_b32 v2, s23, 2 @@ -35,55 +34,53 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: v_writelane_b32 v2, s9, 7 ; GFX906-NEXT: v_writelane_b32 v2, s6, 8 ; GFX906-NEXT: v_writelane_b32 v41, s16, 2 +; GFX906-NEXT: v_mov_b32_e32 v35, v31 ; GFX906-NEXT: v_writelane_b32 v2, s7, 9 ; GFX906-NEXT: v_writelane_b32 v41, s30, 0 +; GFX906-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: v_writelane_b32 v2, s4, 10 -; GFX906-NEXT: s_addk_i32 s32, 0x2800 -; GFX906-NEXT: v_writelane_b32 v41, s31, 1 -; GFX906-NEXT: v_mov_b32_e32 v32, v31 -; GFX906-NEXT: v_writelane_b32 v2, s5, 11 -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: v_mov_b32_e32 v33, v2 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def v[0:31] +; GFX906-NEXT: ; def v[3:34] ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: s_addk_i32 s32, 0x2800 +; GFX906-NEXT: v_writelane_b32 v41, s31, 1 +; GFX906-NEXT: v_writelane_b32 v2, s5, 11 +; 
GFX906-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v11, off, s[0:3], s33 
offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def v40 ; GFX906-NEXT: ;;#ASMEND @@ -91,7 +88,7 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: ; def s11 ; GFX906-NEXT: ;;#ASMEND ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: v_mov_b32_e32 v40, v33 +; GFX906-NEXT: v_mov_b32_e32 v40, v2 ; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: v_writelane_b32 v40, s11, 12 ; GFX906-NEXT: ;;#ASMSTART @@ -137,20 +134,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_writelane_b32 v40, s10, 22 ; GFX906-NEXT: v_writelane_b32 v40, s11, 23 -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: v_readlane_b32 s16, v40, 22 ; GFX906-NEXT: s_mov_b32 s12, s24 ; GFX906-NEXT: s_mov_b32 s13, s23 ; GFX906-NEXT: s_mov_b32 s14, s22 -; GFX906-NEXT: v_mov_b32_e32 v31, v32 +; GFX906-NEXT: v_mov_b32_e32 v31, v35 ; GFX906-NEXT: s_mov_b32 s15, s21 ; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX906-NEXT: v_readlane_b32 s17, v40, 23 -; GFX906-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: v_readlane_b32 s11, v40, 12 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s11 @@ -242,11 +234,7 @@ define 
void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: v_readlane_b32 s9, v40, 7 ; GFX906-NEXT: v_readlane_b32 s11, v40, 5 ; GFX906-NEXT: v_readlane_b32 s17, v40, 23 -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: v_readlane_b32 s21, v40, 24 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s21 @@ -357,11 +345,10 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, -1 ; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, s[6:7] ; GFX906-NEXT: s_addk_i32 s32, 0xd800 ; GFX906-NEXT: s_mov_b32 s33, s4 @@ -374,34 +361,33 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_mov_b32 s16, s33 ; GFX908-NEXT: s_mov_b32 s33, s32 ; GFX908-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GFX908-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, -1 -; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[18:19] ; GFX908-NEXT: v_mov_b32_e32 v3, s16 -; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill ; GFX908-NEXT: s_addk_i32 s32, 0x2c00 +; GFX908-NEXT: ; implicit-def: $vgpr2 ; GFX908-NEXT: s_mov_b64 s[16:17], exec ; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164 -; GFX908-NEXT: v_writelane_b32 v2, s30, 0 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164 +; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 +; GFX908-NEXT: v_writelane_b32 v3, s30, 0 +; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:160 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[16:17] ; GFX908-NEXT: s_mov_b64 s[16:17], exec ; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164 -; GFX908-NEXT: v_writelane_b32 v2, s31, 0 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164 +; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 +; GFX908-NEXT: v_writelane_b32 v3, s31, 0 +; 
GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:160 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[16:17] ; GFX908-NEXT: s_mov_b32 s21, s15 -; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_mov_b32 s22, s14 ; GFX908-NEXT: v_writelane_b32 v2, s21, 0 +; GFX908-NEXT: s_mov_b32 s22, s14 ; GFX908-NEXT: v_writelane_b32 v2, s22, 1 ; GFX908-NEXT: s_mov_b32 s23, s13 ; GFX908-NEXT: v_writelane_b32 v2, s23, 2 @@ -413,52 +399,50 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: v_writelane_b32 v2, s8, 6 ; GFX908-NEXT: v_writelane_b32 v2, s9, 7 ; GFX908-NEXT: v_writelane_b32 v2, s6, 8 +; GFX908-NEXT: v_mov_b32_e32 v35, v31 ; GFX908-NEXT: v_writelane_b32 v2, s7, 9 +; GFX908-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX908-NEXT: v_writelane_b32 v2, s4, 10 -; GFX908-NEXT: v_mov_b32_e32 v32, v31 -; GFX908-NEXT: v_writelane_b32 v2, s5, 11 -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: v_mov_b32_e32 v33, v2 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def v[0:31] +; GFX908-NEXT: ; def v[3:34] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX908-NEXT: v_writelane_b32 v2, s5, 11 +; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v21, 
off, s[0:3], s33 offset:104 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GFX908-NEXT: 
buffer_store_dword v33, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def v40 ; GFX908-NEXT: ;;#ASMEND @@ -466,7 +450,7 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: ; def s11 ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: v_mov_b32_e32 v40, v33 +; GFX908-NEXT: v_mov_b32_e32 v40, v2 ; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: v_writelane_b32 v40, s11, 12 ; GFX908-NEXT: ;;#ASMSTART @@ -512,20 +496,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_writelane_b32 v40, s10, 22 ; GFX908-NEXT: v_writelane_b32 v40, s11, 23 -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: v_readlane_b32 s16, v40, 22 ; GFX908-NEXT: s_mov_b32 s12, s24 ; GFX908-NEXT: s_mov_b32 s13, s23 ; GFX908-NEXT: s_mov_b32 s14, s22 -; GFX908-NEXT: v_mov_b32_e32 v31, v32 +; GFX908-NEXT: v_mov_b32_e32 v31, v35 ; GFX908-NEXT: s_mov_b32 s15, s21 ; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX908-NEXT: v_readlane_b32 s17, v40, 23 -; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: v_readlane_b32 s11, v40, 12 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s11 @@ -617,11 +596,7 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: v_readlane_b32 s9, v40, 7 ; GFX908-NEXT: v_readlane_b32 s11, v40, 5 ; GFX908-NEXT: v_readlane_b32 s17, v40, 23 -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: v_readlane_b32 s21, v40, 24 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s21 @@ -729,31 +704,30 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164 +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:160 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readlane_b32 s31, v0, 0 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[4:5] ; GFX908-NEXT: s_mov_b64 s[4:5], exec ; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164 +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:160 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readlane_b32 s30, v0, 0 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[4:5] -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v0, off, 
s[0:3], s33 offset:156 ; 4-byte Folded Reload ; GFX908-NEXT: ; kill: killed $vgpr40 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s4, v0 ; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 exec, -1 -; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_addk_i32 s32, 0xd400 ; GFX908-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index a462c19ce645d4..d31b834e9b4467 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -447,87 +447,87 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v3, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x5000 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, 0 ; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v5 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v5 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v3 ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v5 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v5 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v3 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v3 ; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v5 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v6, vcc +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v3 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v5 -; GFX8-NEXT: 
v_addc_u32_e32 v20, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v5 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v3 +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v3 ; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v6, vcc +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[21:22], v[21:22] -; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v5 -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v3 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[23:24] -; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v5 -; GFX8-NEXT: v_addc_u32_e32 v26, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v3 +; GFX8-NEXT: v_addc_u32_e32 v26, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[25:26], v[25:26] -; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[5:6] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x10000, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[3:4] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x10000, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX8-NEXT: s_addk_i32 s1, 0x2000 ; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff ; GFX8-NEXT: s_waitcnt vmcnt(10) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(9) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v9, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v9, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v10, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(8) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v11, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v13, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v13, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v16, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v15, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v18, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v17, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v18, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v19, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v20, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v19, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v20, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v21, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v22, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v21, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v22, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v23, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v24, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v23, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v24, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v25, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v26, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v25, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 
v26, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v27, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v28, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v27, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v28, v6, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -541,7 +541,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[5:6] ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -575,77 +575,77 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX900-NEXT: s_movk_i32 s0, 0x5000 ; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX900-NEXT: s_movk_i32 s2, 0x7f -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: s_movk_i32 s0, 0xd000 ; GFX900-NEXT: s_movk_i32 s1, 0xe000 ; GFX900-NEXT: s_movk_i32 s3, 0xf000 ; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX900-NEXT: ; =>This Loop Header: Depth=1 ; GFX900-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: s_mov_b32 s4, 0 ; GFX900-NEXT: .LBB1_2: ; %for.body ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v6, vcc -; GFX900-NEXT: global_load_dwordx2 v[9:10], v[5:6], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[11:12], v[5:6], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v5 +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v4, vcc +; GFX900-NEXT: global_load_dwordx2 v[9:10], v[3:4], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[11:12], v[3:4], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v3 ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v6, vcc +; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v4, vcc ; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v4, vcc ; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, s1, v5 +; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, s1, v3 ; GFX900-NEXT: global_load_dwordx2 v[13:14], v[13:14], off -; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v6, vcc +; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v4, vcc ; GFX900-NEXT: global_load_dwordx2 v[23:24], v[19:20], off offset:-4096 ; GFX900-NEXT: global_load_dwordx2 v[25:26], v[19:20], off offset:-2048 ; GFX900-NEXT: global_load_dwordx2 v[27:28], v[19:20], off -; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v3 +; GFX900-NEXT: 
v_addc_co_u32_e32 v22, vcc, -1, v4, vcc ; GFX900-NEXT: global_load_dwordx2 v[19:20], v[21:22], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[29:30], v[5:6], off -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, 0x10000, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX900-NEXT: global_load_dwordx2 v[29:30], v[3:4], off +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, 0x10000, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX900-NEXT: s_addk_i32 s4, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s4, 0x3fffff ; GFX900-NEXT: s_waitcnt vmcnt(8) -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc ; GFX900-NEXT: s_waitcnt vmcnt(7) -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v17, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v18, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v17, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v18, v6, vcc ; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v4, vcc -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v15, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v16, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v13, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v14, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v15, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v16, v6, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v23, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v24, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v23, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v24, v6, vcc ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v25, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v26, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v25, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v26, v6, vcc ; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v27, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v28, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v27, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v28, v6, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v19, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v20, v4, vcc -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v19, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v20, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v9, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v12, v6, vcc ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v29, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v30, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v29, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v30, v6, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -659,7 +659,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_mov_b32_e32 v1, s35 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v[0:1], v[3:4], off +; 
GFX900-NEXT: global_store_dwordx2 v[0:1], v[5:6], off ; GFX900-NEXT: s_endpgm ; ; GFX10-LABEL: clmem_read: @@ -685,8 +685,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 17, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_movk_i32 s1, 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] ; GFX10-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 @@ -698,26 +698,26 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: .LBB1_2: ; %for.body ; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v5, 0xffffb800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v5, 0xffffc800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v5, 0xffffd800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, -1, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v17, vcc_lo, v5, 0xffffe800 +; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v3, 0xffffb800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v3, 0xffffc800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v3, 0xffffd800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v17, vcc_lo, v3, 0xffffe800 ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[11:12], v[7:8], off offset:-2048 ; GFX10-NEXT: global_load_dwordx2 v[15:16], v[9:10], off offset:-2048 ; GFX10-NEXT: global_load_dwordx2 v[19:20], v[13:14], off offset:-2048 -; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v6, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: global_load_dwordx2 v[23:24], v[17:18], off offset:-2048 ; GFX10-NEXT: global_load_dwordx2 v[7:8], v[7:8], off @@ -725,42 +725,42 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: global_load_dwordx2 v[13:14], v[13:14], off ; GFX10-NEXT: global_load_dwordx2 v[25:26], v[17:18], off ; GFX10-NEXT: global_load_dwordx2 v[27:28], v[21:22], off -; GFX10-NEXT: global_load_dwordx2 v[29:30], v[5:6], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[31:32], v[5:6], off -; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, 0x10000, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[29:30], v[3:4], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[31:32], v[3:4], off +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x10000, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo ; GFX10-NEXT: s_addk_i32 s2, 0x2000 ; GFX10-NEXT: s_cmp_gt_u32 s2, 0x3fffff ; GFX10-NEXT: s_waitcnt vmcnt(10) -; 
GFX10-NEXT: v_add_co_u32 v3, s0, v11, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v12, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v11, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v12, v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_add_co_u32 v3, s0, v7, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v8, v4, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, v15, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v16, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v7, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v8, v6, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v15, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v16, v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_add_co_u32 v3, s0, v9, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v10, v4, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, v19, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v20, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v9, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v10, v6, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v19, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v20, v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_add_co_u32 v3, s0, v13, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v14, v4, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, v23, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v24, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v13, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v14, v6, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v23, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v24, v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_add_co_u32 v3, s0, v25, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v26, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v25, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v26, v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_co_u32 v3, s0, v27, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v28, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v27, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v28, v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_add_co_u32 v3, s0, v29, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v30, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, s0, v29, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v30, v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v31, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v32, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v31, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v32, v6, vcc_lo ; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX10-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -773,7 +773,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: .LBB1_5: ; %while.end ; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[0:1], v[3:4], off +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[5:6], off ; GFX10-NEXT: s_endpgm ; ; GFX90A-LABEL: clmem_read: @@ -809,73 +809,73 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: s_movk_i32 s2, 0x7f -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], 0, 0 ; GFX90A-NEXT: s_movk_i32 s0, 0xd000 ; GFX90A-NEXT: s_movk_i32 s1, 0xe000 ; GFX90A-NEXT: s_movk_i32 s3, 0xf000 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] 
op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_mov_b32 s4, 0 ; GFX90A-NEXT: .LBB1_2: ; %for.body ; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off -; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v5, vcc +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[4:5], off offset:-2048 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v4 +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v5, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v7, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v5, vcc ; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v20, vcc, s1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v20, vcc, s1, v4 ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[14:15], off -; GFX90A-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v5, vcc ; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[20:21], off offset:-4096 ; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[20:21], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[20:21], off -; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s3, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s3, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc ; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[22:23], off offset:-2048 -; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[4:5], off +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x10000, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX90A-NEXT: s_addk_i32 s4, 0x2000 ; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff ; GFX90A-NEXT: s_waitcnt vmcnt(8) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v7, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v18, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v19, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v19, v6, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(5) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v14, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v15, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v16, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v17, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v17, v6, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(4) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v24, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v25, v4, vcc +; 
GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v25, v6, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(3) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v26, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v27, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v27, v6, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(2) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v28, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v29, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v29, v6, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v20, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v21, v4, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v21, v6, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v10, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v6, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v6, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v30, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v31, v7, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -889,7 +889,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_mov_b32_e32 v1, s35 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[6:7], off ; GFX90A-NEXT: s_endpgm ; ; GFX11-LABEL: clmem_read: @@ -904,8 +904,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xff, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 17, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 17, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_movk_i32 s1, 0x7f ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] @@ -922,84 +922,84 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: .LBB1_2: ; %for.body ; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v5, 0xffffc000 -; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, 0xffffc000, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, 0xffffd000, v5 +; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v3, 0xffffc000 +; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, 0xffffc000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v11, 
vcc_lo, 0xffffd000, v3 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[13:14], v[7:8], off offset:-4096 ; GFX11-NEXT: global_load_b64 v[9:10], v[9:10], off offset:-2048 -; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, v5, 0xffffe000 -; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, -1, v6, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, -1, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, v3, 0xffffe000 +; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, -1, v4, vcc_lo ; GFX11-NEXT: global_load_b64 v[11:12], v[11:12], off offset:-2048 -; GFX11-NEXT: v_add_co_u32 v17, vcc_lo, 0xffffe000, v5 +; GFX11-NEXT: v_add_co_u32 v17, vcc_lo, 0xffffe000, v3 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[19:20], v[15:16], off offset:-4096 ; GFX11-NEXT: global_load_b64 v[7:8], v[7:8], off -; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v6, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v4, vcc_lo ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: global_load_b64 v[17:18], v[17:18], off offset:-2048 ; GFX11-NEXT: global_load_b64 v[15:16], v[15:16], off ; GFX11-NEXT: global_load_b64 v[21:22], v[21:22], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[23:24], v[5:6], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[25:26], v[5:6], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[27:28], v[5:6], off -; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0x10000, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX11-NEXT: global_load_b64 v[23:24], v[3:4], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[25:26], v[3:4], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[27:28], v[3:4], off +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x10000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo ; GFX11-NEXT: s_addk_i32 s2, 0x2000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_cmp_gt_u32 s2, 0x3fffff ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_add_co_u32 v3, s0, v13, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v14, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v13, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v14, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v9, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v10, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v9, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v10, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v7, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v8, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v7, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v8, v6, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v11, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v12, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v11, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v12, v6, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v19, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v20, v4, s0 +; GFX11-NEXT: 
v_add_co_u32 v5, s0, v19, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v20, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v17, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v18, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v17, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v18, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v15, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v16, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v15, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v16, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v21, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v22, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v21, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v22, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v23, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v24, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v23, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v24, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v25, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v26, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v25, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v26, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v27, v3 -; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v28, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, v27, v5 +; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v28, v6, vcc_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX11-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -1013,7 +1013,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 -; GFX11-NEXT: global_store_b64 v[0:1], v[3:4], off +; GFX11-NEXT: global_store_b64 v[0:1], v[5:6], off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index 415ed89668abbd..2880c09afdb228 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -91,20 +91,20 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; SI-IEEE-SAFE-LABEL: rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; 
SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -129,24 +129,24 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -169,7 +169,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: rsq_f32: ; GCN-UNSAFE: ; %bb.0: @@ -414,27 +414,27 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-DAZ-SAFE-LABEL: rsqrt_fmul: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, 0 ; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) ; 
GCN-DAZ-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[4:5], s[0:1] ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2 -; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v2 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v2 -; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5] ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v2, v5 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v5, v7, 0.5 @@ -447,7 +447,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v4 +; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v5, v3 ; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 ; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -460,7 +460,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GCN-DAZ-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; GCN-DAZ-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4 -; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-DAZ-SAFE-NEXT: s_endpgm ; ; GCN-IEEE-SAFE-LABEL: rsqrt_fmul: @@ -605,20 +605,20 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; SI-IEEE-SAFE-LABEL: neg_rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -643,24 +643,24 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: neg_rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; 
CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -683,7 +683,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_f32: ; GCN-UNSAFE: ; %bb.0: @@ -786,20 +786,20 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 @@ -824,24 +824,24 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; 
CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 @@ -864,7 +864,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-UNSAFE: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 5f291489848fe6..da6ad1b0765757 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -44,62 +44,48 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v23, s10, 12 ; CHECK-NEXT: v_writelane_b32 v23, s11, 13 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:19] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 14 -; CHECK-NEXT: v_writelane_b32 v23, s5, 15 -; CHECK-NEXT: v_writelane_b32 v23, s6, 16 -; CHECK-NEXT: v_writelane_b32 v23, s7, 17 -; CHECK-NEXT: v_writelane_b32 v23, s8, 18 -; CHECK-NEXT: v_writelane_b32 v23, s9, 19 -; CHECK-NEXT: v_writelane_b32 v23, s10, 20 -; CHECK-NEXT: v_writelane_b32 v23, s11, 21 -; CHECK-NEXT: v_writelane_b32 v23, s12, 22 -; CHECK-NEXT: v_writelane_b32 v23, s13, 23 -; CHECK-NEXT: v_writelane_b32 v23, s14, 24 -; CHECK-NEXT: v_writelane_b32 v23, s15, 25 -; CHECK-NEXT: v_writelane_b32 v23, s16, 26 -; CHECK-NEXT: v_writelane_b32 v23, s17, 27 -; CHECK-NEXT: v_writelane_b32 v23, s18, 28 -; CHECK-NEXT: v_writelane_b32 v23, s19, 29 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[42:43] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:55] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 30 -; CHECK-NEXT: v_writelane_b32 v23, s5, 31 -; CHECK-NEXT: v_writelane_b32 v23, s6, 32 -; CHECK-NEXT: v_writelane_b32 v23, s7, 33 -; CHECK-NEXT: v_writelane_b32 v23, s8, 34 -; CHECK-NEXT: v_writelane_b32 v23, s9, 35 -; CHECK-NEXT: v_writelane_b32 v23, s10, 36 -; CHECK-NEXT: v_writelane_b32 v23, s11, 37 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[16:31] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s16, 14 +; CHECK-NEXT: v_writelane_b32 v23, s17, 15 +; CHECK-NEXT: v_writelane_b32 v23, s18, 16 +; CHECK-NEXT: v_writelane_b32 v23, s19, 17 +; CHECK-NEXT: v_writelane_b32 
v23, s20, 18 +; CHECK-NEXT: v_writelane_b32 v23, s21, 19 +; CHECK-NEXT: v_writelane_b32 v23, s22, 20 +; CHECK-NEXT: v_writelane_b32 v23, s23, 21 +; CHECK-NEXT: v_writelane_b32 v23, s24, 22 +; CHECK-NEXT: v_writelane_b32 v23, s25, 23 +; CHECK-NEXT: v_writelane_b32 v23, s26, 24 +; CHECK-NEXT: v_writelane_b32 v23, s27, 25 +; CHECK-NEXT: v_writelane_b32 v23, s28, 26 +; CHECK-NEXT: v_writelane_b32 v23, s29, 27 +; CHECK-NEXT: v_writelane_b32 v23, s30, 28 +; CHECK-NEXT: v_writelane_b32 v23, s31, 29 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[40:41] +; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[36:39] +; CHECK-NEXT: ; def s[16:19] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s16, 30 +; CHECK-NEXT: v_writelane_b32 v23, s17, 31 +; CHECK-NEXT: v_writelane_b32 v23, s18, 32 +; CHECK-NEXT: v_writelane_b32 v23, s19, 33 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:51] +; CHECK-NEXT: ; def s[16:23] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s16, 34 +; CHECK-NEXT: v_writelane_b32 v23, s17, 35 +; CHECK-NEXT: v_writelane_b32 v23, s18, 36 +; CHECK-NEXT: v_writelane_b32 v23, s19, 37 +; CHECK-NEXT: v_writelane_b32 v23, s20, 38 +; CHECK-NEXT: v_writelane_b32 v23, s21, 39 +; CHECK-NEXT: v_writelane_b32 v23, s22, 40 +; CHECK-NEXT: v_writelane_b32 v23, s23, 41 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: ; def s[4:19] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 38 -; CHECK-NEXT: v_writelane_b32 v23, s1, 39 -; CHECK-NEXT: v_writelane_b32 v23, s2, 40 -; CHECK-NEXT: v_writelane_b32 v23, s3, 41 ; CHECK-NEXT: v_writelane_b32 v23, s4, 42 ; CHECK-NEXT: v_writelane_b32 v23, s5, 43 ; CHECK-NEXT: v_writelane_b32 v23, s6, 44 @@ -112,35 +98,46 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v23, s13, 51 ; CHECK-NEXT: v_writelane_b32 v23, s14, 52 ; CHECK-NEXT: v_writelane_b32 v23, s15, 53 +; CHECK-NEXT: v_writelane_b32 v23, s16, 54 +; CHECK-NEXT: v_writelane_b32 v23, s17, 55 +; CHECK-NEXT: v_writelane_b32 v23, s18, 56 +; CHECK-NEXT: v_writelane_b32 v23, s19, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:3] +; CHECK-NEXT: ; def s[52:55] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 54 -; CHECK-NEXT: v_writelane_b32 v23, s1, 55 -; CHECK-NEXT: v_writelane_b32 v23, s2, 56 -; CHECK-NEXT: v_writelane_b32 v23, s3, 57 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ; def s[36:43] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 58 -; CHECK-NEXT: v_writelane_b32 v23, s1, 59 -; CHECK-NEXT: v_writelane_b32 v23, s2, 60 +; CHECK-NEXT: v_writelane_b32 v23, s36, 58 +; CHECK-NEXT: v_writelane_b32 v23, s37, 59 +; CHECK-NEXT: v_writelane_b32 v23, s38, 60 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v23, s3, 61 -; CHECK-NEXT: v_writelane_b32 v23, s4, 62 -; CHECK-NEXT: v_writelane_b32 v0, s6, 0 -; CHECK-NEXT: v_writelane_b32 v23, s5, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 2 -; CHECK-NEXT: v_writelane_b32 v0, s1, 3 -; CHECK-NEXT: v_writelane_b32 v0, s2, 4 -; CHECK-NEXT: v_writelane_b32 v0, s3, 5 +; CHECK-NEXT: v_writelane_b32 v23, s39, 61 +; CHECK-NEXT: v_writelane_b32 v23, s40, 62 +; CHECK-NEXT: v_writelane_b32 v0, s42, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; 
CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: v_writelane_b32 v23, s41, 63 +; CHECK-NEXT: v_writelane_b32 v0, s43, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[36:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 2 +; CHECK-NEXT: v_writelane_b32 v0, s5, 3 +; CHECK-NEXT: v_writelane_b32 v0, s6, 4 +; CHECK-NEXT: v_writelane_b32 v0, s7, 5 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v0, s4, 6 ; CHECK-NEXT: v_writelane_b32 v0, s5, 7 ; CHECK-NEXT: v_writelane_b32 v0, s6, 8 @@ -149,81 +146,86 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v0, s9, 11 ; CHECK-NEXT: v_writelane_b32 v0, s10, 12 ; CHECK-NEXT: v_writelane_b32 v0, s11, 13 -; CHECK-NEXT: v_writelane_b32 v0, s12, 14 -; CHECK-NEXT: v_writelane_b32 v0, s13, 15 -; CHECK-NEXT: v_writelane_b32 v0, s14, 16 -; CHECK-NEXT: v_writelane_b32 v0, s15, 17 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:1] +; CHECK-NEXT: ; def s[4:19] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 18 -; CHECK-NEXT: v_writelane_b32 v0, s1, 19 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 20 -; CHECK-NEXT: v_writelane_b32 v0, s1, 21 -; CHECK-NEXT: v_writelane_b32 v0, s2, 22 -; CHECK-NEXT: v_writelane_b32 v0, s3, 23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 24 -; CHECK-NEXT: v_writelane_b32 v0, s1, 25 -; CHECK-NEXT: v_writelane_b32 v0, s2, 26 -; CHECK-NEXT: v_writelane_b32 v0, s3, 27 -; CHECK-NEXT: v_writelane_b32 v0, s4, 28 -; CHECK-NEXT: v_writelane_b32 v0, s5, 29 -; CHECK-NEXT: v_writelane_b32 v0, s6, 30 -; CHECK-NEXT: v_writelane_b32 v0, s7, 31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 32 -; CHECK-NEXT: v_writelane_b32 v0, s1, 33 -; CHECK-NEXT: v_writelane_b32 v0, s2, 34 -; CHECK-NEXT: v_writelane_b32 v0, s3, 35 -; CHECK-NEXT: v_writelane_b32 v0, s4, 36 -; CHECK-NEXT: v_writelane_b32 v0, s5, 37 -; CHECK-NEXT: v_writelane_b32 v0, s6, 38 -; CHECK-NEXT: v_writelane_b32 v0, s7, 39 -; CHECK-NEXT: v_writelane_b32 v0, s8, 40 -; CHECK-NEXT: v_writelane_b32 v0, s9, 41 -; CHECK-NEXT: v_writelane_b32 v0, s10, 42 -; CHECK-NEXT: v_writelane_b32 v0, s11, 43 -; CHECK-NEXT: v_writelane_b32 v0, s12, 44 -; CHECK-NEXT: v_writelane_b32 v0, s13, 45 -; CHECK-NEXT: v_writelane_b32 v0, s14, 46 -; CHECK-NEXT: v_writelane_b32 v0, s15, 47 +; CHECK-NEXT: v_writelane_b32 v0, s4, 14 +; CHECK-NEXT: v_writelane_b32 v0, s5, 15 +; CHECK-NEXT: v_writelane_b32 v0, s6, 16 +; CHECK-NEXT: v_writelane_b32 v0, s7, 17 +; CHECK-NEXT: v_writelane_b32 v0, s8, 18 +; CHECK-NEXT: v_writelane_b32 v0, s9, 19 +; CHECK-NEXT: v_writelane_b32 v0, s10, 20 +; CHECK-NEXT: v_writelane_b32 v0, s11, 21 +; CHECK-NEXT: v_writelane_b32 v0, s12, 22 +; CHECK-NEXT: v_writelane_b32 v0, s13, 23 +; CHECK-NEXT: v_writelane_b32 v0, s14, 24 +; CHECK-NEXT: v_writelane_b32 v0, s15, 25 +; CHECK-NEXT: v_writelane_b32 v0, s16, 26 +; CHECK-NEXT: v_writelane_b32 v0, s17, 27 +; CHECK-NEXT: v_writelane_b32 v0, s18, 28 +; CHECK-NEXT: v_writelane_b32 v0, s19, 29 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:5] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 30 +; CHECK-NEXT: 
v_writelane_b32 v0, s5, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[20:23] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[24:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:19] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 32 +; CHECK-NEXT: v_writelane_b32 v0, s5, 33 +; CHECK-NEXT: v_writelane_b32 v0, s6, 34 +; CHECK-NEXT: v_writelane_b32 v0, s7, 35 +; CHECK-NEXT: v_writelane_b32 v0, s8, 36 +; CHECK-NEXT: v_writelane_b32 v0, s9, 37 +; CHECK-NEXT: v_writelane_b32 v0, s10, 38 +; CHECK-NEXT: v_writelane_b32 v0, s11, 39 +; CHECK-NEXT: v_writelane_b32 v0, s12, 40 +; CHECK-NEXT: v_writelane_b32 v0, s13, 41 +; CHECK-NEXT: v_writelane_b32 v0, s14, 42 +; CHECK-NEXT: v_writelane_b32 v0, s15, 43 +; CHECK-NEXT: v_writelane_b32 v0, s16, 44 +; CHECK-NEXT: v_writelane_b32 v0, s17, 45 +; CHECK-NEXT: v_writelane_b32 v0, s18, 46 +; CHECK-NEXT: v_writelane_b32 v0, s19, 47 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ret ; CHECK-NEXT: ; kill: killed $vgpr23 ; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %bb0 +; CHECK-NEXT: v_readlane_b32 s4, v23, 2 +; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: v_readlane_b32 s0, v23, 0 +; CHECK-NEXT: v_readlane_b32 s5, v23, 3 +; CHECK-NEXT: v_readlane_b32 s6, v23, 4 +; CHECK-NEXT: v_readlane_b32 s7, v23, 5 ; CHECK-NEXT: v_readlane_b32 s1, v23, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 2 -; CHECK-NEXT: v_readlane_b32 s1, v23, 3 -; CHECK-NEXT: v_readlane_b32 s2, v23, 4 -; CHECK-NEXT: v_readlane_b32 s3, v23, 5 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ; use s[4:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 6 -; CHECK-NEXT: v_readlane_b32 s1, v23, 7 -; CHECK-NEXT: v_readlane_b32 s2, v23, 8 -; CHECK-NEXT: v_readlane_b32 s3, v23, 9 -; CHECK-NEXT: v_readlane_b32 s4, v23, 10 -; CHECK-NEXT: v_readlane_b32 s5, v23, 11 -; CHECK-NEXT: v_readlane_b32 s6, v23, 12 -; CHECK-NEXT: v_readlane_b32 s7, v23, 13 +; CHECK-NEXT: v_readlane_b32 s4, v23, 6 +; CHECK-NEXT: v_readlane_b32 s5, v23, 7 +; CHECK-NEXT: v_readlane_b32 s6, v23, 8 +; CHECK-NEXT: v_readlane_b32 s7, v23, 9 +; CHECK-NEXT: v_readlane_b32 s8, v23, 10 +; CHECK-NEXT: v_readlane_b32 s9, v23, 11 +; CHECK-NEXT: v_readlane_b32 s10, v23, 12 +; CHECK-NEXT: v_readlane_b32 s11, v23, 13 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ; use s[4:11] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] ; CHECK-NEXT: v_readlane_b32 s0, v23, 14 ; CHECK-NEXT: v_readlane_b32 s1, v23, 15 ; CHECK-NEXT: v_readlane_b32 s2, v23, 16 @@ -247,64 +249,52 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_readlane_b32 s1, v23, 31 ; CHECK-NEXT: v_readlane_b32 s2, v23, 32 ; CHECK-NEXT: v_readlane_b32 s3, v23, 33 -; CHECK-NEXT: v_readlane_b32 s4, v23, 34 -; CHECK-NEXT: v_readlane_b32 s5, v23, 35 -; CHECK-NEXT: v_readlane_b32 s6, v23, 36 -; CHECK-NEXT: v_readlane_b32 s7, v23, 37 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[42:43] +; CHECK-NEXT: ; use s[18:19] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:55] +; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 34 +; CHECK-NEXT: v_readlane_b32 s1, v23, 35 +; CHECK-NEXT: v_readlane_b32 s2, v23, 36 +; CHECK-NEXT: v_readlane_b32 s3, v23, 37 +; CHECK-NEXT: v_readlane_b32 s4, v23, 38 +; CHECK-NEXT: 
v_readlane_b32 s5, v23, 39 +; CHECK-NEXT: v_readlane_b32 s6, v23, 40 +; CHECK-NEXT: v_readlane_b32 s7, v23, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 38 -; CHECK-NEXT: v_readlane_b32 s1, v23, 39 -; CHECK-NEXT: v_readlane_b32 s2, v23, 40 -; CHECK-NEXT: v_readlane_b32 s3, v23, 41 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[16:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[40:41] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[36:39] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:51] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s4, v23, 42 -; CHECK-NEXT: v_readlane_b32 s5, v23, 43 -; CHECK-NEXT: v_readlane_b32 s6, v23, 44 -; CHECK-NEXT: v_readlane_b32 s7, v23, 45 -; CHECK-NEXT: v_readlane_b32 s8, v23, 46 -; CHECK-NEXT: v_readlane_b32 s9, v23, 47 -; CHECK-NEXT: v_readlane_b32 s10, v23, 48 -; CHECK-NEXT: v_readlane_b32 s11, v23, 49 -; CHECK-NEXT: v_readlane_b32 s12, v23, 50 -; CHECK-NEXT: v_readlane_b32 s13, v23, 51 -; CHECK-NEXT: v_readlane_b32 s14, v23, 52 -; CHECK-NEXT: v_readlane_b32 s15, v23, 53 +; CHECK-NEXT: v_readlane_b32 s0, v23, 42 +; CHECK-NEXT: v_readlane_b32 s1, v23, 43 +; CHECK-NEXT: v_readlane_b32 s2, v23, 44 +; CHECK-NEXT: v_readlane_b32 s3, v23, 45 +; CHECK-NEXT: v_readlane_b32 s4, v23, 46 +; CHECK-NEXT: v_readlane_b32 s5, v23, 47 +; CHECK-NEXT: v_readlane_b32 s6, v23, 48 +; CHECK-NEXT: v_readlane_b32 s7, v23, 49 +; CHECK-NEXT: v_readlane_b32 s8, v23, 50 +; CHECK-NEXT: v_readlane_b32 s9, v23, 51 +; CHECK-NEXT: v_readlane_b32 s10, v23, 52 +; CHECK-NEXT: v_readlane_b32 s11, v23, 53 +; CHECK-NEXT: v_readlane_b32 s12, v23, 54 +; CHECK-NEXT: v_readlane_b32 s13, v23, 55 +; CHECK-NEXT: v_readlane_b32 s14, v23, 56 +; CHECK-NEXT: v_readlane_b32 s15, v23, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 54 -; CHECK-NEXT: v_readlane_b32 s1, v23, 55 -; CHECK-NEXT: v_readlane_b32 s2, v23, 56 -; CHECK-NEXT: v_readlane_b32 s3, v23, 57 +; CHECK-NEXT: v_readlane_b32 s0, v23, 58 +; CHECK-NEXT: v_readlane_b32 s1, v23, 59 +; CHECK-NEXT: v_readlane_b32 s2, v23, 60 +; CHECK-NEXT: v_readlane_b32 s3, v23, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ; use s[52:55] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 58 -; CHECK-NEXT: v_readlane_b32 s1, v23, 59 -; CHECK-NEXT: v_readlane_b32 s2, v23, 60 -; CHECK-NEXT: v_readlane_b32 s3, v23, 61 ; CHECK-NEXT: v_readlane_b32 s4, v23, 62 ; CHECK-NEXT: v_readlane_b32 s5, v23, 63 ; CHECK-NEXT: v_readlane_b32 s6, v0, 0 @@ -316,45 +306,57 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_readlane_b32 s1, v0, 3 ; CHECK-NEXT: v_readlane_b32 s2, v0, 4 ; CHECK-NEXT: v_readlane_b32 s3, v0, 5 -; CHECK-NEXT: v_readlane_b32 s4, v0, 6 -; CHECK-NEXT: v_readlane_b32 s5, v0, 7 -; CHECK-NEXT: v_readlane_b32 s6, v0, 8 -; CHECK-NEXT: v_readlane_b32 s7, v0, 9 -; CHECK-NEXT: v_readlane_b32 s8, v0, 10 -; CHECK-NEXT: v_readlane_b32 s9, v0, 11 -; CHECK-NEXT: v_readlane_b32 s10, v0, 12 -; CHECK-NEXT: v_readlane_b32 s11, v0, 13 -; CHECK-NEXT: v_readlane_b32 s12, v0, 14 -; CHECK-NEXT: v_readlane_b32 s13, v0, 15 -; CHECK-NEXT: v_readlane_b32 s14, v0, 16 -; CHECK-NEXT: v_readlane_b32 s15, v0, 17 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ; use s[36:51] ; 
CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 18 -; CHECK-NEXT: v_readlane_b32 s1, v0, 19 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ; use s[16:17] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 20 -; CHECK-NEXT: v_readlane_b32 s1, v0, 21 -; CHECK-NEXT: v_readlane_b32 s2, v0, 22 -; CHECK-NEXT: v_readlane_b32 s3, v0, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 24 -; CHECK-NEXT: v_readlane_b32 s1, v0, 25 -; CHECK-NEXT: v_readlane_b32 s2, v0, 26 -; CHECK-NEXT: v_readlane_b32 s3, v0, 27 -; CHECK-NEXT: v_readlane_b32 s4, v0, 28 -; CHECK-NEXT: v_readlane_b32 s5, v0, 29 -; CHECK-NEXT: v_readlane_b32 s6, v0, 30 -; CHECK-NEXT: v_readlane_b32 s7, v0, 31 +; CHECK-NEXT: v_readlane_b32 s0, v0, 6 +; CHECK-NEXT: v_readlane_b32 s1, v0, 7 +; CHECK-NEXT: v_readlane_b32 s2, v0, 8 +; CHECK-NEXT: v_readlane_b32 s3, v0, 9 +; CHECK-NEXT: v_readlane_b32 s4, v0, 10 +; CHECK-NEXT: v_readlane_b32 s5, v0, 11 +; CHECK-NEXT: v_readlane_b32 s6, v0, 12 +; CHECK-NEXT: v_readlane_b32 s7, v0, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 14 +; CHECK-NEXT: v_readlane_b32 s1, v0, 15 +; CHECK-NEXT: v_readlane_b32 s2, v0, 16 +; CHECK-NEXT: v_readlane_b32 s3, v0, 17 +; CHECK-NEXT: v_readlane_b32 s4, v0, 18 +; CHECK-NEXT: v_readlane_b32 s5, v0, 19 +; CHECK-NEXT: v_readlane_b32 s6, v0, 20 +; CHECK-NEXT: v_readlane_b32 s7, v0, 21 +; CHECK-NEXT: v_readlane_b32 s8, v0, 22 +; CHECK-NEXT: v_readlane_b32 s9, v0, 23 +; CHECK-NEXT: v_readlane_b32 s10, v0, 24 +; CHECK-NEXT: v_readlane_b32 s11, v0, 25 +; CHECK-NEXT: v_readlane_b32 s12, v0, 26 +; CHECK-NEXT: v_readlane_b32 s13, v0, 27 +; CHECK-NEXT: v_readlane_b32 s14, v0, 28 +; CHECK-NEXT: v_readlane_b32 s15, v0, 29 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 30 +; CHECK-NEXT: v_readlane_b32 s1, v0, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 32 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[20:23] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[24:31] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s1, v0, 33 ; CHECK-NEXT: v_readlane_b32 s2, v0, 34 ; CHECK-NEXT: v_readlane_b32 s3, v0, 35 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index d59660751cc187..974cb71900a4db 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -16,17 +16,17 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ 
-56,22 +56,22 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 @@ -101,56 +101,56 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 -; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_ashr_i32 s5, s4, 31 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: s_ashr_i32 s3, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_xor_b32 s6, s5, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: s_xor_b32 s3, s1, s3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s1, 0, s2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s5, 0, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s5 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 -; GFX9-NEXT: s_mul_i32 s8, s5, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; 
GFX9-NEXT: s_add_i32 s9, s5, 1 -; GFX9-NEXT: s_sub_i32 s8, s4, s7 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s8, s5 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mul_i32 s1, s1, s8 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 +; GFX9-NEXT: s_add_i32 s8, s8, s1 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s8 +; GFX9-NEXT: s_mul_i32 s8, s1, s2 +; GFX9-NEXT: s_sub_i32 s0, s0, s8 +; GFX9-NEXT: s_add_i32 s9, s1, 1 +; GFX9-NEXT: s_sub_i32 s8, s0, s2 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 +; GFX9-NEXT: s_cselect_b32 s1, s9, s1 +; GFX9-NEXT: s_cselect_b32 s0, s8, s0 +; GFX9-NEXT: s_add_i32 s8, s1, 1 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 +; GFX9-NEXT: s_cselect_b32 s0, s8, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s3 +; GFX9-NEXT: s_sub_i32 s0, s0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_i32: @@ -1373,17 +1373,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32_4: @@ -1672,20 +1672,20 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i23: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 ; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 ; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s4, 
s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(3) ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; TONGA-NEXT: s_waitcnt vmcnt(2) @@ -1710,25 +1710,25 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 -; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i23: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 ; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 ; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1753,7 +1753,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i23: @@ -1859,20 +1859,20 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i24: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 ; TONGA-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; TONGA-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 ; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(3) ; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; TONGA-NEXT: s_waitcnt vmcnt(2) @@ -1895,25 +1895,25 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24 -; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 
0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; GFX9-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 ; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i24: @@ -1997,17 +1997,17 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i25: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_bfe_i32 v2, v1, 0, 25 ; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1 @@ -2040,22 +2040,22 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: v_sdiv_i25: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25 ; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1 @@ -2088,59 +2088,59 @@ define amdgpu_kernel 
void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 -; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i25: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: s_bfe_i32 s3, s2, 0x190000 +; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10018 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x190000 -; GFX9-NEXT: s_bfe_i32 s6, s0, 0x10018 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x10018 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v1 -; GFX9-NEXT: s_bfe_i32 s5, s4, 0x190000 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x10018 -; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_xor_b32 s2, s0, s2 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s4, s6 -; GFX9-NEXT: s_xor_b32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, 0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s3 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s5 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 -; GFX9-NEXT: s_mul_i32 s8, s5, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_add_i32 s9, s5, 1 -; GFX9-NEXT: s_sub_i32 s8, s4, s7 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s8, s5 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x190000 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mul_i32 s1, s1, s8 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 +; GFX9-NEXT: s_add_i32 s8, s8, s1 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s8 +; GFX9-NEXT: s_mul_i32 s8, s1, s3 +; GFX9-NEXT: s_sub_i32 s0, s0, s8 +; GFX9-NEXT: s_add_i32 s9, s1, 1 +; GFX9-NEXT: s_sub_i32 s8, s0, s3 +; GFX9-NEXT: s_cmp_ge_u32 s0, s3 +; GFX9-NEXT: s_cselect_b32 s1, s9, s1 +; GFX9-NEXT: s_cselect_b32 s0, s8, s0 +; GFX9-NEXT: s_add_i32 s8, s1, 1 +; GFX9-NEXT: s_cmp_ge_u32 s0, s3 +; GFX9-NEXT: s_cselect_b32 s0, s8, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_sub_i32 s0, s0, s2 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x190000 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i25: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 4f2fd3f50494c9..0fa19d77fb5152 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -156,7 +156,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[8:9] ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 @@ -164,51 +164,51 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13 -; GCN-IR-NEXT: s_min_u32 s18, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s16, s14, s18 -; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[16:17], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63 -; GCN-IR-NEXT: s_or_b64 s[20:21], s[10:11], s[20:21] -; GCN-IR-NEXT: s_and_b64 s[10:11], s[20:21], exec -; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 -; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 +; GCN-IR-NEXT: s_min_u32 s16, s8, s9 +; GCN-IR-NEXT: s_sub_u32 s10, s14, s16 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[10:11], 63 +; GCN-IR-NEXT: s_or_b64 s[20:21], s[18:19], s[20:21] +; GCN-IR-NEXT: s_and_b64 s[18:19], s[20:21], exec +; GCN-IR-NEXT: s_cselect_b32 s19, 0, s13 +; GCN-IR-NEXT: s_cselect_b32 s18, 0, s12 ; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s20, s16, 1 -; GCN-IR-NEXT: s_addc_u32 s21, s17, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[20:21], 0 -; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16 +; GCN-IR-NEXT: s_add_u32 s18, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s19, s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[18:19], 0 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s20 -; GCN-IR-NEXT: s_add_u32 s19, s6, -1 +; GCN-IR-NEXT: s_lshr_b64 s[18:19], s[12:13], s18 +; GCN-IR-NEXT: s_add_u32 s17, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] -; GCN-IR-NEXT: s_add_u32 s12, s8, s18 +; GCN-IR-NEXT: s_add_u32 s12, s8, s16 ; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 +; GCN-IR-NEXT: s_lshl_b64 s[18:19], s[18:19], 1 ; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s19, s16 
-; GCN-IR-NEXT: s_subb_u32 s8, s20, s17 +; GCN-IR-NEXT: s_sub_u32 s8, s17, s18 +; GCN-IR-NEXT: s_subb_u32 s8, s20, s19 ; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 ; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s16, s16, s14 -; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 +; GCN-IR-NEXT: s_sub_u32 s18, s18, s14 +; GCN-IR-NEXT: s_subb_u32 s19, s19, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[12:13], 0 @@ -217,10 +217,10 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[18:19], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end ; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[18:19], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 ; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 @@ -239,14 +239,14 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v3, v4 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v4 -; GCN-NEXT: v_xor_b32_e32 v3, v5, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 -; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v2, vcc +; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, v3 +; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 +; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc ; GCN-NEXT: v_madmk_f32 v5, v6, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v5, v5 ; GCN-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 @@ -318,33 +318,33 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 +; GCN-NEXT: v_mul_lo_u32 v10, v3, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_mul_lo_u32 v9, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v9, v2, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, v1, v8 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; GCN-NEXT: v_subb_u32_e64 v9, s[4:5], v10, v2, vcc -; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v3 +; GCN-NEXT: v_subb_u32_e64 v9, s[4:5], v10, v3, vcc +; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v2 ; GCN-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; GCN-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 ; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v2 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 ; GCN-NEXT: v_cndmask_b32_e64 v9, v11, v10, s[4:5] ; GCN-NEXT: v_add_i32_e64 v10, s[4:5], 2, v5 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; 
GCN-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v6, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_add_i32_e64 v12, s[4:5], 1, v5 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e64 v13, s[4:5], 0, v6, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 ; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v9, v13, v11, s[4:5] @@ -366,8 +366,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v0, v4 -; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v1, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v1, v4, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v2 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 @@ -376,13 +376,13 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v12, v2, v3 -; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v10 +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v8 ; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 -; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v11 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v9 ; GCN-IR-NEXT: v_min_u32_e32 v13, v2, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v12, v13 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] ; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -391,8 +391,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 -; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v10, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v11, v9, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v8, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 @@ -401,46 +401,46 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[8:9], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0 +; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[8:9], v14 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_not_b32_e32 v9, v12 -; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[10:11], v14 -; GCN-IR-NEXT: v_not_b32_e32 v8, 0 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v9, v13 +; GCN-IR-NEXT: v_not_b32_e32 v8, v12 +; GCN-IR-NEXT: v_not_b32_e32 v9, 0 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; 
GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v8 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v10, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v10 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v16, v14 -; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v17, v15, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v16, v14 +; GCN-IR-NEXT: v_subb_u32_e32 v10, vcc, v17, v15, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v10 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 -; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v10, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1 ; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v14, v12 ; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v13, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v11 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v10 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -448,14 +448,14 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v1 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0 +; GCN-IR-NEXT: v_or_b32_e32 v11, v11, v1 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v6 -; GCN-IR-NEXT: v_xor_b32_e32 v3, v8, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v2, v9, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v10, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v11, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -993,7 +993,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GCN-IR-NEXT: s_or_b64 s[18:19], s[8:9], s[10:11] ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 @@ -1001,51 +1001,51 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13 -; GCN-IR-NEXT: s_min_u32 s18, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s16, s14, s18 -; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[16:17], 63 -; GCN-IR-NEXT: 
v_cmp_eq_u64_e64 s[22:23], s[16:17], 63 -; GCN-IR-NEXT: s_or_b64 s[20:21], s[10:11], s[20:21] -; GCN-IR-NEXT: s_and_b64 s[10:11], s[20:21], exec -; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 -; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 +; GCN-IR-NEXT: s_min_u32 s16, s8, s9 +; GCN-IR-NEXT: s_sub_u32 s10, s14, s16 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[10:11], 63 +; GCN-IR-NEXT: s_or_b64 s[20:21], s[18:19], s[20:21] +; GCN-IR-NEXT: s_and_b64 s[18:19], s[20:21], exec +; GCN-IR-NEXT: s_cselect_b32 s19, 0, s13 +; GCN-IR-NEXT: s_cselect_b32 s18, 0, s12 ; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s20, s16, 1 -; GCN-IR-NEXT: s_addc_u32 s21, s17, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[20:21], 0 -; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16 +; GCN-IR-NEXT: s_add_u32 s18, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s19, s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[18:19], 0 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s20 -; GCN-IR-NEXT: s_add_u32 s19, s6, -1 +; GCN-IR-NEXT: s_lshr_b64 s[18:19], s[12:13], s18 +; GCN-IR-NEXT: s_add_u32 s17, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] -; GCN-IR-NEXT: s_add_u32 s12, s8, s18 +; GCN-IR-NEXT: s_add_u32 s12, s8, s16 ; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 +; GCN-IR-NEXT: s_lshl_b64 s[18:19], s[18:19], 1 ; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s19, s16 -; GCN-IR-NEXT: s_subb_u32 s8, s20, s17 +; GCN-IR-NEXT: s_sub_u32 s8, s17, s18 +; GCN-IR-NEXT: s_subb_u32 s8, s20, s19 ; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 ; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s16, s16, s14 -; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 +; GCN-IR-NEXT: s_sub_u32 s18, s18, s14 +; GCN-IR-NEXT: s_subb_u32 s19, s19, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[12:13], 0 @@ -1054,21 +1054,21 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 ; GCN-IR-NEXT: .LBB9_4: ; %Flow4 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[18:19], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3] -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] +; GCN-IR-NEXT: s_xor_b64 s[2:3], 
s[18:19], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 ; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 -; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s14, -1 +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s10, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_short v0, off, s[12:15], 0 offset:4 +; GCN-IR-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:4 ; GCN-IR-NEXT: s_waitcnt expcnt(0) ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 @@ -1203,32 +1203,32 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s2, s2, s4 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s4 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_add_u32 s12, s10, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s13, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 63 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15] -; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec -; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 +; GCN-IR-NEXT: s_min_u32 s10, s8, s9 +; GCN-IR-NEXT: s_add_u32 s8, s10, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[8:9], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 63 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[12:13], s[14:15] +; GCN-IR-NEXT: s_and_b64 s[12:13], s[14:15], exec +; GCN-IR-NEXT: s_cselect_b32 s12, 0, 24 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_mov_b32 s9, 0 +; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s11, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s11 +; GCN-IR-NEXT: s_add_u32 s12, s8, 1 +; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[12:13], 0 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s8 ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s14 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s12 ; GCN-IR-NEXT: s_add_u32 s16, s2, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s3, -1 ; GCN-IR-NEXT: s_sub_u32 s10, 58, s10 @@ -1258,9 +1258,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3 ; GCN-IR-NEXT: .LBB10_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[2:3] ; GCN-IR-NEXT: .LBB10_5: ; %udiv-end -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[12:13], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 ; GCN-IR-NEXT: s_subb_u32 
s5, s7, s5 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index adce63c7e45e7f..b03353972ab664 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -692,17 +692,17 @@ define amdgpu_kernel void @select_v2f16( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s2 -; GFX11-NEXT: s_mov_b32 s15, s3 -; GFX11-NEXT: s_mov_b32 s22, s2 -; GFX11-NEXT: s_mov_b32 s23, s3 -; GFX11-NEXT: s_mov_b32 s18, s2 -; GFX11-NEXT: s_mov_b32 s19, s3 -; GFX11-NEXT: s_mov_b32 s26, s2 -; GFX11-NEXT: s_mov_b32 s27, s3 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX11-NEXT: s_mov_b32 s14, -1 +; GFX11-NEXT: s_mov_b32 s15, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s14 +; GFX11-NEXT: s_mov_b32 s3, s15 +; GFX11-NEXT: s_mov_b32 s22, s14 +; GFX11-NEXT: s_mov_b32 s23, s15 +; GFX11-NEXT: s_mov_b32 s18, s14 +; GFX11-NEXT: s_mov_b32 s19, s15 +; GFX11-NEXT: s_mov_b32 s26, s14 +; GFX11-NEXT: s_mov_b32 s27, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s20, s8 ; GFX11-NEXT: s_mov_b32 s21, s9 @@ -710,12 +710,12 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_mov_b32 s17, s7 ; GFX11-NEXT: s_mov_b32 s24, s10 ; GFX11-NEXT: s_mov_b32 s25, s11 -; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 ; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0 -; GFX11-NEXT: s_mov_b32 s0, s4 -; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s12, s4 +; GFX11-NEXT: s_mov_b32 s13, s5 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(2) @@ -730,7 +730,7 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[12:15], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index be0aa394dd99dc..6220da1bb14686 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -902,28 +902,28 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; SI-NEXT: 
buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll index 56d7fc335911ec..84d1be3aeb74f7 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll @@ -9,7 +9,8 @@ ; Flow does not fail during annotation. define void @my_func(i32 %0) { -; IR-LABEL: @my_func( +; IR-LABEL: define void @my_func( +; IR-SAME: i32 [[TMP0:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) null, align 8 ; IR-NEXT: br label [[NODEBLOCK:%.*]] @@ -40,7 +41,7 @@ define void @my_func(i32 %0) { ; IR-NEXT: [[TMP10]] = phi i1 [ [[TMP25:%.*]], [[FLOW15]] ], [ true, [[LEAFBLOCK]] ] ; IR-NEXT: br label [[FLOW11]] ; IR: NodeBlock7: -; IR-NEXT: [[PIVOT8:%.*]] = icmp sge i32 [[TMP0:%.*]], 2 +; IR-NEXT: [[PIVOT8:%.*]] = icmp sge i32 [[TMP0]], 2 ; IR-NEXT: [[TMP11:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[PIVOT8]]) ; IR-NEXT: [[TMP12:%.*]] = extractvalue { i1, i64 } [[TMP11]], 0 ; IR-NEXT: [[TMP13:%.*]] = extractvalue { i1, i64 } [[TMP11]], 1 @@ -145,45 +146,45 @@ define void @my_func(i32 %0) { ; GCN-NEXT: s_branch .LBB0_4 ; GCN-NEXT: .LBB0_10: ; %NodeBlock7 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 -; GCN-NEXT: s_mov_b64 s[8:9], 0 +; GCN-NEXT: s_mov_b64 s[10:11], 0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: ; %bb.11: ; %LeafBlock5 ; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: s_and_b64 s[8:9], vcc, exec +; GCN-NEXT: s_and_b64 s[10:11], vcc, exec ; GCN-NEXT: ; %bb.12: ; %Flow13 -; GCN-NEXT: s_andn2_saveexec_b64 s[10:11], s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] ; GCN-NEXT: ; %bb.13: ; %LeafBlock3 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec -; GCN-NEXT: s_andn2_b64 s[8:9], s[8:9], exec +; GCN-NEXT: s_andn2_b64 s[10:11], s[10:11], exec ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_and_b64 s[12:13], vcc, exec ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GCN-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] ; GCN-NEXT: ; %bb.14: ; %Flow14 -; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[8:9] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GCN-NEXT: s_cbranch_execz .LBB0_18 ; GCN-NEXT: ; %bb.15: ; %LeafBlock9 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 -; GCN-NEXT: s_mov_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b64 s[10:11], -1 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GCN-NEXT: ; %bb.16: ; %do.body.i.i.i.i ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_xor_b64 s[8:9], exec, -1 
+; GCN-NEXT: s_xor_b64 s[10:11], exec, -1 ; GCN-NEXT: ; %bb.17: ; %Flow16 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec -; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GCN-NEXT: .LBB0_18: ; %Flow15 -; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] ; GCN-NEXT: s_cbranch_execnz .LBB0_3 ; GCN-NEXT: s_branch .LBB0_4 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 08db1e7fee259d..0c538407d85753 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -61,101 +61,77 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: v_mov_b32_e32 v6, 0 ; GFX6-NEXT: s_mov_b32 s2, 0x3fd00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1268 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1380 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1272 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1276 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1280 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1384 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1388 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1392 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1300 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1412 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1304 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1308 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1312 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1416 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1420 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1424 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1332 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1444 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1336 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1340 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1344 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1448 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1452 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 
offset:1456 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1364 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1476 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1368 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1372 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1376 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1480 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1484 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1488 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1396 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1508 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1400 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1404 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1408 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1512 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1516 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1520 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1428 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1540 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1432 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1436 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1440 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1544 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1548 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1552 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1460 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1572 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1464 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1468 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1472 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1576 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1580 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1584 ; 4-byte 
Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1492 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1604 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1496 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1500 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1504 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1608 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1612 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1616 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1556 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1560 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1564 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1568 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:3984 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1588 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1592 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1596 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1600 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4000 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1620 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1624 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1628 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1632 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4016 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1652 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1656 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1660 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1664 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1684 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -163,7 +139,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1692 ; 4-byte Folded Spill 
; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1696 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1716 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -171,7 +147,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1724 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1728 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1748 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -179,7 +155,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1756 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1760 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1780 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -187,95 +163,103 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1788 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1792 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4048 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1812 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1816 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1820 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1824 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4064 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1844 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1848 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1852 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1856 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4080 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1876 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1880 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1884 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1888 ; 4-byte Folded Spill +; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1860 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1940 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1864 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1868 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1872 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1944 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1948 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1952 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1892 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1972 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1896 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1900 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1904 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1976 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1980 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1984 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1924 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2004 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1928 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1932 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1936 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2008 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2012 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2016 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1956 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2036 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1960 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1964 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1968 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2040 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2044 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2048 ; 4-byte Folded Spill ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1988 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2068 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1992 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1996 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2000 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2072 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2076 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2080 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2020 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2100 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2024 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2028 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2032 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2104 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2108 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2112 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2052 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2132 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2056 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2060 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2064 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2136 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2140 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2144 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2084 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2164 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2088 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2092 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2096 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2168 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2172 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2176 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt 
expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2148 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2152 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2156 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2160 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:3984 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2180 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2184 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2188 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2192 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4000 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2212 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2216 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2220 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2224 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2244 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -283,7 +267,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2252 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2256 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2276 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -291,7 +275,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2284 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2288 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2308 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -299,7 +283,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2316 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2320 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4064 
+; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2340 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -307,7 +291,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2348 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2352 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2372 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -315,87 +299,95 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2380 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2384 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4064 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2404 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2408 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2412 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2416 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4080 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2436 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2440 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2444 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2448 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2452 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2500 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2456 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2460 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2464 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2504 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2508 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2512 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2484 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2532 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2488 ; 4-byte Folded Spill -; GFX6-NEXT: 
buffer_store_dword v9, off, s[44:47], 0 offset:2492 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2496 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2536 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2540 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2544 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2516 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2564 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2520 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2524 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2528 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2568 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2572 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2576 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2548 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2596 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2552 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2556 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2560 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2600 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2604 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2608 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2580 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2628 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2584 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2588 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2592 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2632 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2636 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2640 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2612 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2660 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2616 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword 
v9, off, s[44:47], 0 offset:2620 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2624 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2664 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2668 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2672 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2644 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2692 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2648 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2652 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2656 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2696 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2700 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2704 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2676 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2724 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2680 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2684 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2688 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2728 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2732 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2736 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2740 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2744 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2748 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2752 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:3984 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2772 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2776 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2780 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2784 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2804 ; 4-byte Folded Spill 
; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -403,7 +395,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2812 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2816 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2836 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -411,7 +403,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2844 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2848 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2868 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -419,7 +411,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2876 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2880 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2900 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -427,7 +419,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2908 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2912 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2932 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -435,7 +427,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2940 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2944 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2964 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -443,69 +435,77 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2972 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2976 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4080 +; GFX6-NEXT: 
s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2996 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3000 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3004 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3008 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3044 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3060 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3048 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3052 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3056 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3064 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3068 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3072 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3076 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3092 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3080 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3084 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3088 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3096 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3100 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3104 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3108 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3124 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3112 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3116 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3120 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3128 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3132 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3136 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3140 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3156 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3144 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 
offset:3148 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3152 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3160 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3164 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3168 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3172 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3188 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3176 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3180 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3184 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3192 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3196 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3200 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3204 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3220 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3208 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3212 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3216 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3224 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3228 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3232 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3236 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3252 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3240 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3244 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3248 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3256 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3260 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3264 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3268 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3284 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3272 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3276 ; 
4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3280 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3288 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3292 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3296 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1206,13 +1206,29 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1264 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1268 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1272 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1276 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1280 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1280 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1284 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1288 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1292 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1296 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1280 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1296 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1300 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1304 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1308 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1312 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1312 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1316 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1220,7 +1236,15 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1324 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1328 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1296 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1328 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1332 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1336 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1340 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1344 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 
offset:1344 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1348 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1228,47 +1252,47 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1356 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1360 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1312 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1360 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1380 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1364 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1384 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1388 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1392 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1368 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1372 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1376 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1328 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1376 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1412 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1396 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1416 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1420 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1424 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1400 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1404 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1408 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1344 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1392 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1444 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1428 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1448 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1452 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1456 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1432 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1436 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1440 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1360 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1408 ; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1476 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1460 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1480 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1484 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1488 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1464 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1468 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1472 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1376 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1424 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1508 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1492 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1512 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1516 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1520 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1496 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1500 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1504 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1392 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1440 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1524 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1276,31 +1300,31 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1532 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1536 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1408 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1456 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1540 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1556 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1544 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1548 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1552 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1560 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1564 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1568 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1424 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1472 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword 
v0, off, s[44:47], 0 offset:1572 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1588 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1576 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1580 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1584 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1592 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1596 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1600 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1440 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1488 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1604 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1620 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1608 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1612 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1616 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1624 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1628 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1632 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1456 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1504 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1636 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1308,7 +1332,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1644 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1648 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1472 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1520 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1668 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1316,7 +1340,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1676 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1680 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1488 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1536 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1700 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1324,7 +1348,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1708 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1712 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt 
expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1504 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1552 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1732 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1332,7 +1356,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1740 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1744 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1520 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1568 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1764 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1340,7 +1364,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1772 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1776 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1536 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1584 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1796 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1348,15 +1372,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1804 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1808 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1552 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1812 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1816 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1820 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1824 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1568 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1600 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1828 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1364,23 +1380,23 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1836 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1840 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1584 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1616 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1844 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1860 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1848 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword 
v2, off, s[44:47], 0 offset:1852 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1856 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1864 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1868 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1872 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1600 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1632 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1876 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1892 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1880 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1884 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1888 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1896 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1900 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1904 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1616 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1648 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1908 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1388,55 +1404,55 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1916 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1920 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1632 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1664 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1940 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1924 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1944 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1948 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1952 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1928 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1932 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1936 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1648 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1680 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1972 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1956 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1976 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1980 ; 4-byte 
Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1984 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1960 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1964 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1968 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1664 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1696 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2004 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1988 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2008 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2012 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2016 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1992 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1996 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2000 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1680 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1712 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2036 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2020 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2040 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2044 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2048 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2024 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2028 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2032 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1696 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1728 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2068 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2052 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2072 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2076 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2080 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2056 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2060 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2064 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1712 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1744 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2100 ; 
4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2084 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2104 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2108 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2112 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2088 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2092 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2096 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1728 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1760 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2116 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1444,23 +1460,23 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2124 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2128 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1744 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1776 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2132 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2148 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2136 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2140 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2144 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2152 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2156 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2160 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1760 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1792 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2164 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2180 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2168 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2172 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2176 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2184 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2188 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2192 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1776 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1808 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2196 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt 
vmcnt(0) @@ -1468,7 +1484,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2204 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2208 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1792 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1824 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2228 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1476,7 +1492,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2236 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2240 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1808 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1840 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2260 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1484,7 +1500,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2268 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2272 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1824 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1856 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2292 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1492,7 +1508,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2300 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2304 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1840 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1872 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2324 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1500,7 +1516,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2332 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2336 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1856 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1888 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2356 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1508,7 +1524,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2364 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2368 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1872 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], 
s[16:19], 0 addr64 offset:1904 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2388 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1516,15 +1532,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2396 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2400 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1888 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2404 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2408 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2412 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2416 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1904 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1920 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2420 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1532,15 +1540,15 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2428 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2432 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1920 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1936 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2436 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2452 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2440 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2444 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2448 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2456 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2460 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2464 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1936 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1952 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2468 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1548,79 +1556,79 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2476 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2480 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1952 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2500 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2504 ; 4-byte Folded 
Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2508 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2512 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1968 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2532 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2484 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2536 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2540 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2544 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2488 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2492 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2496 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1984 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2564 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2516 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2568 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2572 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2576 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2520 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2524 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2528 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2596 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2548 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2600 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2604 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2608 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2552 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2556 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2560 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2016 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2628 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2580 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2632 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2636 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2640 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2584 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v2, off, s[44:47], 0 offset:2588 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2592 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2032 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2660 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2612 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2664 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2668 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2672 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2616 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2620 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2624 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2048 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2692 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2644 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2696 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2700 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2704 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2648 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2652 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2656 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2064 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2676 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2680 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2684 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2688 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2080 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2708 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2712 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2716 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2720 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2096 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2724 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2740 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2728 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, 
s[44:47], 0 offset:2732 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2736 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2744 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2748 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2752 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2096 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2112 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2756 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1628,7 +1636,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2764 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2768 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2112 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2128 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2788 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1636,7 +1644,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2796 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2800 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2128 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2144 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2820 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1644,7 +1652,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2828 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2832 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2144 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2160 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2852 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1652,7 +1660,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2860 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2864 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2176 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2884 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1660,7 +1668,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2892 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2896 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: 
buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2176 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2192 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2916 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1668,7 +1676,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2924 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2928 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2208 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2948 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1676,7 +1684,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2956 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2960 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2208 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2224 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2980 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1684,14 +1692,6 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2988 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2992 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2224 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2996 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3000 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3004 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3008 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2240 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3012 ; 4-byte Folded Spill @@ -1710,67 +1710,67 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2272 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3060 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3044 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3064 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3068 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3072 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3048 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3052 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3056 ; 4-byte 
Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2288 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3092 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3076 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3096 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3100 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3104 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3080 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3084 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3088 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2304 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3124 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3108 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3128 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3132 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3136 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3112 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3116 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3120 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2320 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3156 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3140 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3160 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3164 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3168 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3144 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3148 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3152 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2336 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3188 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3172 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3192 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3196 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3200 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3176 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3180 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3184 ; 4-byte Folded Spill ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2352 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3220 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3204 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3224 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3228 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3232 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3208 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3212 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3216 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2368 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3252 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3236 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3256 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3260 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3264 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3240 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3244 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3248 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2384 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3284 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3268 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3288 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3292 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3296 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3272 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3276 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3280 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2400 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2749,395 +2749,395 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3268 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3272 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3276 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3280 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3284 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3288 ; 4-byte Folded 
Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3292 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3296 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3236 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3240 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3244 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3248 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3252 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3256 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3260 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3264 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3204 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3208 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3212 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3216 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3220 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3224 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3228 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3232 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3172 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3176 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3180 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3184 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3188 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3192 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3196 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3200 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3140 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3144 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3148 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3152 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3156 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3160 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3164 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v12, off, s[44:47], 0 offset:3168 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3108 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3112 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3116 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3120 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3124 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3128 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3132 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3136 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3076 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3080 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3084 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3088 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3092 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3096 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3100 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3104 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3044 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3048 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3052 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3056 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3060 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3064 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3068 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3072 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2996 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3000 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3004 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3008 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4080 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2964 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2968 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2972 ; 4-byte Folded 
Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2976 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2932 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2936 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2940 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2944 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2900 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2904 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2908 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2912 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2868 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2872 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2876 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2880 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2836 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2840 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2844 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2848 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2804 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2808 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2812 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2816 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2772 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2776 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2780 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 
offset:2784 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:3984 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2740 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2744 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2748 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2752 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2676 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2680 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2684 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2688 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2724 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2728 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2732 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2736 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2644 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2648 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2652 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2656 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2692 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2696 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2700 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2704 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2612 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2616 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2620 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2624 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2660 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2664 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2668 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2672 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2580 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2584 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2588 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v12, off, s[44:47], 0 offset:2592 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2628 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2632 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2636 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2640 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2548 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2552 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2556 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2560 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2596 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2600 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2604 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2608 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2516 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2520 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2524 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2528 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2564 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2568 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2572 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2576 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2484 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2488 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2492 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2496 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2532 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2536 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2540 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2544 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2452 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2456 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2460 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2464 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 
0 offset:2500 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2504 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2508 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2512 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2436 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2440 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2444 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2448 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4080 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2404 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2408 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2412 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2416 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4064 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2372 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2376 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2380 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2384 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2340 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2344 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2348 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2352 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2308 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2312 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2316 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2320 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2276 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2280 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2284 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2288 ; 4-byte Folded 
Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2244 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2248 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2252 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2256 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2212 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2216 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2220 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2224 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4000 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2180 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2184 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2188 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2192 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:3984 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2148 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2152 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2156 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2160 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2084 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2088 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2092 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2096 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2164 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2168 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2172 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2176 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2052 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2056 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2060 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2064 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v9, off, s[44:47], 0 offset:2132 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2136 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2140 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2144 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2020 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2024 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2028 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2032 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2100 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2104 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2108 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2112 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1988 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1992 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1996 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2000 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2068 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2072 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2076 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2080 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1956 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1960 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1964 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1968 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2036 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2040 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2044 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2048 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1924 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1928 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1932 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1936 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2004 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 
0 offset:2008 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2012 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2016 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1892 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1896 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1900 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1904 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1972 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1976 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1980 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1984 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1860 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1864 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1868 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1872 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1940 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1944 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1948 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1952 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1876 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1880 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1884 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1888 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4080 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1844 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1848 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1852 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1856 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4064 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1812 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1816 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1820 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1824 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4048 
+; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1780 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1784 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1788 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1792 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1748 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1752 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1756 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1760 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1716 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1720 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1724 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1728 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1684 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1688 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1692 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1696 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1652 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1656 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1660 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1664 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4016 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1620 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1624 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1628 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1632 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4000 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1588 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1592 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v11, off, s[44:47], 0 offset:1596 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1600 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:3984 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1556 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1560 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1564 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1568 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1492 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1496 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1500 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1504 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1604 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1608 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1612 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1616 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1460 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1464 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1468 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1472 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1572 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1576 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1580 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1584 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1428 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1432 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1436 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1440 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1540 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1544 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1548 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1552 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1396 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1400 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1404 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1408 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1508 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1512 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1516 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1520 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1364 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1368 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1372 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1376 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1476 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1480 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1484 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1488 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1332 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1336 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1340 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1344 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1444 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1448 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1452 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1456 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1300 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1304 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1308 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1312 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1412 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1416 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1420 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1424 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1268 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1272 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1276 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, 
off, s[44:47], 0 offset:1280 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1380 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1384 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1388 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1392 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:3968 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4080 @@ -3941,59 +3941,59 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3284 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3288 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3292 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3296 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3268 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3272 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3276 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3280 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2384 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3252 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3256 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3260 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3264 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3236 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3240 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3244 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3248 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3220 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3224 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3228 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3232 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3204 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3208 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3212 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3216 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2352 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3188 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3192 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3196 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3200 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3172 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3176 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3180 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3184 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3156 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3160 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3164 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3168 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3140 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3144 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3148 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3152 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3124 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3128 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3132 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3136 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3108 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3112 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3116 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3120 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2304 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3092 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3096 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3100 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3104 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3076 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3080 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3084 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3088 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3060 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3064 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3068 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 
offset:3072 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3044 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3048 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3052 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3056 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2272 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -4011,431 +4011,431 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2996 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3000 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3004 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3008 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2224 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2980 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2984 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2988 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2992 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2208 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2224 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2948 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2952 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2956 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2960 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2192 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2208 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2916 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2920 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2924 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2928 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2176 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2192 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2884 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2888 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2892 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2896 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2160 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2176 ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2852 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2856 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2860 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2864 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2144 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2820 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2824 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2828 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2832 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2128 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2788 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2792 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2796 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2800 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2112 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2756 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2760 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2764 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2768 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2096 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2724 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2728 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2732 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2736 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2740 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2744 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2748 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2752 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2080 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2096 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2708 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2712 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2716 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 
offset:2720 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2080 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2676 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2680 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2684 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2688 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2692 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2696 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2700 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2704 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2644 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2648 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2652 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2656 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2660 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2664 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2668 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2672 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2612 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2616 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2620 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2624 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2628 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2632 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2636 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2640 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2580 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2584 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2588 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2592 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2596 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2600 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2604 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2608 ; 
4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2548 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2552 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2556 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2560 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2564 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2568 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2572 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2576 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2516 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2520 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2524 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2528 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2532 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2536 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2540 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2544 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2484 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2488 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2492 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2496 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1968 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2500 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2504 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2508 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2512 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1952 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2468 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2472 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2476 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2480 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1936 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1952 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2436 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2440 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2444 ; 4-byte Folded Reload -; 
GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2448 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2452 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2456 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2460 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2464 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1920 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1936 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2420 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2424 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2428 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2432 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1904 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2404 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2408 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2412 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2416 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1888 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1920 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2388 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2392 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2396 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2400 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1872 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1904 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2356 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2360 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2364 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2368 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1856 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1888 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2324 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2328 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2332 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2336 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1840 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1872 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2292 ; 4-byte Folded Reload ; GFX6-NEXT: 
buffer_load_dword v1, off, s[44:47], 0 offset:2296 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2300 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2304 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1824 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1856 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2260 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2264 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2268 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2272 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1808 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1840 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2228 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2232 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2236 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2240 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1792 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1824 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2196 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2200 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2204 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2208 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1776 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1808 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2164 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2168 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2172 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2176 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2180 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2184 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2188 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2192 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1760 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1792 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2132 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2136 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2140 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2144 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2148 ; 4-byte Folded 
Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2152 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2156 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2160 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1744 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1776 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2116 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2120 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2124 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2128 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1760 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2084 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2088 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2092 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2096 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1744 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2052 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2056 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2060 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2064 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1728 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2100 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2104 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2108 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2112 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2020 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2024 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2028 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2032 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1712 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2068 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2072 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2076 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2080 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1988 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1992 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1996 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2000 ; 4-byte Folded Reload ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1696 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2036 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2040 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2044 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2048 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1956 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1960 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1964 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1968 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1680 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2004 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2008 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2012 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2016 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1924 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1928 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1932 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1936 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1664 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1972 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1976 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1980 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1984 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1648 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1940 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1944 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1948 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1952 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1632 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1908 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1912 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1916 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1920 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1616 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1648 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1876 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1880 ; 
4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1884 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1888 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1892 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1896 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1900 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1904 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1600 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1632 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1844 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1848 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1852 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1856 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1860 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1864 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1868 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1872 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1584 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1616 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1828 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1832 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1836 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1840 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1568 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1812 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1816 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1820 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1824 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1552 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1600 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1796 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1800 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1804 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1808 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1536 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1584 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1764 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1768 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword 
v2, off, s[44:47], 0 offset:1772 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1776 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1520 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1568 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1732 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1736 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1740 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1744 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1504 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1552 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1700 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1704 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1708 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1712 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1488 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1536 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1668 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1672 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1676 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1680 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1472 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1520 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1636 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1640 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1644 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1648 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1456 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1504 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1604 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1608 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1612 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1616 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1620 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1624 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1628 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1632 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1440 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 
offset:1488 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1572 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1576 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1580 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1584 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1588 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1592 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1596 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1600 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1424 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1472 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1540 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1544 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1548 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1552 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1556 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1560 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1564 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1568 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1408 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1456 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1524 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1528 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1532 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1536 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1392 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1440 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1508 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1512 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1516 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1520 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1492 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1496 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1500 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1504 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1376 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1424 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1476 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 
offset:1480 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1484 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1488 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1460 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1464 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1468 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1472 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1360 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1408 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1444 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1448 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1452 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1456 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1428 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1432 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1436 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1440 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1344 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1392 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1412 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1416 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1420 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1424 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1396 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1400 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1404 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1408 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1328 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1376 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1380 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1384 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1388 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1392 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1364 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1368 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1372 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1376 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1312 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1360 ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1348 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1352 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1356 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1360 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1296 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1344 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1332 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1336 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1340 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1344 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1328 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1316 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1320 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1324 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1328 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1280 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1312 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1300 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1304 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1308 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1312 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1296 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1284 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1288 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1292 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1296 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1280 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1268 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1272 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1276 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1280 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1264 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1252 ; 4-byte Folded Reload @@ -10093,7 +10093,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 -; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: s_mov_b32 
s2, 0x80c00 ; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10103,24 +10103,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224 -; GFX6-NEXT: s_mov_b32 s2, 0x83400 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208 -; GFX6-NEXT: s_mov_b32 s2, 0x83000 +; GFX6-NEXT: s_mov_b32 s2, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:208 +; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176 ; GFX6-NEXT: s_mov_b32 s2, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10129,8 +10122,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176 -; GFX6-NEXT: s_mov_b32 s2, 0x82800 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160 +; GFX6-NEXT: s_mov_b32 s2, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10138,7 +10131,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144 ; GFX6-NEXT: s_mov_b32 s2, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10147,8 +10140,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt 
expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144 -; GFX6-NEXT: s_mov_b32 s2, 0x82000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128 +; GFX6-NEXT: s_mov_b32 s2, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10156,8 +10149,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128 -; GFX6-NEXT: s_mov_b32 s2, 0x81c00 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112 +; GFX6-NEXT: s_mov_b32 s2, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10165,8 +10158,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112 -; GFX6-NEXT: s_mov_b32 s2, 0x81800 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96 +; GFX6-NEXT: s_mov_b32 s2, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10174,8 +10167,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96 -; GFX6-NEXT: s_mov_b32 s2, 0x81400 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80 +; GFX6-NEXT: s_mov_b32 s2, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10183,15 +10176,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80 -; GFX6-NEXT: s_mov_b32 s2, 0x81000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:64 +; GFX6-NEXT: s_mov_b32 s2, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 ; GFX6-NEXT: buffer_load_dwordx4 
v[9:12], v[7:8], s[4:7], 0 addr64 offset:16 @@ -10202,7 +10194,15 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:32 +; GFX6-NEXT: s_mov_b32 s2, 0x83400 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -10218,7 +10218,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[8:9] ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[7:8], s[4:7], 0 addr64 offset:48 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: s_mov_b32 s2, 0x83800 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10251,7 +10251,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[34:35], 0, v0 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[8:15] ; GFX6-NEXT: ;;#ASMEND @@ -10270,10 +10270,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_and_saveexec_b64 s[34:35], vcc +; GFX6-NEXT: s_and_saveexec_b64 vcc, s[34:35] ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10285,18 +10285,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s13, 5 ; GFX6-NEXT: v_writelane_b32 v4, s14, 6 ; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x84400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x84400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x83c00 +; GFX6-NEXT: s_mov_b32 s36, 0x83c00 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s8, v4, 0 ; GFX6-NEXT: v_readlane_b32 s9, v4, 1 @@ -10308,8 +10308,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s15, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10321,18 +10321,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s21, 5 ; GFX6-NEXT: v_writelane_b32 v4, s22, 6 ; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x84c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x84c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x84400 +; GFX6-NEXT: s_mov_b32 s36, 0x84400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s16, v4, 0 ; GFX6-NEXT: v_readlane_b32 s17, v4, 1 @@ -10344,8 +10344,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s23, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10357,18 +10357,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s29, 5 ; GFX6-NEXT: v_writelane_b32 v4, s30, 6 ; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x85400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x85400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x84c00 +; GFX6-NEXT: s_mov_b32 s36, 0x84c00 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s24, v4, 0 ; GFX6-NEXT: v_readlane_b32 s25, 
v4, 1 @@ -10380,8 +10380,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s31, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10389,12 +10389,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s1, 1 ; GFX6-NEXT: v_writelane_b32 v4, s2, 2 ; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s38, 0x85c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x85c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 exec, s[34:35] ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -10403,8 +10403,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s5, 1 ; GFX6-NEXT: v_writelane_b32 v4, s6, 2 ; GFX6-NEXT: v_writelane_b32 v4, s7, 3 -; GFX6-NEXT: s_mov_b32 s36, 0x86000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s34, 0x86000 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10421,12 +10421,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x85400 +; GFX6-NEXT: s_mov_b32 s36, 0x85400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s0, v4, 0 ; GFX6-NEXT: v_readlane_b32 s1, v4, 1 @@ -10438,13 +10438,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s7, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2180 +; GFX6-NEXT: s_mov_b32 s44, 0x86000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s36, v4, 0 ; GFX6-NEXT: v_readlane_b32 s37, v4, 1 @@ -10452,8 +10452,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr 
addrspac ; GFX6-NEXT: v_readlane_b32 s39, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 vcc, s[34:35] +; GFX6-NEXT: s_mov_b64 exec, s[34:35] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 3 ; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 @@ -10469,7 +10468,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_mov_b64 s[34:35], vcc ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: s_mov_b32 s6, 0x85c00 @@ -10485,38 +10483,29 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[4:5] ; GFX6-NEXT: s_mov_b32 s2, 0x83c00 -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84400 ; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(4) -; GFX6-NEXT: v_mov_b32_e32 v0, v17 -; GFX6-NEXT: v_mov_b32_e32 v1, v18 -; GFX6-NEXT: v_mov_b32_e32 v2, v19 -; GFX6-NEXT: v_mov_b32_e32 v3, v20 +; GFX6-NEXT: s_mov_b32 s2, 0x84400 +; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: v_mov_b32_e32 v20, v3 +; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s2, 0x83c00 ; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 -; GFX6-NEXT: v_mov_b32_e32 v19, v2 -; GFX6-NEXT: v_mov_b32_e32 v18, v1 -; GFX6-NEXT: v_mov_b32_e32 v17, v0 -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, 
s[40:43], s2 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART @@ -10530,7 +10519,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: .LBB1_2: ; %ret -; GFX6-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX6-NEXT: s_or_b64 exec, exec, vcc ; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: s_mov_b32 s8, 0x80400 @@ -10545,14 +10534,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[2:3] -; GFX6-NEXT: s_mov_b32 s4, 0x83800 +; GFX6-NEXT: s_mov_b32 s4, 0x80c00 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[5:6], 8 ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] -; GFX6-NEXT: s_mov_b32 s4, 0x83400 +; GFX6-NEXT: s_mov_b32 s4, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10560,91 +10549,92 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x83000 +; GFX6-NEXT: s_mov_b32 s4, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[13:16], v[4:5], s[0:3], 0 addr64 offset:192 +; GFX6-NEXT: s_waitcnt expcnt(2) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82c00 +; GFX6-NEXT: s_mov_b32 s4, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82800 +; GFX6-NEXT: s_mov_b32 s4, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, 
off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82400 +; GFX6-NEXT: s_mov_b32 s4, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82000 +; GFX6-NEXT: s_mov_b32 s4, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81c00 +; GFX6-NEXT: s_mov_b32 s4, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81800 +; GFX6-NEXT: s_mov_b32 s4, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81400 +; GFX6-NEXT: s_mov_b32 s4, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81000 +; GFX6-NEXT: s_mov_b32 s4, 0x83800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96 +; GFX6-NEXT: 
buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80c00 +; GFX6-NEXT: s_mov_b32 s4, 0x83400 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 -; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[13:16], v[4:5], s[0:3], 0 addr64 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload @@ -10662,60 +10652,57 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:240 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v0, s[38:39] offset:240 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:224 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 1 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v0, s[38:39] offset:224 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:208 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[12:15], v0, s[38:39] offset:208 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:192 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v0, s[38:39] offset:192 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:176 -; 
GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v0, s[38:39] offset:176 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:160 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v0, s[38:39] offset:160 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:144 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:128 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:112 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:96 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:80 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:64 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:48 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:32 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:16 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v0, s[38:39] offset:16 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v0, s[38:39] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -10749,24 +10736,24 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: 
s_movk_i32 s0, 0x20d0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_nop 0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART @@ -10781,61 +10768,59 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 8, v[5:6] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s37 ; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s36, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:240 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[8:11], off offset:224 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:208 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:192 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:176 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:160 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:240 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:224 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:208 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: 
s_movk_i32 s0, 0x2050 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:144 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:128 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:192 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[8:11], off offset:176 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:160 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:112 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:96 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:144 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:128 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:80 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:112 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:64 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:96 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:48 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:80 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:64 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:48 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:16 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-FLATSCR-NEXT: s_endpgm ; @@ -10854,21 +10839,21 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; GFX10-FLATSCR-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-FLATSCR-NEXT: s_clause 0xf -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[64:67], v0, s[38:39] offset:240 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[60:63], v0, s[38:39] offset:224 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[56:59], v0, s[38:39] offset:208 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[52:55], v0, s[38:39] offset:192 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[48:51], v0, s[38:39] offset:176 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[44:47], v0, s[38:39] offset:160 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[40:43], v0, s[38:39] offset:144 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v0, s[38:39] offset:240 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[52:55], v0, s[38:39] offset:224 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[12:15], v0, s[38:39] offset:208 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[44:47], v0, s[38:39] offset:192 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v0, s[38:39] offset:176 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[40:43], v0, s[38:39] offset:160 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[60:63], v0, s[38:39] offset:144 ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[36:39], v0, s[38:39] offset:128 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[32:35], v0, s[38:39] offset:112 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[28:31], v0, s[38:39] offset:96 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[24:27], v0, s[38:39] offset:80 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v0, s[38:39] offset:64 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v0, s[38:39] offset:48 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[12:15], v0, s[38:39] offset:32 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v0, s[38:39] offset:16 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[48:51], v0, s[38:39] offset:112 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[32:35], v0, s[38:39] offset:96 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[28:31], v0, s[38:39] offset:80 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[64:67], v0, s[38:39] offset:64 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[24:27], v0, s[38:39] offset:48 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[56:59], v0, s[38:39] offset:32 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v0, s[38:39] offset:16 ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v0, s[38:39] ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, 16 @@ -11034,22 +11019,22 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 8, v[5:6] ; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, vcc_lo, s36, v4 ; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s37, v5, vcc_lo -; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[64:67], off offset:240 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[60:63], off offset:224 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[56:59], off offset:208 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[52:55], off offset:192 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[48:51], off offset:176 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[44:47], off offset:160 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[40:43], off offset:144 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:240 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[52:55], off offset:224 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:208 +; GFX10-FLATSCR-NEXT: 
global_store_dwordx4 v[4:5], v[44:47], off offset:192 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[8:11], off offset:176 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[40:43], off offset:160 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[60:63], off offset:144 ; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[36:39], off offset:128 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[32:35], off offset:112 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[28:31], off offset:96 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[24:27], off offset:80 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:64 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:48 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:32 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[8:11], off offset:16 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[48:51], off offset:112 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[32:35], off offset:96 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[28:31], off offset:80 +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[64:67], off offset:64 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[24:27], off offset:48 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[56:59], off offset:32 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:16 ; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-FLATSCR-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll index 314785cdbefd61..daf7c0305b2bc6 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll @@ -85,13 +85,13 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(ptr addrspace(1) %p) #0 { ; GFX908-DAG: v_accvgpr_read_b32 ; GFX900: NumVgprs: 256 -; GFX900: ScratchSize: 132 -; GFX908: NumVgprs: 252 +; GFX900: ScratchSize: 148 +; GFX908: NumVgprs: 254 ; GFX908: ScratchSize: 0 ; GCN900: VGPRBlocks: 63 ; GCN908: VGPRBlocks: 62 ; GFX900: NumVGPRsForWavesPerEU: 256 -; GFX908: NumVGPRsForWavesPerEU: 252 +; GFX908: NumVGPRsForWavesPerEU: 254 define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid @@ -137,7 +137,7 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 { ; GFX900: NumVgprs: 256 ; GFX908: NumVgprs: 254 -; GFX900: ScratchSize: 1796 +; GFX900: ScratchSize: 1412 ; GFX908: ScratchSize: 0 ; GFX900: VGPRBlocks: 63 ; GFX908: VGPRBlocks: 63 diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 844c346bfca381..62fd2c9658e666 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -597,54 +597,54 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: 
s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], v13 ; SI-NEXT: v_ashr_i64 v[4:5], v[4:5], v11 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_ashrrev_i64 v[2:3], v10, v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i64 v[6:7], v13, v[6:7] ; VI-NEXT: v_ashrrev_i64 v[4:5], v11, v[4:5] ; VI-NEXT: v_ashrrev_i64 v[0:1], v8, v[0:1] -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 24319a639da447..f50b9eab48441e 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -135,11 +135,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 ; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s14, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_min_u32 s12, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s14, s10, s12 +; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[16:17] ; GCN-IR-NEXT: s_and_b64 s[8:9], s[16:17], exec ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 @@ -150,41 +150,41 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; 
GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: s_add_u32 s16, s14, 1 +; GCN-IR-NEXT: s_addc_u32 s17, s15, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 +; GCN-IR-NEXT: s_sub_i32 s13, 63, s14 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s13 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s16 ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_add_u32 s10, s6, s14 +; GCN-IR-NEXT: s_add_u32 s10, s6, s12 ; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 +; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 ; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s6, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5] -; GCN-IR-NEXT: s_sub_u32 s12, s12, s14 -; GCN-IR-NEXT: s_subb_u32 s13, s13, s15 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s6, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s13, s12 +; GCN-IR-NEXT: s_and_b32 s6, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s13 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 @@ -218,14 +218,14 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v3, v4 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v4 -; GCN-NEXT: v_xor_b32_e32 v3, v5, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 -; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v2, vcc +; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 +; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -297,34 +297,34 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, 
vcc -; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 -; GCN-NEXT: v_mul_hi_u32 v7, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v8, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v7, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, v1, v5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v2, vcc -; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v3 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v3, vcc +; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2 ; GCN-NEXT: v_subbrev_u32_e64 v8, s[6:7], 0, v4, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 ; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] -; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v2 -; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v2, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] -; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v7, v3 +; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v7, v2 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] @@ -877,28 +877,28 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem33_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], 31 +; GCN-NEXT: s_ashr_i64 s[4:5], s[0:1], 31 ; GCN-NEXT: s_ashr_i32 s0, s1, 31 -; GCN-NEXT: s_add_u32 s8, s8, s0 +; GCN-NEXT: s_add_u32 s4, s4, s0 ; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s9, s9, s0 -; GCN-NEXT: s_xor_b64 s[12:13], s[8:9], s[0:1] +; GCN-NEXT: s_addc_u32 s5, s5, s0 +; GCN-NEXT: s_xor_b64 s[12:13], s[4:5], s[0:1] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GCN-NEXT: s_sub_u32 s0, 0, s12 ; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: s_ashr_i32 s6, s7, 31 +; GCN-NEXT: s_ashr_i32 s10, s11, 31 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: 
s_mov_b32 s4, s8 +; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -947,11 +947,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: s_add_u32 s0, s2, s6 +; GCN-NEXT: s_add_u32 s0, s2, s10 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_addc_u32 s1, s3, s6 +; GCN-NEXT: s_addc_u32 s1, s3, s10 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[6:7] +; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[10:11] ; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 ; GCN-NEXT: v_mul_hi_u32 v4, s14, v1 @@ -1001,12 +1001,12 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s6, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem33_64: @@ -1037,11 +1037,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s16, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s14, s12, s16 -; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[14:15], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 63 +; GCN-IR-NEXT: s_min_u32 s14, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s16, s12, s14 +; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[16:17], 63 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19] ; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 @@ -1051,41 +1051,41 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s18, s14, 1 -; GCN-IR-NEXT: s_addc_u32 s19, s15, 0 +; GCN-IR-NEXT: s_add_u32 s18, s16, 1 +; GCN-IR-NEXT: s_addc_u32 s19, s17, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 +; GCN-IR-NEXT: s_sub_i32 s15, 63, s16 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s15 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s18 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[2:3], s18 ; GCN-IR-NEXT: s_add_u32 s18, s8, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[12:13] -; GCN-IR-NEXT: s_add_u32 s12, s6, s16 +; GCN-IR-NEXT: s_add_u32 s12, s6, s14 ; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 +; 
GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 ; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s6, s18, s14 -; GCN-IR-NEXT: s_subb_u32 s6, s19, s15 -; GCN-IR-NEXT: s_ashr_i32 s16, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s6, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s6, s18, s16 +; GCN-IR-NEXT: s_subb_u32 s6, s19, s17 +; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s15, s14 +; GCN-IR-NEXT: s_and_b32 s6, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s16, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 ; GCN-IR-NEXT: .LBB8_4: ; %Flow7 @@ -1188,11 +1188,11 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s4 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s5 -; GCN-IR-NEXT: s_min_u32 s16, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s14, s12, s16 -; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[14:15], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 63 +; GCN-IR-NEXT: s_min_u32 s14, s8, s9 +; GCN-IR-NEXT: s_sub_u32 s16, s12, s14 +; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[16:17], 63 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19] ; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s5 @@ -1202,41 +1202,41 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s18, s14, 1 -; GCN-IR-NEXT: s_addc_u32 s19, s15, 0 +; GCN-IR-NEXT: s_add_u32 s18, s16, 1 +; GCN-IR-NEXT: s_addc_u32 s19, s17, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 +; GCN-IR-NEXT: s_sub_i32 s15, 63, s16 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s14 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s15 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[4:5], s18 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[4:5], s18 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] -; GCN-IR-NEXT: s_add_u32 s12, s8, s16 +; GCN-IR-NEXT: s_add_u32 s12, s8, s14 ; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 +; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB9_3: ; 
%udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 ; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[8:9] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s18, s14 -; GCN-IR-NEXT: s_subb_u32 s8, s19, s15 -; GCN-IR-NEXT: s_ashr_i32 s16, s8, 31 -; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s8, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s8, s18, s16 +; GCN-IR-NEXT: s_subb_u32 s8, s19, s17 +; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 +; GCN-IR-NEXT: s_mov_b32 s15, s14 +; GCN-IR-NEXT: s_and_b32 s8, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s16, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 ; GCN-IR-NEXT: .LBB9_4: ; %Flow4 @@ -1584,22 +1584,22 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4] +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 @@ -1647,14 +1647,14 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 -; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v6 ; GCN-IR-NEXT: .LBB11_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v3 +; GCN-IR-NEXT: v_mul_lo_u32 
v0, v0, v3 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc @@ -1775,14 +1775,14 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 @@ -1790,9 +1790,9 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 @@ -1841,14 +1841,14 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 -; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v6 ; GCN-IR-NEXT: .LBB12_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v3 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 7bc13f41262ca7..d73cc7366f5fe1 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -258,28 +258,28 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; 
SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v11 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/swdev373493.ll b/llvm/test/CodeGen/AMDGPU/swdev373493.ll index 4f33e19835172a..4d1d88d643f151 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev373493.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev373493.ll @@ -22,16 +22,16 @@ define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %a ; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 ; CHECK-NEXT: ; %bb.2: ; %bb7 ; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; CHECK-NEXT: s_getpc_b64 s[18:19] -; CHECK-NEXT: s_add_u32 s18, s18, global@rel32@lo+1948 -; CHECK-NEXT: s_addc_u32 s19, s19, global@rel32@hi+1956 -; CHECK-NEXT: v_mov_b32_e32 v5, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, s18 -; CHECK-NEXT: v_mov_b32_e32 v1, s19 ; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, eggs@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, eggs@rel32@hi+12 -; CHECK-NEXT: s_setpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, global@rel32@lo+1948 +; CHECK-NEXT: s_addc_u32 s17, s17, global@rel32@hi+1956 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s16 +; CHECK-NEXT: v_mov_b32_e32 v1, s17 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, eggs@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, eggs@rel32@hi+12 +; CHECK-NEXT: s_setpc_b64 s[18:19] ; CHECK-NEXT: .LBB0_3: ; %LeafBlock1 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb8 diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index 7201ffaf561662..01e58e6ecc7711 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -89,13 +89,15 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK-NEXT: s_mov_b32 s1, s3 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] ; CHECK-NEXT: v_writelane_b32 v2, s0, 7 -; CHECK-NEXT: s_mov_b32 s4, s0 ; CHECK-NEXT: v_writelane_b32 v2, s1, 8 +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0x40140000 +; CHECK-NEXT: s_mov_b32 s4, s0 ; CHECK-NEXT: v_readlane_b32 s0, v2, 0 ; CHECK-NEXT: v_readlane_b32 s2, v2, 11 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] ; CHECK-NEXT: s_add_i32 s2, s2, s0 ; CHECK-NEXT: v_writelane_b32 v2, s2, 11 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] ; CHECK-NEXT: v_readlane_b32 s0, v2, 11 ; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 012b3f976734de..63c241ef9d1b0a 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -818,9 +818,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: flat_load_dwordx4 v[6:9], v[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1 @@ -855,51 +855,51 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 ; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 -; GCN-NEXT: v_mul_hi_u32 v11, v7, v11 -; GCN-NEXT: v_mul_hi_u32 v12, v8, v12 -; GCN-NEXT: v_mul_hi_u32 v13, v9, v13 +; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 +; GCN-NEXT: v_mul_hi_u32 v11, v5, v11 +; GCN-NEXT: v_mul_hi_u32 v12, v6, v12 +; GCN-NEXT: v_mul_hi_u32 v13, v7, v13 ; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 ; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 ; GCN-NEXT: v_mul_lo_u32 v20, v13, v3 -; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14 -; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 -; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18 -; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v20 +; GCN-NEXT: v_sub_u32_e32 v4, vcc, v4, v14 +; GCN-NEXT: v_sub_u32_e32 v5, vcc, v5, v16 +; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v18 +; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v20 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 ; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12 ; GCN-NEXT: v_add_u32_e32 v21, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 -; GCN-NEXT: v_sub_u32_e32 v14, vcc, v6, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 +; GCN-NEXT: v_sub_u32_e32 v14, vcc, v4, v0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] -; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1 +; GCN-NEXT: v_sub_u32_e32 v15, vcc, v5, v1 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_sub_u32_e32 v16, vcc, v8, v2 +; GCN-NEXT: v_sub_u32_e32 v16, vcc, v6, v2 ; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5] -; GCN-NEXT: v_sub_u32_e32 v17, vcc, v9, v3 +; GCN-NEXT: v_sub_u32_e32 v17, vcc, v7, v3 ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[0:1] ; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3] ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[4:5] ; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[6:7] ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 ; 
GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc -; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: s_endpgm ; ; GFX1030-LABEL: udiv_v4i32: @@ -1848,20 +1848,20 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -1890,25 +1890,25 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_udiv_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -1937,7 +1937,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: v_udiv_i24: diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index e23f3cfad89bc8..1e78ba52419e0c 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -129,59 +129,59 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 ; 
GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[6:7], s[8:9] ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 ; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s14, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[16:17] -; GCN-IR-NEXT: s_and_b64 s[8:9], s[16:17], exec -; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 -; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 +; GCN-IR-NEXT: s_min_u32 s12, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s8, s10, s12 +; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[8:9], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 63 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[14:15], s[16:17] +; GCN-IR-NEXT: s_and_b64 s[14:15], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s15, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s14, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 +; GCN-IR-NEXT: s_add_u32 s14, s8, 1 +; GCN-IR-NEXT: s_addc_u32 s15, s9, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[14:15], 0 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16 -; GCN-IR-NEXT: s_add_u32 s15, s4, -1 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s14 +; GCN-IR-NEXT: s_add_u32 s13, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s16, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] -; GCN-IR-NEXT: s_add_u32 s2, s2, s14 +; GCN-IR-NEXT: s_add_u32 s2, s2, s12 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 ; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s15, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s16, s13 +; GCN-IR-NEXT: s_sub_u32 s6, s13, s14 +; GCN-IR-NEXT: s_subb_u32 s6, s16, s15 ; GCN-IR-NEXT: s_ashr_i32 s10, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_and_b32 s6, s10, 1 ; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[4:5] -; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 -; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 +; GCN-IR-NEXT: s_sub_u32 s14, s14, s10 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s11 ; GCN-IR-NEXT: s_add_u32 s2, s2, 1 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[2:3], 0 @@ -190,12 +190,12 
@@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[6:7], s[2:3] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s14 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s15 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, %y @@ -796,7 +796,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[4:5], s[6:7] ; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2 ; GCN-IR-NEXT: s_add_i32 s4, s4, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3 @@ -804,51 +804,51 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_flbit_i32_b32 s4, s8 ; GCN-IR-NEXT: s_add_i32 s4, s4, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s5, s9 -; GCN-IR-NEXT: s_min_u32 s14, s4, s5 -; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[6:7], s[16:17] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[16:17], exec -; GCN-IR-NEXT: s_cselect_b32 s7, 0, s9 -; GCN-IR-NEXT: s_cselect_b32 s6, 0, s8 +; GCN-IR-NEXT: s_min_u32 s12, s4, s5 +; GCN-IR-NEXT: s_sub_u32 s6, s10, s12 +; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[6:7], 63 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[14:15], s[16:17] +; GCN-IR-NEXT: s_and_b64 s[14:15], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s15, 0, s9 +; GCN-IR-NEXT: s_cselect_b32 s14, 0, s8 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s12 +; GCN-IR-NEXT: s_add_u32 s14, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s15, s7, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[14:15], 0 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s6 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s16 -; GCN-IR-NEXT: s_add_u32 s15, s2, -1 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[8:9], s14 +; GCN-IR-NEXT: s_add_u32 s13, s2, -1 ; GCN-IR-NEXT: s_addc_u32 s16, s3, -1 ; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] -; GCN-IR-NEXT: s_add_u32 s8, s4, s14 +; GCN-IR-NEXT: s_add_u32 s8, s4, s12 ; GCN-IR-NEXT: s_addc_u32 s9, s5, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 
s[14:15], s[14:15], 1 ; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s4, s15, s12 -; GCN-IR-NEXT: s_subb_u32 s4, s16, s13 +; GCN-IR-NEXT: s_sub_u32 s4, s13, s14 +; GCN-IR-NEXT: s_subb_u32 s4, s16, s15 ; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_and_b32 s4, s10, 1 ; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 -; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 +; GCN-IR-NEXT: s_sub_u32 s14, s14, s10 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s11 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 0 @@ -857,13 +857,13 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow4 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB7_5: ; %udiv-end ; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s15 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GCN-IR-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -986,32 +986,32 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_min_u32 s8, s8, s9 -; GCN-IR-NEXT: s_add_u32 s10, s8, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s11, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 63 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[12:13], exec -; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 +; GCN-IR-NEXT: s_add_i32 s6, s6, 32 +; GCN-IR-NEXT: s_min_u32 s8, s6, s7 +; GCN-IR-NEXT: s_add_u32 s6, s8, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s7, 0, -1 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[6:7], 63 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[10:11], s[12:13] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s10, 0, 24 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] -; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s9, 63, s10 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s9 +; GCN-IR-NEXT: s_add_u32 s10, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], 
s[10:11], 0 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s6 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s12 +; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10 ; GCN-IR-NEXT: s_add_u32 s14, s2, -1 ; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 ; GCN-IR-NEXT: s_sub_u32 s8, 58, s8 @@ -1041,12 +1041,12 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 ; GCN-IR-NEXT: .LBB8_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB8_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 24, %x @@ -1435,62 +1435,62 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_min_u32 s10, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s8, 59, s10 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 +; GCN-IR-NEXT: s_min_u32 s8, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s6, 59, s8 +; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3 -; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 63 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s8 +; GCN-IR-NEXT: s_add_u32 s10, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s12 -; GCN-IR-NEXT: s_add_u32 s2, s10, 0xffffffc4 +; GCN-IR-NEXT: s_lshr_b64 s[10:11], s[2:3], s10 +; GCN-IR-NEXT: s_add_u32 s2, s8, 0xffffffc4 ; GCN-IR-NEXT: s_addc_u32 s3, 0, -1 -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 
1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s4, 23, s8 -; GCN-IR-NEXT: s_subb_u32 s4, 0, s9 -; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 -; GCN-IR-NEXT: s_and_b32 s4, s10, 1 -; GCN-IR-NEXT: s_and_b32 s10, s10, 24 -; GCN-IR-NEXT: s_sub_u32 s8, s8, s10 -; GCN-IR-NEXT: s_subb_u32 s9, s9, 0 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s4, 23, s10 +; GCN-IR-NEXT: s_subb_u32 s4, 0, s11 +; GCN-IR-NEXT: s_ashr_i32 s8, s4, 31 +; GCN-IR-NEXT: s_and_b32 s4, s8, 1 +; GCN-IR-NEXT: s_and_b32 s8, s8, 24 +; GCN-IR-NEXT: s_sub_u32 s10, s10, s8 +; GCN-IR-NEXT: s_subb_u32 s11, s11, 0 ; GCN-IR-NEXT: s_add_u32 s2, s2, 1 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_3 ; GCN-IR-NEXT: .LBB11_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB11_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, 24 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index f68d14a32b929a..3cf34c13c248e2 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -135,11 +135,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 ; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s14, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_min_u32 s12, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s14, s10, s12 +; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[16:17] ; GCN-IR-NEXT: s_and_b64 s[8:9], s[16:17], exec ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 @@ -150,41 +150,41 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: s_add_u32 s16, s14, 1 +; GCN-IR-NEXT: s_addc_u32 s17, s15, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 +; GCN-IR-NEXT: s_sub_i32 s13, 63, s14 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s13 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s16 ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_add_u32 s10, s6, s14 +; 
GCN-IR-NEXT: s_add_u32 s10, s6, s12 ; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 +; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 ; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s6, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5] -; GCN-IR-NEXT: s_sub_u32 s12, s12, s14 -; GCN-IR-NEXT: s_subb_u32 s13, s13, s15 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s6, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s13, s12 +; GCN-IR-NEXT: s_and_b32 s6, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s13 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 @@ -1179,14 +1179,14 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 0xffffffd0, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 @@ -1194,9 +1194,9 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 @@ -1245,14 +1245,14 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 -; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v6 ; GCN-IR-NEXT: .LBB8_6: 
; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v3 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll index 9dcd3a66a16dbf..d1df6a1c5233ab 100644 --- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll +++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll @@ -134,7 +134,7 @@ l2: ; any of the v_cmp source operands. ; GCN-LABEL: check_saveexec_overwrites_vcmp_source: -; GCN: .LBB7_3: ; %then +; GCN: %bb.1: ; %then ; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}} ; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]] ; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index b8d18f56b76023..036d7a7822733e 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -333,9 +333,9 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v25, 24, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 24, v7 ; GFX906-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v28, 8, v7 ; GFX906-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v6 @@ -368,17 +368,17 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v25, 24, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 24, v7 ; GFX906-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v28, 8, v7 ; GFX906-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v6 ; GFX906-NEXT: .LBB5_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: v_lshlrev_b16_e32 v28, 8, v28 +; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; GFX906-NEXT: v_or_b32_sdwa v28, v30, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v31 ; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v6, v6, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -409,12 +409,12 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v4, v0, 
v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; GFX906-NEXT: v_lshlrev_b16_e32 v28, 8, v28 ; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v10 -; GFX906-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v5, v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] ; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1] offset:16 @@ -446,1540 +446,1538 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_add_u32 s8, s8, s3 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX906-NEXT: s_addc_u32 s9, s9, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[18:21], v2, s[4:5] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[6:9], v2, s[4:5] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[10:13], v2, s[4:5] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[3:6], v2, s[4:5] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[7:10], v2, s[4:5] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[18:21], v2, s[4:5] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[14:17], v2, s[4:5] offset:192 -; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GFX906-NEXT: 
buffer_store_dword v3, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v9 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v9 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v6 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v6 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v13 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v13 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v13 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v12 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v12 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v11 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v11 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v10 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v10 
-; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v10 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v17 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v17 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v16 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v16 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v15 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v15 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v14 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v14 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v14 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[18:21], v2, s[4:5] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[22:25], v2, s[4:5] offset:160 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v18 -; GFX906-NEXT: buffer_store_dword v3, 
off, s[8:11], 0 offset:256 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v25 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v25 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v25 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v24 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v24 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v23 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v23 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v23 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v22 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v22 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v22 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[26:29], v2, s[4:5] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[30:33], v2, s[4:5] offset:128 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v29 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v29 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v29 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v28 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v28 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v27 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v27 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v27 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v26 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v26 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v26 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v33 -; 
GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v33 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v32 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v32 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v32 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v31 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v31 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v30 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v30 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v30 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[34:37], v2, s[4:5] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[38:41], v2, s[4:5] offset:96 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v37 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v37 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v37 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v36 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v36 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v36 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v35 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v35 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v34 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v34 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v34 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v41 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v41 -; 
GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v41 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v40 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v40 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v39 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v39 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v39 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v38 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v38 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v38 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[42:45], v2, s[4:5] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[46:49], v2, s[4:5] offset:64 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v45 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v45 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v45 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v44 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v44 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v44 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v43 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v43 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v43 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v42 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v42 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v42 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v49 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v49 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v49 -; 
GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v48 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v48 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v48 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v47 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v47 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v47 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v46 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v46 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v46 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[50:53], v2, s[4:5] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[54:57], v2, s[4:5] offset:32 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v53 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v53 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v53 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v52 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v52 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v52 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v51 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v51 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v51 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v50 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v50 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v57 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v57 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v57 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v56 -; 
GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v56 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v55 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v55 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v54 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v54 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v54 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[58:61], v2, s[4:5] offset:16 ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[2:5], v2, s[4:5] -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v61 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v61 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v61 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v60 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v60 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v60 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v59 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v59 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v58 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v58 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v58 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v5 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v5 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v4 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill -; GFX906-NEXT: 
v_lshrrev_b32_e32 v62, 16, v4 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v4 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 24, v2 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; GFX906-NEXT: buffer_store_dword v63, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v3 -; GFX906-NEXT: buffer_store_dword v63, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB6_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: v_lshlrev_b64 v[2:3], 3, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v0, s7 -; GFX906-NEXT: v_add_co_u32_e32 v2, vcc, s6, v2 -; GFX906-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX906-NEXT: global_load_dwordx4 v[18:21], v[2:3], off offset:240 -; GFX906-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:224 -; GFX906-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:208 -; GFX906-NEXT: global_load_dwordx4 v[14:17], v[2:3], off offset:192 -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(0) 
-; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, 
v8 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 
4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[18:21], v[2:3], off offset:176 -; GFX906-NEXT: global_load_dwordx4 v[22:25], v[2:3], off offset:160 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[18:21], v2, s[4:5] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[22:25], v2, s[4:5] offset:160 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:260 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill +; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[26:29], v[2:3], off offset:144 -; GFX906-NEXT: global_load_dwordx4 v[30:33], v[2:3], off offset:128 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[26:29], v2, s[4:5] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[30:33], v2, s[4:5] offset:128 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX906-NEXT: buffer_store_dword v0, 
off, s[8:11], 0 offset:344 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[34:37], v[2:3], off offset:112 -; GFX906-NEXT: global_load_dwordx4 v[38:41], v[2:3], off offset:96 -; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[34:37], v2, s[4:5] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[38:41], v2, s[4:5] offset:96 +; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, 
off, s[8:11], 0 offset:444 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill +; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[42:45], v[2:3], off offset:80 -; GFX906-NEXT: global_load_dwordx4 v[46:49], v[2:3], off offset:64 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[42:45], v2, s[4:5] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[46:49], v2, s[4:5] offset:64 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:524 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46 -; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[50:53], v[2:3], off offset:48 -; GFX906-NEXT: global_load_dwordx4 v[54:57], v[2:3], off offset:32 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[50:53], v2, s[4:5] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[54:57], v2, s[4:5] offset:32 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v57 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v57 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 
0 offset:688 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v57 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[58:61], v[2:3], off offset:16 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[58:61], v2, s[4:5] offset:16 ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[2:5], v[2:3], off +; GFX906-NEXT: global_load_dwordx4 v[2:5], v2, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v61 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v61 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v61 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v60 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v60 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded 
Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v60 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v59 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v59 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v59 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v58 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v58 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v58 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB6_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: v_mov_b32_e32 v3, s7 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX906-NEXT: v_add_co_u32_e32 v2, vcc, s6, v0 +; GFX906-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc +; GFX906-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:240 +; GFX906-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:224 +; GFX906-NEXT: global_load_dwordx4 v[18:21], v[2:3], off offset:208 +; GFX906-NEXT: global_load_dwordx4 v[14:17], v[2:3], off offset:192 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 
v0, 24, v6 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v3 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v3 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill -; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v63 -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v62 -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 
offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62 -; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill +; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 
offset:248 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[18:21], v[2:3], off offset:176 +; GFX906-NEXT: global_load_dwordx4 v[22:25], v[2:3], off offset:160 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[26:29], v[2:3], off offset:144 +; GFX906-NEXT: global_load_dwordx4 
v[30:33], v[2:3], off offset:128 ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[34:37], v[2:3], off offset:112 +; GFX906-NEXT: global_load_dwordx4 v[38:41], v[2:3], off offset:96 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37 +; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[42:45], v[2:3], off offset:80 +; GFX906-NEXT: global_load_dwordx4 v[46:49], v[2:3], off offset:64 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[50:53], v[2:3], off offset:48 +; GFX906-NEXT: global_load_dwordx4 v[54:57], v[2:3], off offset:32 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53 +; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v57 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v57 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v57 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[58:61], v[2:3], off offset:16 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: global_load_dwordx4 v[2:5], v[2:3], off +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v61 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v61 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v61 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v60 +; 
GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v60 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v60 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v59 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v59 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v59 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v58 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v58 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v58 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX906-NEXT: .LBB6_2: ; %bb.2 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; GFX906-NEXT: v_lshlrev_b16_e32 v12, 8, v12 +; GFX906-NEXT: v_or_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v10 +; GFX906-NEXT: v_lshlrev_b16_e32 v7, 8, v8 +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v62 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, 
s[8:11], 0 offset:736 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: global_store_dwordx4 v6, v[1:4], s[2:3] +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v60, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v61, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa 
v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58 -; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:16 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:16 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload +; 
GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v56, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54 -; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v57, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:32 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:32 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:644 ; 4-byte Folded 
Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50 -; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:48 -; 
GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:48 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, 
s[8:11], 0 offset:568 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46 -; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:64 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:64 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, 
off, s[8:11], 0 offset:584 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v44, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v45, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42 -; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: 
buffer_load_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v45, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:80 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:80 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, 
v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38 -; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v41, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:96 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 
8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:96 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:112 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:112 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:436 ; 4-byte 
Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v32, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 -; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 
offset:360 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:128 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:128 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v29, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 -; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:144 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: 
buffer_load_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:144 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v25, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:268 ; 
4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22 -; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:160 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:160 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:268 ; 4-byte Folded 
Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v21, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18 -; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:176 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v15, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:176 +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword 
v5, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v14, 8, v14 -; GFX906-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:192 +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(5) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v14, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:192 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(4) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: 
v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; GFX906-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:208 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:208 +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(4) +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(5) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: 
buffer_load_dword v4, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:224 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:224 +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(5) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: 
buffer_load_dword v5, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:240 +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] offset:240 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index cadc23414dcac1..0bfcbd8022bc1e 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1532,33 +1532,33 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX1032-NEXT: ; implicit-def: $sgpr1 ; GFX1032-NEXT: ; implicit-def: $sgpr2 +; GFX1032-NEXT: ; implicit-def: $sgpr1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_subrev_nc_u32_e32 v0, s0, v0 ; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: s_branch .LBB27_2 ; GFX1032-NEXT: .LBB27_1: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB27_2 Depth=1 -; GFX1032-NEXT: s_xor_b32 s3, s1, -1 -; GFX1032-NEXT: s_add_i32 s2, s2, 1 +; GFX1032-NEXT: s_xor_b32 s3, s2, -1 +; GFX1032-NEXT: s_add_i32 s1, s1, 1 ; GFX1032-NEXT: s_and_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_or_b32 s0, s3, s0 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB27_4 ; GFX1032-NEXT: .LBB27_2: ; %bb1 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_or_b32 s1, s1, exec_lo -; GFX1032-NEXT: s_cmp_gt_i32 s2, -1 +; GFX1032-NEXT: s_or_b32 s2, s2, exec_lo +; GFX1032-NEXT: s_cmp_gt_i32 s1, -1 ; GFX1032-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032-NEXT: ; %bb.3: ; %bb4 ; GFX1032-NEXT: ; in Loop: Header=BB27_2 Depth=1 ; GFX1032-NEXT: global_load_dword v1, v[0:1], off glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX1032-NEXT: s_andn2_b32 s2, s2, exec_lo ; GFX1032-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX1032-NEXT: s_or_b32 s1, s1, s3 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 ; GFX1032-NEXT: s_branch .LBB27_1 ; GFX1032-NEXT: .LBB27_4: ; %bb9 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 76efff1152f788..ffa5508bce0f51 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -9,8 +9,8 @@ define amdgpu_ps float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-NEXT: s_branch .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_and_b32 s2, exec_lo, s3 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GCN-NEXT: s_and_b32 s2, exec_lo, s4 ; GCN-NEXT: s_or_b32 s0, s2, s0 ; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: s_cbranch_execz .LBB0_8 @@ -35,13 +35,13 @@ define amdgpu_ps float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-NEXT: ; %bb.6: ; 
%Flow1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GCN-NEXT: s_mov_b32 s3, -1 -; GCN-NEXT: s_and_saveexec_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s4, -1 +; GCN-NEXT: s_and_saveexec_b32 s3, s2 ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.7: ; %latch ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v0 -; GCN-NEXT: s_orn2_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: s_orn2_b32 s4, vcc_lo, exec_lo ; GCN-NEXT: s_branch .LBB0_1 ; GCN-NEXT: .LBB0_8: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -84,8 +84,8 @@ define amdgpu_ps float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_and_b32 s2, exec_lo, s3 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GCN-NEXT: s_and_b32 s2, exec_lo, s4 ; GCN-NEXT: s_or_b32 s0, s2, s0 ; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: s_cbranch_execz .LBB1_8 @@ -112,13 +112,13 @@ define amdgpu_ps float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-NEXT: ; %bb.6: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GCN-NEXT: s_mov_b32 s3, -1 -; GCN-NEXT: s_and_saveexec_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s4, -1 +; GCN-NEXT: s_and_saveexec_b32 s3, s2 ; GCN-NEXT: s_cbranch_execz .LBB1_1 ; GCN-NEXT: ; %bb.7: ; %latch ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v0 -; GCN-NEXT: s_orn2_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: s_orn2_b32 s4, vcc_lo, exec_lo ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_8: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 95f947cbca14f0..5e0c95fe9878a4 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1968,36 +1968,36 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX10-W32: ; %bb.0: ; %entry ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3 +; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10-W32-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_branch .LBB35_2 ; GFX10-W32-NEXT: .p2align 6 ; GFX10-W32-NEXT: .LBB35_1: ; %body ; GFX10-W32-NEXT: ; in Loop: Header=BB35_2 Depth=1 -; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX10-W32-NEXT: s_cbranch_execz .LBB35_4 ; GFX10-W32-NEXT: .LBB35_2: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3 -; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7 ; GFX10-W32-NEXT: 
s_cbranch_vccz .LBB35_1 ; GFX10-W32-NEXT: ; %bb.3: -; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8 ; GFX10-W32-NEXT: .LBB35_4: ; %break ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6 -; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7 ; GFX10-W32-NEXT: ; return to shader part epilog entry: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0) @@ -2253,12 +2253,13 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF -; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2272,23 +2273,23 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_wwm_within_wqm: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF -; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2302,11 +2303,10 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %cmp = icmp eq i32 %z, 0 @@ -2789,12 +2789,13 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64: ; %bb.0: ; %main_body ; 
GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-W64-NEXT: s_cbranch_execz .LBB49_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF -; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2808,23 +2809,23 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wwm_within_wqm: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX10-W32-NEXT: s_cbranch_execz .LBB49_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF -; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2838,11 +2839,10 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %cmp = icmp eq i32 %z, 0 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll b/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll index ffa808b459226a..e4a1c843957b3c 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll @@ -7,38 +7,38 @@ define dso_local arm_aapcscc void @complex_dot_prod(ptr nocapture readonly %pSrc ; CHECK-LLC-LABEL: complex_dot_prod: ; CHECK-LLC: @ %bb.0: @ %entry ; CHECK-LLC-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-LLC-NEXT: ldr r5, [r0] -; CHECK-LLC-NEXT: ldr r7, [r1] -; CHECK-LLC-NEXT: ldr.w r10, [r0, #4] -; CHECK-LLC-NEXT: ldr.w r8, [r0, #8] -; CHECK-LLC-NEXT: ldr.w r12, [r0, #12] -; CHECK-LLC-NEXT: ldr r4, [r1, #4] -; CHECK-LLC-NEXT: ldr.w r9, [r1, #8] -; CHECK-LLC-NEXT: ldr.w lr, [r1, #12] +; CHECK-LLC-NEXT: ldr.w r12, [r0] +; CHECK-LLC-NEXT: ldr r5, [r1] +; CHECK-LLC-NEXT: ldr.w lr, [r0, #4] +; CHECK-LLC-NEXT: ldr.w r10, [r0, 
#8] +; CHECK-LLC-NEXT: ldr.w r8, [r0, #12] +; CHECK-LLC-NEXT: ldr r6, [r1, #4] +; CHECK-LLC-NEXT: ldr r7, [r1, #8] +; CHECK-LLC-NEXT: ldr.w r9, [r1, #12] ; CHECK-LLC-NEXT: movs r0, #0 ; CHECK-LLC-NEXT: movs r1, #0 -; CHECK-LLC-NEXT: smlaldx r0, r1, r5, r7 -; CHECK-LLC-NEXT: smulbb r6, r7, r5 -; CHECK-LLC-NEXT: smultt r5, r7, r5 -; CHECK-LLC-NEXT: asr.w r11, r6, #31 -; CHECK-LLC-NEXT: subs r6, r6, r5 +; CHECK-LLC-NEXT: smlaldx r0, r1, r12, r5 +; CHECK-LLC-NEXT: smulbb r4, r5, r12 +; CHECK-LLC-NEXT: smultt r5, r5, r12 +; CHECK-LLC-NEXT: asr.w r11, r4, #31 +; CHECK-LLC-NEXT: subs r4, r4, r5 ; CHECK-LLC-NEXT: sbc.w r5, r11, r5, asr #31 -; CHECK-LLC-NEXT: smlaldx r0, r1, r10, r4 -; CHECK-LLC-NEXT: smlalbb r6, r5, r4, r10 -; CHECK-LLC-NEXT: smultt r4, r4, r10 -; CHECK-LLC-NEXT: subs r6, r6, r4 -; CHECK-LLC-NEXT: sbc.w r4, r5, r4, asr #31 -; CHECK-LLC-NEXT: smlalbb r6, r4, r9, r8 -; CHECK-LLC-NEXT: smultt r5, r9, r8 -; CHECK-LLC-NEXT: subs r6, r6, r5 -; CHECK-LLC-NEXT: sbc.w r4, r4, r5, asr #31 +; CHECK-LLC-NEXT: smlaldx r0, r1, lr, r6 +; CHECK-LLC-NEXT: smlalbb r4, r5, r6, lr +; CHECK-LLC-NEXT: smultt r6, r6, lr +; CHECK-LLC-NEXT: subs r4, r4, r6 +; CHECK-LLC-NEXT: sbc.w r6, r5, r6, asr #31 +; CHECK-LLC-NEXT: smlaldx r0, r1, r10, r7 +; CHECK-LLC-NEXT: smlalbb r4, r6, r7, r10 +; CHECK-LLC-NEXT: smultt r7, r7, r10 +; CHECK-LLC-NEXT: subs r5, r4, r7 +; CHECK-LLC-NEXT: sbc.w r7, r6, r7, asr #31 +; CHECK-LLC-NEXT: smlalbb r5, r7, r9, r8 +; CHECK-LLC-NEXT: smultt r6, r9, r8 ; CHECK-LLC-NEXT: smlaldx r0, r1, r8, r9 -; CHECK-LLC-NEXT: smlalbb r6, r4, lr, r12 -; CHECK-LLC-NEXT: smultt r7, lr, r12 -; CHECK-LLC-NEXT: smlaldx r0, r1, r12, lr -; CHECK-LLC-NEXT: subs r6, r6, r7 -; CHECK-LLC-NEXT: sbc.w r7, r4, r7, asr #31 -; CHECK-LLC-NEXT: lsrs r6, r6, #6 +; CHECK-LLC-NEXT: subs r5, r5, r6 +; CHECK-LLC-NEXT: sbc.w r7, r7, r6, asr #31 +; CHECK-LLC-NEXT: lsrs r6, r5, #6 ; CHECK-LLC-NEXT: lsrs r0, r0, #6 ; CHECK-LLC-NEXT: orr.w r7, r6, r7, lsl #26 ; CHECK-LLC-NEXT: orr.w r0, r0, r1, lsl #26 @@ -46,97 +46,98 @@ define dso_local arm_aapcscc void @complex_dot_prod(ptr nocapture readonly %pSrc ; CHECK-LLC-NEXT: str r0, [r3] ; CHECK-LLC-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-LCC: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} -; -; CHECK-OPT-LABEL: @complex_dot_prod( +; CHECK-OPT-LABEL: define dso_local arm_aapcscc void @complex_dot_prod( +; CHECK-OPT-SAME: ptr nocapture readonly [[PSRCA:%.*]], ptr nocapture readonly [[PSRCB:%.*]], ptr nocapture [[REALRESULT:%.*]], ptr nocapture [[IMAGRESULT:%.*]]) { ; CHECK-OPT-NEXT: entry: -; CHECK-OPT-NEXT: [[TMP1:%.*]] = load i32, ptr [[PSRCA:%.*]], align 2 -; CHECK-OPT-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; CHECK-OPT-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 -; CHECK-OPT-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 -; CHECK-OPT-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; CHECK-OPT-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[PSRCA]], align 2 +; CHECK-OPT-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i16 +; CHECK-OPT-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-OPT-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP0]], 16 +; CHECK-OPT-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; CHECK-OPT-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i32 ; CHECK-OPT-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i16, ptr [[PSRCA]], i32 2 -; CHECK-OPT-NEXT: [[TMP8:%.*]] = load i32, ptr [[PSRCB:%.*]], align 2 -; CHECK-OPT-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 -; CHECK-OPT-NEXT: [[TMP10:%.*]] = call i64 
@llvm.arm.smlaldx(i32 [[TMP1]], i32 [[TMP8]], i64 0) -; CHECK-OPT-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 -; CHECK-OPT-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 -; CHECK-OPT-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 -; CHECK-OPT-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-OPT-NEXT: [[TMP6:%.*]] = load i32, ptr [[PSRCB]], align 2 +; CHECK-OPT-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16 +; CHECK-OPT-NEXT: [[TMP8:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP0]], i32 [[TMP6]], i64 0) +; CHECK-OPT-NEXT: [[TMP9:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-OPT-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP6]], 16 +; CHECK-OPT-NEXT: [[TMP11:%.*]] = trunc i32 [[TMP10]] to i16 +; CHECK-OPT-NEXT: [[TMP12:%.*]] = sext i16 [[TMP11]] to i32 ; CHECK-OPT-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i16, ptr [[PSRCB]], i32 2 -; CHECK-OPT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], [[TMP3]] +; CHECK-OPT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], [[TMP2]] ; CHECK-OPT-NEXT: [[CONV5:%.*]] = sext i32 [[MUL]] to i64 -; CHECK-OPT-NEXT: [[MUL13:%.*]] = mul nsw i32 [[TMP14]], [[TMP6]] +; CHECK-OPT-NEXT: [[MUL13:%.*]] = mul nsw i32 [[TMP12]], [[TMP5]] ; CHECK-OPT-NEXT: [[CONV14:%.*]] = sext i32 [[MUL13]] to i64 ; CHECK-OPT-NEXT: [[SUB:%.*]] = sub nsw i64 [[CONV5]], [[CONV14]] -; CHECK-OPT-NEXT: [[TMP16:%.*]] = load i32, ptr [[INCDEC_PTR1]], align 2 +; CHECK-OPT-NEXT: [[TMP13:%.*]] = load i32, ptr [[INCDEC_PTR1]], align 2 +; CHECK-OPT-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16 +; CHECK-OPT-NEXT: [[TMP15:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-OPT-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP13]], 16 ; CHECK-OPT-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 ; CHECK-OPT-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32 -; CHECK-OPT-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP16]], 16 -; CHECK-OPT-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16 -; CHECK-OPT-NEXT: [[TMP21:%.*]] = sext i16 [[TMP20]] to i32 ; CHECK-OPT-NEXT: [[INCDEC_PTR21:%.*]] = getelementptr inbounds i16, ptr [[PSRCA]], i32 4 -; CHECK-OPT-NEXT: [[TMP23:%.*]] = load i32, ptr [[INCDEC_PTR3]], align 2 +; CHECK-OPT-NEXT: [[TMP19:%.*]] = load i32, ptr [[INCDEC_PTR3]], align 2 +; CHECK-OPT-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16 +; CHECK-OPT-NEXT: [[TMP21:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP13]], i32 [[TMP19]], i64 [[TMP8]]) +; CHECK-OPT-NEXT: [[TMP22:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-OPT-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP19]], 16 ; CHECK-OPT-NEXT: [[TMP24:%.*]] = trunc i32 [[TMP23]] to i16 -; CHECK-OPT-NEXT: [[TMP25:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP16]], i32 [[TMP23]], i64 [[TMP10]]) -; CHECK-OPT-NEXT: [[TMP26:%.*]] = sext i16 [[TMP24]] to i32 -; CHECK-OPT-NEXT: [[TMP27:%.*]] = lshr i32 [[TMP23]], 16 -; CHECK-OPT-NEXT: [[TMP28:%.*]] = trunc i32 [[TMP27]] to i16 -; CHECK-OPT-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32 +; CHECK-OPT-NEXT: [[TMP25:%.*]] = sext i16 [[TMP24]] to i32 ; CHECK-OPT-NEXT: [[INCDEC_PTR23:%.*]] = getelementptr inbounds i16, ptr [[PSRCB]], i32 4 -; CHECK-OPT-NEXT: [[MUL26:%.*]] = mul nsw i32 [[TMP26]], [[TMP18]] +; CHECK-OPT-NEXT: [[MUL26:%.*]] = mul nsw i32 [[TMP22]], [[TMP15]] ; CHECK-OPT-NEXT: [[CONV27:%.*]] = sext i32 [[MUL26]] to i64 ; CHECK-OPT-NEXT: [[ADD28:%.*]] = add nsw i64 [[SUB]], [[CONV27]] -; CHECK-OPT-NEXT: [[MUL36:%.*]] = mul nsw i32 [[TMP29]], [[TMP21]] +; CHECK-OPT-NEXT: [[MUL36:%.*]] = mul nsw i32 [[TMP25]], [[TMP18]] ; CHECK-OPT-NEXT: [[CONV37:%.*]] = sext i32 [[MUL36]] to i64 ; CHECK-OPT-NEXT: [[SUB38:%.*]] = sub nsw i64 [[ADD28]], [[CONV37]] -; 
CHECK-OPT-NEXT: [[TMP31:%.*]] = load i32, ptr [[INCDEC_PTR21]], align 2 -; CHECK-OPT-NEXT: [[TMP32:%.*]] = trunc i32 [[TMP31]] to i16 -; CHECK-OPT-NEXT: [[TMP33:%.*]] = sext i16 [[TMP32]] to i32 -; CHECK-OPT-NEXT: [[TMP34:%.*]] = lshr i32 [[TMP31]], 16 -; CHECK-OPT-NEXT: [[TMP35:%.*]] = trunc i32 [[TMP34]] to i16 -; CHECK-OPT-NEXT: [[TMP36:%.*]] = sext i16 [[TMP35]] to i32 +; CHECK-OPT-NEXT: [[TMP26:%.*]] = load i32, ptr [[INCDEC_PTR21]], align 2 +; CHECK-OPT-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16 +; CHECK-OPT-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +; CHECK-OPT-NEXT: [[TMP29:%.*]] = lshr i32 [[TMP26]], 16 +; CHECK-OPT-NEXT: [[TMP30:%.*]] = trunc i32 [[TMP29]] to i16 +; CHECK-OPT-NEXT: [[TMP31:%.*]] = sext i16 [[TMP30]] to i32 ; CHECK-OPT-NEXT: [[INCDEC_PTR45:%.*]] = getelementptr inbounds i16, ptr [[PSRCA]], i32 6 -; CHECK-OPT-NEXT: [[TMP38:%.*]] = load i32, ptr [[INCDEC_PTR23]], align 2 -; CHECK-OPT-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 -; CHECK-OPT-NEXT: [[TMP40:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP31]], i32 [[TMP38]], i64 [[TMP25]]) -; CHECK-OPT-NEXT: [[TMP41:%.*]] = sext i16 [[TMP39]] to i32 -; CHECK-OPT-NEXT: [[TMP42:%.*]] = lshr i32 [[TMP38]], 16 -; CHECK-OPT-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 -; CHECK-OPT-NEXT: [[TMP44:%.*]] = sext i16 [[TMP43]] to i32 +; CHECK-OPT-NEXT: [[TMP32:%.*]] = load i32, ptr [[INCDEC_PTR23]], align 2 +; CHECK-OPT-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16 +; CHECK-OPT-NEXT: [[TMP34:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP26]], i32 [[TMP32]], i64 [[TMP21]]) +; CHECK-OPT-NEXT: [[TMP35:%.*]] = sext i16 [[TMP33]] to i32 +; CHECK-OPT-NEXT: [[TMP36:%.*]] = lshr i32 [[TMP32]], 16 +; CHECK-OPT-NEXT: [[TMP37:%.*]] = trunc i32 [[TMP36]] to i16 +; CHECK-OPT-NEXT: [[TMP38:%.*]] = sext i16 [[TMP37]] to i32 ; CHECK-OPT-NEXT: [[INCDEC_PTR47:%.*]] = getelementptr inbounds i16, ptr [[PSRCB]], i32 6 -; CHECK-OPT-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP41]], [[TMP33]] +; CHECK-OPT-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP35]], [[TMP28]] ; CHECK-OPT-NEXT: [[CONV51:%.*]] = sext i32 [[MUL50]] to i64 ; CHECK-OPT-NEXT: [[ADD52:%.*]] = add nsw i64 [[SUB38]], [[CONV51]] -; CHECK-OPT-NEXT: [[MUL60:%.*]] = mul nsw i32 [[TMP44]], [[TMP36]] +; CHECK-OPT-NEXT: [[MUL60:%.*]] = mul nsw i32 [[TMP38]], [[TMP31]] ; CHECK-OPT-NEXT: [[CONV61:%.*]] = sext i32 [[MUL60]] to i64 ; CHECK-OPT-NEXT: [[SUB62:%.*]] = sub nsw i64 [[ADD52]], [[CONV61]] -; CHECK-OPT-NEXT: [[TMP46:%.*]] = load i32, ptr [[INCDEC_PTR45]], align 2 -; CHECK-OPT-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 -; CHECK-OPT-NEXT: [[TMP48:%.*]] = sext i16 [[TMP47]] to i32 -; CHECK-OPT-NEXT: [[TMP49:%.*]] = lshr i32 [[TMP46]], 16 +; CHECK-OPT-NEXT: [[TMP39:%.*]] = load i32, ptr [[INCDEC_PTR45]], align 2 +; CHECK-OPT-NEXT: [[TMP40:%.*]] = trunc i32 [[TMP39]] to i16 +; CHECK-OPT-NEXT: [[TMP41:%.*]] = sext i16 [[TMP40]] to i32 +; CHECK-OPT-NEXT: [[TMP42:%.*]] = lshr i32 [[TMP39]], 16 +; CHECK-OPT-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 +; CHECK-OPT-NEXT: [[TMP44:%.*]] = sext i16 [[TMP43]] to i32 +; CHECK-OPT-NEXT: [[TMP45:%.*]] = load i32, ptr [[INCDEC_PTR47]], align 2 +; CHECK-OPT-NEXT: [[TMP46:%.*]] = trunc i32 [[TMP45]] to i16 +; CHECK-OPT-NEXT: [[TMP47:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP39]], i32 [[TMP45]], i64 [[TMP34]]) +; CHECK-OPT-NEXT: [[TMP48:%.*]] = sext i16 [[TMP46]] to i32 +; CHECK-OPT-NEXT: [[TMP49:%.*]] = lshr i32 [[TMP45]], 16 ; CHECK-OPT-NEXT: [[TMP50:%.*]] = trunc i32 [[TMP49]] to i16 ; CHECK-OPT-NEXT: [[TMP51:%.*]] = sext i16 [[TMP50]] to 
i32 -; CHECK-OPT-NEXT: [[TMP53:%.*]] = load i32, ptr [[INCDEC_PTR47]], align 2 -; CHECK-OPT-NEXT: [[TMP54:%.*]] = trunc i32 [[TMP53]] to i16 -; CHECK-OPT-NEXT: [[TMP55:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP46]], i32 [[TMP53]], i64 [[TMP40]]) -; CHECK-OPT-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 -; CHECK-OPT-NEXT: [[TMP57:%.*]] = lshr i32 [[TMP53]], 16 -; CHECK-OPT-NEXT: [[TMP58:%.*]] = trunc i32 [[TMP57]] to i16 -; CHECK-OPT-NEXT: [[TMP59:%.*]] = sext i16 [[TMP58]] to i32 -; CHECK-OPT-NEXT: [[MUL74:%.*]] = mul nsw i32 [[TMP56]], [[TMP48]] +; CHECK-OPT-NEXT: [[MUL74:%.*]] = mul nsw i32 [[TMP48]], [[TMP41]] ; CHECK-OPT-NEXT: [[CONV75:%.*]] = sext i32 [[MUL74]] to i64 ; CHECK-OPT-NEXT: [[ADD76:%.*]] = add nsw i64 [[SUB62]], [[CONV75]] -; CHECK-OPT-NEXT: [[MUL84:%.*]] = mul nsw i32 [[TMP59]], [[TMP51]] +; CHECK-OPT-NEXT: [[MUL84:%.*]] = mul nsw i32 [[TMP51]], [[TMP44]] ; CHECK-OPT-NEXT: [[CONV85:%.*]] = sext i32 [[MUL84]] to i64 ; CHECK-OPT-NEXT: [[SUB86:%.*]] = sub nsw i64 [[ADD76]], [[CONV85]] -; CHECK-OPT-NEXT: [[TMP60:%.*]] = lshr i64 [[SUB86]], 6 -; CHECK-OPT-NEXT: [[CONV92:%.*]] = trunc i64 [[TMP60]] to i32 -; CHECK-OPT-NEXT: store i32 [[CONV92]], ptr [[REALRESULT:%.*]], align 4 -; CHECK-OPT-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP55]], 6 -; CHECK-OPT-NEXT: [[CONV94:%.*]] = trunc i64 [[TMP61]] to i32 -; CHECK-OPT-NEXT: store i32 [[CONV94]], ptr [[IMAGRESULT:%.*]], align 4 +; CHECK-OPT-NEXT: [[TMP52:%.*]] = lshr i64 [[SUB86]], 6 +; CHECK-OPT-NEXT: [[CONV92:%.*]] = trunc i64 [[TMP52]] to i32 +; CHECK-OPT-NEXT: store i32 [[CONV92]], ptr [[REALRESULT]], align 4 +; CHECK-OPT-NEXT: [[TMP53:%.*]] = lshr i64 [[TMP47]], 6 +; CHECK-OPT-NEXT: [[CONV94:%.*]] = trunc i64 [[TMP53]] to i32 +; CHECK-OPT-NEXT: store i32 [[CONV94]], ptr [[IMAGRESULT]], align 4 ; CHECK-OPT-NEXT: ret void +; entry: %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA, i32 1 %0 = load i16, ptr %pSrcA, align 2 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll index e45985136cf341..562e832d68b5a3 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -10,26 +10,26 @@ define i32 @add_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture reado ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-LE-NEXT: .save {r4, lr} ; CHECK-LE-NEXT: push {r4, lr} -; CHECK-LE-NEXT: sub.w lr, r3, #2 -; CHECK-LE-NEXT: subs r2, #2 +; CHECK-LE-NEXT: subs r1, r2, #2 +; CHECK-LE-NEXT: subs r3, #2 ; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: .LBB0_2: @ %for.body ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-LE-NEXT: ldr r3, [lr, #2]! +; CHECK-LE-NEXT: ldr lr, [r3, #2]! ; CHECK-LE-NEXT: subs r0, #1 -; CHECK-LE-NEXT: ldr r4, [r2, #2]! -; CHECK-LE-NEXT: sxtah r1, r1, r3 -; CHECK-LE-NEXT: smlad r12, r4, r3, r12 +; CHECK-LE-NEXT: ldr r4, [r1, #2]! 
+; CHECK-LE-NEXT: sxtah r2, r2, lr +; CHECK-LE-NEXT: smlad r12, r4, lr, r12 ; CHECK-LE-NEXT: bne .LBB0_2 ; CHECK-LE-NEXT: @ %bb.3: ; CHECK-LE-NEXT: pop.w {r4, lr} -; CHECK-LE-NEXT: add.w r0, r12, r1 +; CHECK-LE-NEXT: add.w r0, r12, r2 ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB0_4: ; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: add.w r0, r12, r1 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: add.w r0, r12, r2 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: add_user: @@ -39,29 +39,29 @@ define i32 @add_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture reado ; CHECK-BE-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-BE-NEXT: .save {r4, r5, r7, lr} ; CHECK-BE-NEXT: push {r4, r5, r7, lr} -; CHECK-BE-NEXT: subs r3, #2 -; CHECK-BE-NEXT: subs r2, #2 +; CHECK-BE-NEXT: subs r1, r3, #2 +; CHECK-BE-NEXT: subs r3, r2, #2 ; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: .LBB0_2: @ %for.body ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ldrsh lr, [r3, #2]! +; CHECK-BE-NEXT: ldrsh lr, [r1, #2]! ; CHECK-BE-NEXT: subs r0, #1 -; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! -; CHECK-BE-NEXT: add r1, lr -; CHECK-BE-NEXT: ldrsh.w r5, [r2, #2] +; CHECK-BE-NEXT: ldrsh r4, [r3, #2]! +; CHECK-BE-NEXT: add r2, lr +; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2] ; CHECK-BE-NEXT: smlabb r12, r4, lr, r12 -; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] +; CHECK-BE-NEXT: ldrsh.w r4, [r1, #2] ; CHECK-BE-NEXT: smlabb r12, r5, r4, r12 ; CHECK-BE-NEXT: bne .LBB0_2 ; CHECK-BE-NEXT: @ %bb.3: ; CHECK-BE-NEXT: pop.w {r4, r5, r7, lr} -; CHECK-BE-NEXT: add.w r0, r12, r1 +; CHECK-BE-NEXT: add.w r0, r12, r2 ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB0_4: ; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: add.w r0, r12, r1 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: add.w r0, r12, r2 ; CHECK-BE-NEXT: bx lr entry: %cmp24 = icmp sgt i32 %arg, 0 @@ -112,27 +112,27 @@ define i32 @mul_bottom_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocaptur ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-LE-NEXT: .save {r4, lr} ; CHECK-LE-NEXT: push {r4, lr} -; CHECK-LE-NEXT: sub.w lr, r3, #2 -; CHECK-LE-NEXT: subs r2, #2 +; CHECK-LE-NEXT: subs r1, r2, #2 +; CHECK-LE-NEXT: subs r3, #2 ; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: .LBB1_2: @ %for.body ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-LE-NEXT: ldr r3, [lr, #2]! +; CHECK-LE-NEXT: ldr lr, [r3, #2]! ; CHECK-LE-NEXT: subs r0, #1 -; CHECK-LE-NEXT: ldr r4, [r2, #2]! -; CHECK-LE-NEXT: smlad r12, r4, r3, r12 -; CHECK-LE-NEXT: sxth r3, r3 -; CHECK-LE-NEXT: mul r1, r3, r1 +; CHECK-LE-NEXT: ldr r4, [r1, #2]! 
+; CHECK-LE-NEXT: smlad r12, r4, lr, r12 +; CHECK-LE-NEXT: sxth.w r4, lr +; CHECK-LE-NEXT: mul r2, r4, r2 ; CHECK-LE-NEXT: bne .LBB1_2 ; CHECK-LE-NEXT: @ %bb.3: ; CHECK-LE-NEXT: pop.w {r4, lr} -; CHECK-LE-NEXT: add.w r0, r12, r1 +; CHECK-LE-NEXT: add.w r0, r12, r2 ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB1_4: ; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: add.w r0, r12, r1 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: add.w r0, r12, r2 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: mul_bottom_user: @@ -142,29 +142,29 @@ define i32 @mul_bottom_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocaptur ; CHECK-BE-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-BE-NEXT: .save {r4, r5, r7, lr} ; CHECK-BE-NEXT: push {r4, r5, r7, lr} -; CHECK-BE-NEXT: subs r3, #2 -; CHECK-BE-NEXT: subs r2, #2 +; CHECK-BE-NEXT: subs r1, r3, #2 +; CHECK-BE-NEXT: subs r3, r2, #2 ; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: .LBB1_2: @ %for.body ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ldrsh lr, [r3, #2]! +; CHECK-BE-NEXT: ldrsh lr, [r1, #2]! ; CHECK-BE-NEXT: subs r0, #1 -; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! -; CHECK-BE-NEXT: ldrsh.w r5, [r2, #2] -; CHECK-BE-NEXT: mul r1, lr, r1 +; CHECK-BE-NEXT: ldrsh r4, [r3, #2]! +; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2] +; CHECK-BE-NEXT: mul r2, lr, r2 ; CHECK-BE-NEXT: smlabb r12, r4, lr, r12 -; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] +; CHECK-BE-NEXT: ldrsh.w r4, [r1, #2] ; CHECK-BE-NEXT: smlabb r12, r5, r4, r12 ; CHECK-BE-NEXT: bne .LBB1_2 ; CHECK-BE-NEXT: @ %bb.3: ; CHECK-BE-NEXT: pop.w {r4, r5, r7, lr} -; CHECK-BE-NEXT: add.w r0, r12, r1 +; CHECK-BE-NEXT: add.w r0, r12, r2 ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB1_4: ; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: add.w r0, r12, r1 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: add.w r0, r12, r2 ; CHECK-BE-NEXT: bx lr entry: %cmp24 = icmp sgt i32 %arg, 0 @@ -215,27 +215,27 @@ define i32 @mul_top_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture r ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-LE-NEXT: .save {r4, lr} ; CHECK-LE-NEXT: push {r4, lr} +; CHECK-LE-NEXT: subs r1, r2, #2 ; CHECK-LE-NEXT: subs r3, #2 -; CHECK-LE-NEXT: subs r2, #2 ; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: .LBB2_2: @ %for.body ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-LE-NEXT: ldr lr, [r3, #2]! ; CHECK-LE-NEXT: subs r0, #1 -; CHECK-LE-NEXT: ldr r4, [r2, #2]! +; CHECK-LE-NEXT: ldr r4, [r1, #2]! 
; CHECK-LE-NEXT: smlad r12, r4, lr, r12 ; CHECK-LE-NEXT: asr.w r4, r4, #16 -; CHECK-LE-NEXT: mul r1, r4, r1 +; CHECK-LE-NEXT: mul r2, r4, r2 ; CHECK-LE-NEXT: bne .LBB2_2 ; CHECK-LE-NEXT: @ %bb.3: ; CHECK-LE-NEXT: pop.w {r4, lr} -; CHECK-LE-NEXT: add.w r0, r12, r1 +; CHECK-LE-NEXT: add.w r0, r12, r2 ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB2_4: ; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: add.w r0, r12, r1 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: add.w r0, r12, r2 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: mul_top_user: @@ -245,29 +245,29 @@ define i32 @mul_top_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture r ; CHECK-BE-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-BE-NEXT: .save {r4, lr} ; CHECK-BE-NEXT: push {r4, lr} -; CHECK-BE-NEXT: subs r3, #2 -; CHECK-BE-NEXT: subs r2, #2 +; CHECK-BE-NEXT: subs r1, r3, #2 +; CHECK-BE-NEXT: subs r3, r2, #2 ; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: .LBB2_2: @ %for.body ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ldrsh lr, [r3, #2]! +; CHECK-BE-NEXT: ldrsh lr, [r1, #2]! ; CHECK-BE-NEXT: subs r0, #1 -; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! +; CHECK-BE-NEXT: ldrsh r4, [r3, #2]! ; CHECK-BE-NEXT: smlabb r12, r4, lr, r12 -; CHECK-BE-NEXT: ldrsh.w r4, [r2, #2] -; CHECK-BE-NEXT: ldrsh.w lr, [r3, #2] -; CHECK-BE-NEXT: mul r1, r4, r1 +; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] +; CHECK-BE-NEXT: ldrsh.w lr, [r1, #2] +; CHECK-BE-NEXT: mul r2, r4, r2 ; CHECK-BE-NEXT: smlabb r12, r4, lr, r12 ; CHECK-BE-NEXT: bne .LBB2_2 ; CHECK-BE-NEXT: @ %bb.3: ; CHECK-BE-NEXT: pop.w {r4, lr} -; CHECK-BE-NEXT: add.w r0, r12, r1 +; CHECK-BE-NEXT: add.w r0, r12, r2 ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB2_4: ; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: add.w r0, r12, r1 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: add.w r0, r12, r2 ; CHECK-BE-NEXT: bx lr entry: %cmp24 = icmp sgt i32 %arg, 0 @@ -318,27 +318,27 @@ define i32 @and_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture reado ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-LE-NEXT: .save {r4, lr} ; CHECK-LE-NEXT: push {r4, lr} -; CHECK-LE-NEXT: sub.w lr, r3, #2 -; CHECK-LE-NEXT: subs r2, #2 +; CHECK-LE-NEXT: subs r1, r2, #2 +; CHECK-LE-NEXT: subs r3, #2 ; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: .LBB3_2: @ %for.body ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-LE-NEXT: ldr r3, [lr, #2]! +; CHECK-LE-NEXT: ldr lr, [r3, #2]! ; CHECK-LE-NEXT: subs r0, #1 -; CHECK-LE-NEXT: ldr r4, [r2, #2]! -; CHECK-LE-NEXT: smlad r12, r4, r3, r12 -; CHECK-LE-NEXT: uxth r3, r3 -; CHECK-LE-NEXT: mul r1, r3, r1 +; CHECK-LE-NEXT: ldr r4, [r1, #2]! 
+; CHECK-LE-NEXT: smlad r12, r4, lr, r12 +; CHECK-LE-NEXT: uxth.w r4, lr +; CHECK-LE-NEXT: mul r2, r4, r2 ; CHECK-LE-NEXT: bne .LBB3_2 ; CHECK-LE-NEXT: @ %bb.3: ; CHECK-LE-NEXT: pop.w {r4, lr} -; CHECK-LE-NEXT: add.w r0, r12, r1 +; CHECK-LE-NEXT: add.w r0, r12, r2 ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB3_4: ; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: add.w r0, r12, r1 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: add.w r0, r12, r2 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: and_user: @@ -348,29 +348,29 @@ define i32 @and_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture reado ; CHECK-BE-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-BE-NEXT: .save {r4, r5, r7, lr} ; CHECK-BE-NEXT: push {r4, r5, r7, lr} -; CHECK-BE-NEXT: subs r3, #2 -; CHECK-BE-NEXT: subs r2, #2 +; CHECK-BE-NEXT: subs r1, r3, #2 +; CHECK-BE-NEXT: subs r3, r2, #2 ; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: .LBB3_2: @ %for.body ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ldrh lr, [r3, #2]! +; CHECK-BE-NEXT: ldrh lr, [r1, #2]! ; CHECK-BE-NEXT: subs r0, #1 -; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! -; CHECK-BE-NEXT: ldrsh.w r5, [r2, #2] -; CHECK-BE-NEXT: mul r1, lr, r1 +; CHECK-BE-NEXT: ldrsh r4, [r3, #2]! +; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2] +; CHECK-BE-NEXT: mul r2, lr, r2 ; CHECK-BE-NEXT: smlabb r12, r4, lr, r12 -; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] +; CHECK-BE-NEXT: ldrsh.w r4, [r1, #2] ; CHECK-BE-NEXT: smlabb r12, r5, r4, r12 ; CHECK-BE-NEXT: bne .LBB3_2 ; CHECK-BE-NEXT: @ %bb.3: ; CHECK-BE-NEXT: pop.w {r4, r5, r7, lr} -; CHECK-BE-NEXT: add.w r0, r12, r1 +; CHECK-BE-NEXT: add.w r0, r12, r2 ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB3_4: ; CHECK-BE-NEXT: mov.w r12, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: add.w r0, r12, r1 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: add.w r0, r12, r2 ; CHECK-BE-NEXT: bx lr entry: %cmp24 = icmp sgt i32 %arg, 0 @@ -417,33 +417,34 @@ for.body: define i32 @multi_uses(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture readonly %arg2, ptr nocapture readonly %arg3) { ; CHECK-LE-LABEL: multi_uses: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, lr} -; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: cmp r0, #1 ; CHECK-LE-NEXT: blt .LBB4_4 ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: subs r3, #2 ; CHECK-LE-NEXT: subs r2, #2 -; CHECK-LE-NEXT: mov.w lr, #0 +; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: mov.w r12, #0 ; CHECK-LE-NEXT: .LBB4_2: @ %for.body ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-LE-NEXT: ldr r1, [r3, #2]! +; CHECK-LE-NEXT: ldr lr, [r3, #2]! ; CHECK-LE-NEXT: subs r0, #1 ; CHECK-LE-NEXT: ldr r4, [r2, #2]! 
-; CHECK-LE-NEXT: smlad lr, r4, r1, lr -; CHECK-LE-NEXT: eor.w r4, r1, r12 -; CHECK-LE-NEXT: mul r1, r4, r1 -; CHECK-LE-NEXT: lsl.w r12, r1, #16 +; CHECK-LE-NEXT: smlad r1, r4, lr, r1 +; CHECK-LE-NEXT: eor.w r4, lr, r12 +; CHECK-LE-NEXT: mul r4, lr, r4 +; CHECK-LE-NEXT: lsl.w r12, r4, #16 ; CHECK-LE-NEXT: bne .LBB4_2 -; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-LE-NEXT: add.w r0, lr, r12 -; CHECK-LE-NEXT: pop {r4, pc} +; CHECK-LE-NEXT: @ %bb.3: +; CHECK-LE-NEXT: pop.w {r4, lr} +; CHECK-LE-NEXT: add.w r0, r1, r12 +; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB4_4: -; CHECK-LE-NEXT: mov.w lr, #0 +; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: mov.w r12, #0 -; CHECK-LE-NEXT: add.w r0, lr, r12 -; CHECK-LE-NEXT: pop {r4, pc} +; CHECK-LE-NEXT: add.w r0, r1, r12 +; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: multi_uses: ; CHECK-BE: @ %bb.0: @ %entry @@ -452,7 +453,7 @@ define i32 @multi_uses(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture rea ; CHECK-BE-NEXT: cmp r0, #1 ; CHECK-BE-NEXT: blt .LBB4_4 ; CHECK-BE-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-BE-NEXT: subs r3, #2 +; CHECK-BE-NEXT: subs r1, r3, #2 ; CHECK-BE-NEXT: subs r2, #2 ; CHECK-BE-NEXT: mov.w r12, #0 ; CHECK-BE-NEXT: mov.w lr, #0 @@ -460,14 +461,14 @@ define i32 @multi_uses(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture rea ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! ; CHECK-BE-NEXT: subs r0, #1 -; CHECK-BE-NEXT: ldrsh r1, [r3, #2]! +; CHECK-BE-NEXT: ldrsh r3, [r1, #2]! ; CHECK-BE-NEXT: ldrsh.w r5, [r2, #2] -; CHECK-BE-NEXT: smlabb r12, r4, r1, r12 -; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] +; CHECK-BE-NEXT: smlabb r12, r4, r3, r12 +; CHECK-BE-NEXT: ldrsh.w r4, [r1, #2] ; CHECK-BE-NEXT: smlabb r12, r5, r4, r12 -; CHECK-BE-NEXT: eor.w r5, r1, lr -; CHECK-BE-NEXT: mul r1, r5, r1 -; CHECK-BE-NEXT: lsl.w lr, r1, #16 +; CHECK-BE-NEXT: eor.w r5, r3, lr +; CHECK-BE-NEXT: mul r3, r5, r3 +; CHECK-BE-NEXT: lsl.w lr, r3, #16 ; CHECK-BE-NEXT: bne .LBB4_2 ; CHECK-BE-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-BE-NEXT: add.w r0, r12, lr diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll index 3890edeaa353df..427f7de33b63ca 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll @@ -44,6 +44,9 @@ entry: ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp +; CHECK-REG-PRESSURE: ldr{{.*}}, [sp +; CHECK-REG-PRESSURE: ldr{{.*}}, [sp +; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE-NOT: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: bne .LBB0_1 diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll index 5ebb115791c663..94210a892525df 100644 --- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll +++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll @@ -12,14 +12,14 @@ define <2 x i1> @uaddo(ptr %ptr, ptr %ptr2) { ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vld1.64 {d18, d19}, [r0] ; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vmov r3, r2, d18 +; CHECK-NEXT: vmov r3, lr, d18 ; CHECK-NEXT: vadd.i64 q8, q9, q8 ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmov r6, r7, d19 -; CHECK-NEXT: vmov lr, r12, d16 +; CHECK-NEXT: vmov r2, r12, d16 ; CHECK-NEXT: vmov r4, r5, d17 -; CHECK-NEXT: subs.w r3, lr, r3 -; CHECK-NEXT: sbcs.w r2, r12, r2 +; CHECK-NEXT: subs r2, r2, r3 +; CHECK-NEXT: sbcs.w r2, r12, lr ; CHECK-NEXT: mov.w r2, #0 ; 
CHECK-NEXT: it lo ; CHECK-NEXT: movlo r2, #1 @@ -53,12 +53,12 @@ define <2 x i1> @usubo(ptr %ptr, ptr %ptr2) { ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0] ; CHECK-NEXT: vsub.i64 q8, q9, q8 -; CHECK-NEXT: vmov lr, r12, d18 +; CHECK-NEXT: vmov r2, r12, d18 ; CHECK-NEXT: vmov r4, r5, d19 -; CHECK-NEXT: vmov r3, r2, d16 +; CHECK-NEXT: vmov r3, lr, d16 ; CHECK-NEXT: vmov r6, r7, d17 -; CHECK-NEXT: subs.w r3, lr, r3 -; CHECK-NEXT: sbcs.w r2, r12, r2 +; CHECK-NEXT: subs r2, r2, r3 +; CHECK-NEXT: sbcs.w r2, r12, lr ; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r2, #1 diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll index f9b62df37ff329..325078d9496b83 100644 --- a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll +++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll @@ -1355,8 +1355,8 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, < ; CHECK-FIX-NOSCHED: @ %bb.0: ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-FIX-NOSCHED-NEXT: .pad #24 -; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: .pad #20 +; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #20 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: @@ -1367,17 +1367,16 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, < ; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] ; CHECK-FIX-NOSCHED-NEXT: uxth r4, r3 ; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #4] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: uxth r4, r7 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #12] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #8] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: uxth r4, r6 ; CHECK-FIX-NOSCHED-NEXT: lsr r6, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #20] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #16] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #12] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #16] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: lsr r9, r5, #16 ; CHECK-FIX-NOSCHED-NEXT: uxth r10, r5 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: bne .LBB36_4 @@ -1389,78 +1388,76 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, < ; CHECK-FIX-NOSCHED-NEXT: uxth r11, r6 ; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 ; CHECK-FIX-NOSCHED-NEXT: lsr r12, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r9, r4 +; CHECK-FIX-NOSCHED-NEXT: uxth r4, r4 ; CHECK-FIX-NOSCHED-NEXT: uxth r6, r3 ; CHECK-FIX-NOSCHED-NEXT: b .LBB36_5 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_3: ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #14] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #8] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #10] +; CHECK-FIX-NOSCHED-NEXT: ldrh r9, [r2, #2] 
; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r2] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #4] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_4: ; CHECK-FIX-NOSCHED-NEXT: vmov r5, r3, d1 -; CHECK-FIX-NOSCHED-NEXT: mov r4, r7 +; CHECK-FIX-NOSCHED-NEXT: mov lr, r7 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d0[1] ; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16] ; CHECK-FIX-NOSCHED-NEXT: vmov.32 r0, d0[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r9, r5 +; CHECK-FIX-NOSCHED-NEXT: uxth r4, r5 ; CHECK-FIX-NOSCHED-NEXT: uxth r11, r3 ; CHECK-FIX-NOSCHED-NEXT: uxth r6, r7 ; CHECK-FIX-NOSCHED-NEXT: lsr r12, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: mov r7, lr ; CHECK-FIX-NOSCHED-NEXT: lsr r1, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: lsr r5, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: mov r7, r4 ; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_5: ; CHECK-FIX-NOSCHED-NEXT: uxth r8, r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r0, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r9, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp] @ 4-byte Reload ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r8, lr, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r0 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r6, r12, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r3, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r9, r5, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r4, r5, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r7, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r0 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r11, r1, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[1], r0 ; CHECK-FIX-NOSCHED-NEXT: aese.8 q8, q9 ; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q8 ; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #20 ; CHECK-FIX-NOSCHED-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-CORTEX-FIX-LABEL: aese_setf16_cond_via_ptr: ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-CORTEX-FIX-NEXT: .pad #24 -; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #24 +; CHECK-CORTEX-FIX-NEXT: .pad #28 +; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #28 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; 
CHECK-CORTEX-FIX-NEXT: beq .LBB36_3 +; CHECK-CORTEX-FIX-NEXT: beq .LBB36_2 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8 @@ -1468,94 +1465,94 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, half* %1, < ; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d18[0] ; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 ; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #20] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #24] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 ; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov r3, r6, d17 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #16] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 ; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r11, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB36_4 +; CHECK-CORTEX-FIX-NEXT: lsr r10, r6, #16 +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: uxth r3, r6 +; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: b .LBB36_3 ; CHECK-CORTEX-FIX-NEXT: .LBB36_2: -; CHECK-CORTEX-FIX-NEXT: vmov r1, r7, d0 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r1 -; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r7, #16 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r0, r3 -; CHECK-CORTEX-FIX-NEXT: vmov r7, r3, d1 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r5, r7, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 -; CHECK-CORTEX-FIX-NEXT: mov r3, r0 -; CHECK-CORTEX-FIX-NEXT: b .LBB36_5 -; CHECK-CORTEX-FIX-NEXT: .LBB36_3: ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2] -; CHECK-CORTEX-FIX-NEXT: ldrh r11, [r2, #12] -; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r2, #14] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r10, [r2, #14] +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #24] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #2] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #4] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #6] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #8] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #8] +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10] +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #12] +; CHECK-CORTEX-FIX-NEXT: .LBB36_3: +; CHECK-CORTEX-FIX-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB36_2 -; CHECK-CORTEX-FIX-NEXT: .LBB36_4: +; CHECK-CORTEX-FIX-NEXT: beq .LBB36_5 +; 
CHECK-CORTEX-FIX-NEXT: @ %bb.4: ; CHECK-CORTEX-FIX-NEXT: vorr q8, q0, q0 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d0[1] +; CHECK-CORTEX-FIX-NEXT: vmov.32 r6, d0[1] +; CHECK-CORTEX-FIX-NEXT: vmov r7, r3, d1 ; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] -; CHECK-CORTEX-FIX-NEXT: uxth r6, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r5, #16 -; CHECK-CORTEX-FIX-NEXT: vmov r5, r7, d1 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r7, #16 +; CHECK-CORTEX-FIX-NEXT: uxth lr, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 +; CHECK-CORTEX-FIX-NEXT: uxth r4, r6 +; CHECK-CORTEX-FIX-NEXT: lsr r5, r6, #16 +; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 ; CHECK-CORTEX-FIX-NEXT: vmov.32 r1, d16[0] -; CHECK-CORTEX-FIX-NEXT: uxth r10, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r5, r5, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r1 +; CHECK-CORTEX-FIX-NEXT: uxth r11, r1 ; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: b .LBB36_6 ; CHECK-CORTEX-FIX-NEXT: .LBB36_5: -; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r11, r11, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r4, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: vmov r1, r3, d0 +; CHECK-CORTEX-FIX-NEXT: uxth r4, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r5, r3, #16 +; CHECK-CORTEX-FIX-NEXT: vmov r3, r7, d1 +; CHECK-CORTEX-FIX-NEXT: uxth r11, r1 +; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 +; CHECK-CORTEX-FIX-NEXT: uxth r6, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 +; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 +; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 +; CHECK-CORTEX-FIX-NEXT: .LBB36_6: +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r4, r5, lsl #16 ; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r12, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r10, r5, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r10, r0, r10, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r0, r1, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, lr, r8, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r3, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r4 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r1 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r11 -; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r11, r9, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r3 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r5 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r6 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r6 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r1 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r5 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r10 ; CHECK-CORTEX-FIX-NEXT: 
aese.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: add sp, sp, #24 +; CHECK-CORTEX-FIX-NEXT: add sp, sp, #28 ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} br i1 %0, label %5, label %12 @@ -1604,104 +1601,97 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1 ; CHECK-FIX-NOSCHED: @ %bb.0: ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-FIX-NOSCHED-NEXT: .pad #24 -; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: .pad #16 +; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r12, s0 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] -; CHECK-FIX-NOSCHED-NEXT: vmov r7, r6, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r2, d16[1] +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r6, d17 ; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r12 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r7 -; CHECK-FIX-NOSCHED-NEXT: uxth r2, r3 +; CHECK-FIX-NOSCHED-NEXT: uxth r7, r2 +; CHECK-FIX-NOSCHED-NEXT: lsr r11, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: uxth r7, r3 +; CHECK-FIX-NOSCHED-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: uxth r7, r6 ; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r6 +; CHECK-FIX-NOSCHED-NEXT: str r7, [sp, #12] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: lsr r6, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r3, r5 -; CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: b .LBB37_3 +; CHECK-FIX-NOSCHED-NEXT: lsr r9, r5, #16 +; CHECK-FIX-NOSCHED-NEXT: uxth r10, r5 +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB37_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB37_2: -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #14] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #12] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #8] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #6] -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #2] +; CHECK-FIX-NOSCHED-NEXT: vmov r2, r5, d2 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r0, d3 +; CHECK-FIX-NOSCHED-NEXT: lsr r12, r5, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr lr, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: uxth r0, r0 +; CHECK-FIX-NOSCHED-NEXT: uxth r3, r3 +; CHECK-FIX-NOSCHED-NEXT: uxth r5, r5 +; CHECK-FIX-NOSCHED-NEXT: b .LBB37_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB37_3: +; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #12] +; CHECK-FIX-NOSCHED-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #10] +; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1, #14] +; CHECK-FIX-NOSCHED-NEXT: ldrh r11, [r1, #6] +; 
CHECK-FIX-NOSCHED-NEXT: ldrh r9, [r1, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r1] +; CHECK-FIX-NOSCHED-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #8] ; CHECK-FIX-NOSCHED-NEXT: str r2, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #10] ; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #4] -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1] -; CHECK-FIX-NOSCHED-NEXT: .LBB37_3: +; CHECK-FIX-NOSCHED-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_5 -; CHECK-FIX-NOSCHED-NEXT: @ %bb.4: -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d2[1] -; CHECK-FIX-NOSCHED-NEXT: mov r3, r2 -; CHECK-FIX-NOSCHED-NEXT: mov r2, r7 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2 +; CHECK-FIX-NOSCHED-NEXT: .LBB37_4: ; CHECK-FIX-NOSCHED-NEXT: vmov r4, r7, d3 +; CHECK-FIX-NOSCHED-NEXT: mov lr, r6 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d2[1] ; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r12 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r0, d2[0] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r2, d2[0] +; CHECK-FIX-NOSCHED-NEXT: uxth r3, r4 +; CHECK-FIX-NOSCHED-NEXT: uxth r0, r7 ; CHECK-FIX-NOSCHED-NEXT: uxth r5, r6 ; CHECK-FIX-NOSCHED-NEXT: lsr r12, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r4 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r9, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: mov r7, r2 -; CHECK-FIX-NOSCHED-NEXT: mov r2, r3 +; CHECK-FIX-NOSCHED-NEXT: mov r6, lr +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 ; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: b .LBB37_6 +; CHECK-FIX-NOSCHED-NEXT: lsr lr, r2, #16 ; CHECK-FIX-NOSCHED-NEXT: .LBB37_5: -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r6, d3 -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r5, d2 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r9, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r6 -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r3 -; CHECK-FIX-NOSCHED-NEXT: uxth r5, r5 -; CHECK-FIX-NOSCHED-NEXT: .LBB37_6: -; CHECK-FIX-NOSCHED-NEXT: uxth r8, r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r3, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r8, lr, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r5, r12, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r2, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r4, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: uxth r8, r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r10, r9, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r7, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r11, r9, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r8, lr, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r5, r12, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r2 +; 
CHECK-FIX-NOSCHED-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r2, r11, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r3, r4, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r2 +; CHECK-FIX-NOSCHED-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r2, r3, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r6, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[1], r0 ; CHECK-FIX-NOSCHED-NEXT: aese.8 q8, q9 ; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q8 ; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1] -; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #16 ; CHECK-FIX-NOSCHED-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-CORTEX-FIX-LABEL: aese_setf16_cond_via_val: @@ -1710,100 +1700,98 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_val(i1 zeroext %0, half %1, <1 ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-CORTEX-FIX-NEXT: .pad #28 ; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #28 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s0 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 ; CHECK-CORTEX-FIX-NEXT: beq .LBB37_2 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r7, d16[0] -; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r7, #16 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #24] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov r3, r7, d17 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d16[1] +; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r7 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[0] ; CHECK-CORTEX-FIX-NEXT: uxth r6, r3 ; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r11, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r7, #16 -; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: b .LBB37_3 -; CHECK-CORTEX-FIX-NEXT: .LBB37_2: -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1] -; CHECK-CORTEX-FIX-NEXT: ldrh r11, [r1, #12] -; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #14] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #2] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #4] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #6] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #8] +; CHECK-CORTEX-FIX-NEXT: uxth r3, r2 +; CHECK-CORTEX-FIX-NEXT: lsr r2, r2, #16 +; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #24] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #10] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d17 +; CHECK-CORTEX-FIX-NEXT: uxth r6, r2 +; 
CHECK-CORTEX-FIX-NEXT: lsr r2, r2, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r10, r3, #16 +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: uxth r2, r3 +; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: b .LBB37_3 +; CHECK-CORTEX-FIX-NEXT: .LBB37_2: +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1] +; CHECK-CORTEX-FIX-NEXT: ldrh r10, [r1, #14] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #24] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #2] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #20] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #4] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #6] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #8] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #10] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12] ; CHECK-CORTEX-FIX-NEXT: .LBB37_3: -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 ; CHECK-CORTEX-FIX-NEXT: beq .LBB37_5 ; CHECK-CORTEX-FIX-NEXT: @ %bb.4: -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d2[1] -; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov r4, r6, d3 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r4 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r4, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r6, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r5, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d2[0] -; CHECK-CORTEX-FIX-NEXT: uxth r0, r2 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r2, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d2[1] +; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r7 +; CHECK-CORTEX-FIX-NEXT: vmov r6, r2, d3 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r6, #16 +; CHECK-CORTEX-FIX-NEXT: uxth lr, r2 +; CHECK-CORTEX-FIX-NEXT: lsr r8, r2, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, r5, #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d2[0] +; CHECK-CORTEX-FIX-NEXT: uxth r11, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r9, r3, #16 +; CHECK-CORTEX-FIX-NEXT: uxth r3, r5 +; CHECK-CORTEX-FIX-NEXT: uxth r5, r6 ; CHECK-CORTEX-FIX-NEXT: b .LBB37_6 ; CHECK-CORTEX-FIX-NEXT: .LBB37_5: -; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d2 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r2 +; CHECK-CORTEX-FIX-NEXT: vmov r2, r6, d2 +; CHECK-CORTEX-FIX-NEXT: uxth r11, r2 ; CHECK-CORTEX-FIX-NEXT: lsr r9, r2, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r5, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r0, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r6, r7, d3 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r6 +; CHECK-CORTEX-FIX-NEXT: uxth r3, r6 ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 -; CHECK-CORTEX-FIX-NEXT: mov r7, r0 +; CHECK-CORTEX-FIX-NEXT: vmov r2, r6, d3 +; CHECK-CORTEX-FIX-NEXT: uxth r5, r2 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r2, #16 +; CHECK-CORTEX-FIX-NEXT: uxth lr, r6 +; CHECK-CORTEX-FIX-NEXT: lsr r8, r6, #16 ; CHECK-CORTEX-FIX-NEXT: .LBB37_6: -; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r11, r11, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp, #20] @ 
4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r4, lsl #16 ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r12, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r10, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r0, r2, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r10, r0, r10, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r0, r2, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, lr, r8, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r2, r3, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #24] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r6, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r3 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r11 -; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r9, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r6 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r4 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r5 +; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r2, r7, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r7, [sp, #20] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r2, r7, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r11, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r5 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r7 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r3 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r6 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r10 ; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] @@ -3567,8 +3555,8 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, < ; CHECK-FIX-NOSCHED: @ %bb.0: ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-FIX-NOSCHED-NEXT: .pad #24 -; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: .pad #20 +; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #20 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: @@ -3579,17 +3567,16 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, < ; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] ; CHECK-FIX-NOSCHED-NEXT: uxth r4, r3 ; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #4] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: uxth r4, r7 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #12] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #8] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: uxth r4, r6 ; CHECK-FIX-NOSCHED-NEXT: lsr r6, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #20] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #16] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #12] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r5, #16 -; 
CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #16] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: lsr r9, r5, #16 ; CHECK-FIX-NOSCHED-NEXT: uxth r10, r5 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: bne .LBB82_4 @@ -3601,78 +3588,76 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, < ; CHECK-FIX-NOSCHED-NEXT: uxth r11, r6 ; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 ; CHECK-FIX-NOSCHED-NEXT: lsr r12, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r9, r4 +; CHECK-FIX-NOSCHED-NEXT: uxth r4, r4 ; CHECK-FIX-NOSCHED-NEXT: uxth r6, r3 ; CHECK-FIX-NOSCHED-NEXT: b .LBB82_5 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_3: ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #14] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #8] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #10] +; CHECK-FIX-NOSCHED-NEXT: ldrh r9, [r2, #2] ; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r2] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #4] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_4: ; CHECK-FIX-NOSCHED-NEXT: vmov r5, r3, d1 -; CHECK-FIX-NOSCHED-NEXT: mov r4, r7 +; CHECK-FIX-NOSCHED-NEXT: mov lr, r7 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d0[1] ; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16] ; CHECK-FIX-NOSCHED-NEXT: vmov.32 r0, d0[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r9, r5 +; CHECK-FIX-NOSCHED-NEXT: uxth r4, r5 ; CHECK-FIX-NOSCHED-NEXT: uxth r11, r3 ; CHECK-FIX-NOSCHED-NEXT: uxth r6, r7 ; CHECK-FIX-NOSCHED-NEXT: lsr r12, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: mov r7, lr ; CHECK-FIX-NOSCHED-NEXT: lsr r1, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: lsr r5, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: mov r7, r4 ; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_5: ; CHECK-FIX-NOSCHED-NEXT: uxth r8, r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r0, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r9, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp] @ 4-byte Reload ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r8, lr, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r0 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r6, r12, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r3, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r9, r5, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r4, r5, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte 
Reload ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r7, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r0 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r11, r1, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[1], r0 ; CHECK-FIX-NOSCHED-NEXT: aesd.8 q8, q9 ; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q8 ; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #20 ; CHECK-FIX-NOSCHED-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-CORTEX-FIX-LABEL: aesd_setf16_cond_via_ptr: ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-CORTEX-FIX-NEXT: .pad #24 -; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #24 +; CHECK-CORTEX-FIX-NEXT: .pad #28 +; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #28 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB82_3 +; CHECK-CORTEX-FIX-NEXT: beq .LBB82_2 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8 @@ -3680,94 +3665,94 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, half* %1, < ; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d18[0] ; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 ; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #20] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #24] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 ; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov r3, r6, d17 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #16] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 ; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r11, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB82_4 +; CHECK-CORTEX-FIX-NEXT: lsr r10, r6, #16 +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: uxth r3, r6 +; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: b .LBB82_3 ; CHECK-CORTEX-FIX-NEXT: .LBB82_2: -; CHECK-CORTEX-FIX-NEXT: vmov r1, r7, d0 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r1 -; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r7, #16 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r0, r3 -; CHECK-CORTEX-FIX-NEXT: vmov r7, r3, d1 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r5, r7, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 -; CHECK-CORTEX-FIX-NEXT: mov r3, r0 -; CHECK-CORTEX-FIX-NEXT: b .LBB82_5 -; CHECK-CORTEX-FIX-NEXT: .LBB82_3: ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2] -; CHECK-CORTEX-FIX-NEXT: ldrh r11, 
[r2, #12] -; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r2, #14] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r10, [r2, #14] +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #24] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #2] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #4] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #6] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #8] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #8] +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10] +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #12] +; CHECK-CORTEX-FIX-NEXT: .LBB82_3: +; CHECK-CORTEX-FIX-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB82_2 -; CHECK-CORTEX-FIX-NEXT: .LBB82_4: +; CHECK-CORTEX-FIX-NEXT: beq .LBB82_5 +; CHECK-CORTEX-FIX-NEXT: @ %bb.4: ; CHECK-CORTEX-FIX-NEXT: vorr q8, q0, q0 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d0[1] +; CHECK-CORTEX-FIX-NEXT: vmov.32 r6, d0[1] +; CHECK-CORTEX-FIX-NEXT: vmov r7, r3, d1 ; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] -; CHECK-CORTEX-FIX-NEXT: uxth r6, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r5, #16 -; CHECK-CORTEX-FIX-NEXT: vmov r5, r7, d1 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r7, #16 +; CHECK-CORTEX-FIX-NEXT: uxth lr, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 +; CHECK-CORTEX-FIX-NEXT: uxth r4, r6 +; CHECK-CORTEX-FIX-NEXT: lsr r5, r6, #16 +; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 ; CHECK-CORTEX-FIX-NEXT: vmov.32 r1, d16[0] -; CHECK-CORTEX-FIX-NEXT: uxth r10, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r5, r5, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r1 +; CHECK-CORTEX-FIX-NEXT: uxth r11, r1 ; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: b .LBB82_6 ; CHECK-CORTEX-FIX-NEXT: .LBB82_5: -; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r11, r11, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r4, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: vmov r1, r3, d0 +; CHECK-CORTEX-FIX-NEXT: uxth r4, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r5, r3, #16 +; CHECK-CORTEX-FIX-NEXT: vmov r3, r7, d1 +; CHECK-CORTEX-FIX-NEXT: uxth r11, r1 +; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 +; CHECK-CORTEX-FIX-NEXT: uxth r6, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 +; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 +; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 +; CHECK-CORTEX-FIX-NEXT: .LBB82_6: +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r4, r5, lsl #16 ; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r12, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r10, r5, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r10, r0, r10, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r0, r1, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; 
CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, lr, r8, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r3, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r4 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r1 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r11 -; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r11, r9, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r3 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r5 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r6 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r6 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r1 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r5 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r10 ; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: add sp, sp, #24 +; CHECK-CORTEX-FIX-NEXT: add sp, sp, #28 ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} br i1 %0, label %5, label %12 @@ -3816,104 +3801,97 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1 ; CHECK-FIX-NOSCHED: @ %bb.0: ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-FIX-NOSCHED-NEXT: .pad #24 -; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: .pad #16 +; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r12, s0 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] -; CHECK-FIX-NOSCHED-NEXT: vmov r7, r6, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r2, d16[1] +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r6, d17 ; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r12 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r7 -; CHECK-FIX-NOSCHED-NEXT: uxth r2, r3 +; CHECK-FIX-NOSCHED-NEXT: uxth r7, r2 +; CHECK-FIX-NOSCHED-NEXT: lsr r11, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: uxth r7, r3 +; CHECK-FIX-NOSCHED-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: uxth r7, r6 ; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r6 +; CHECK-FIX-NOSCHED-NEXT: str r7, [sp, #12] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: lsr r6, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r3, r5 -; CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: b .LBB83_3 +; CHECK-FIX-NOSCHED-NEXT: lsr r9, r5, #16 +; CHECK-FIX-NOSCHED-NEXT: uxth 
r10, r5 +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB83_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB83_2: -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #14] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #12] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #8] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #6] -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #2] +; CHECK-FIX-NOSCHED-NEXT: vmov r2, r5, d2 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r0, d3 +; CHECK-FIX-NOSCHED-NEXT: lsr r12, r5, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr lr, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: uxth r0, r0 +; CHECK-FIX-NOSCHED-NEXT: uxth r3, r3 +; CHECK-FIX-NOSCHED-NEXT: uxth r5, r5 +; CHECK-FIX-NOSCHED-NEXT: b .LBB83_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB83_3: +; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #12] +; CHECK-FIX-NOSCHED-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #10] +; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1, #14] +; CHECK-FIX-NOSCHED-NEXT: ldrh r11, [r1, #6] +; CHECK-FIX-NOSCHED-NEXT: ldrh r9, [r1, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r1] +; CHECK-FIX-NOSCHED-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #8] ; CHECK-FIX-NOSCHED-NEXT: str r2, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #10] ; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #4] -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1] -; CHECK-FIX-NOSCHED-NEXT: .LBB83_3: +; CHECK-FIX-NOSCHED-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_5 -; CHECK-FIX-NOSCHED-NEXT: @ %bb.4: -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d2[1] -; CHECK-FIX-NOSCHED-NEXT: mov r3, r2 -; CHECK-FIX-NOSCHED-NEXT: mov r2, r7 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2 +; CHECK-FIX-NOSCHED-NEXT: .LBB83_4: ; CHECK-FIX-NOSCHED-NEXT: vmov r4, r7, d3 +; CHECK-FIX-NOSCHED-NEXT: mov lr, r6 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d2[1] ; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r12 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r0, d2[0] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r2, d2[0] +; CHECK-FIX-NOSCHED-NEXT: uxth r3, r4 +; CHECK-FIX-NOSCHED-NEXT: uxth r0, r7 ; CHECK-FIX-NOSCHED-NEXT: uxth r5, r6 ; CHECK-FIX-NOSCHED-NEXT: lsr r12, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r4 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r9, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: mov r7, r2 -; CHECK-FIX-NOSCHED-NEXT: mov r2, r3 +; CHECK-FIX-NOSCHED-NEXT: mov r6, lr +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 ; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: b .LBB83_6 +; CHECK-FIX-NOSCHED-NEXT: lsr lr, r2, #16 ; CHECK-FIX-NOSCHED-NEXT: .LBB83_5: -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r6, d3 -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r5, d2 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r9, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r6 -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r3 -; CHECK-FIX-NOSCHED-NEXT: uxth r5, r5 -; CHECK-FIX-NOSCHED-NEXT: .LBB83_6: -; CHECK-FIX-NOSCHED-NEXT: uxth r8, r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; 
CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r3, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r8, lr, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r5, r12, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r2, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r4, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: uxth r8, r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r10, r9, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r7, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r11, r9, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r8, lr, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r5, r12, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r2 +; CHECK-FIX-NOSCHED-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r2, r11, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r3, r4, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r2 +; CHECK-FIX-NOSCHED-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: pkhbt r2, r2, r3, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r2 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r6, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[1], r0 ; CHECK-FIX-NOSCHED-NEXT: aesd.8 q8, q9 ; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q8 ; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1] -; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #16 ; CHECK-FIX-NOSCHED-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-CORTEX-FIX-LABEL: aesd_setf16_cond_via_val: @@ -3922,100 +3900,98 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_val(i1 zeroext %0, half %1, <1 ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-CORTEX-FIX-NEXT: .pad #28 ; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #28 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s0 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 ; CHECK-CORTEX-FIX-NEXT: beq .LBB83_2 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r7, d16[0] -; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r7, #16 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #24] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov r3, r7, d17 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d16[1] +; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r7 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[0] ; 
CHECK-CORTEX-FIX-NEXT: uxth r6, r3 ; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r11, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r7, #16 -; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: b .LBB83_3 -; CHECK-CORTEX-FIX-NEXT: .LBB83_2: -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1] -; CHECK-CORTEX-FIX-NEXT: ldrh r11, [r1, #12] -; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #14] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #2] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #4] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #6] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #8] +; CHECK-CORTEX-FIX-NEXT: uxth r3, r2 +; CHECK-CORTEX-FIX-NEXT: lsr r2, r2, #16 +; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #24] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #10] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d17 +; CHECK-CORTEX-FIX-NEXT: uxth r6, r2 +; CHECK-CORTEX-FIX-NEXT: lsr r2, r2, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r10, r3, #16 +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: uxth r2, r3 +; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: b .LBB83_3 +; CHECK-CORTEX-FIX-NEXT: .LBB83_2: +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1] +; CHECK-CORTEX-FIX-NEXT: ldrh r10, [r1, #14] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #24] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #2] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #20] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #4] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #6] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #8] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #10] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12] ; CHECK-CORTEX-FIX-NEXT: .LBB83_3: -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 ; CHECK-CORTEX-FIX-NEXT: beq .LBB83_5 ; CHECK-CORTEX-FIX-NEXT: @ %bb.4: -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d2[1] -; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov r4, r6, d3 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r4 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r4, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r6, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r5, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d2[0] -; CHECK-CORTEX-FIX-NEXT: uxth r0, r2 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r2, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d2[1] +; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r7 +; CHECK-CORTEX-FIX-NEXT: vmov r6, r2, d3 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r6, #16 +; CHECK-CORTEX-FIX-NEXT: uxth lr, r2 +; CHECK-CORTEX-FIX-NEXT: lsr r8, r2, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, r5, #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d2[0] +; CHECK-CORTEX-FIX-NEXT: uxth r11, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r9, r3, #16 +; CHECK-CORTEX-FIX-NEXT: uxth r3, r5 +; CHECK-CORTEX-FIX-NEXT: uxth r5, r6 ; CHECK-CORTEX-FIX-NEXT: 
b .LBB83_6 ; CHECK-CORTEX-FIX-NEXT: .LBB83_5: -; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d2 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r2 +; CHECK-CORTEX-FIX-NEXT: vmov r2, r6, d2 +; CHECK-CORTEX-FIX-NEXT: uxth r11, r2 ; CHECK-CORTEX-FIX-NEXT: lsr r9, r2, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r5, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r0, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r6, r7, d3 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r6 +; CHECK-CORTEX-FIX-NEXT: uxth r3, r6 ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 -; CHECK-CORTEX-FIX-NEXT: mov r7, r0 +; CHECK-CORTEX-FIX-NEXT: vmov r2, r6, d3 +; CHECK-CORTEX-FIX-NEXT: uxth r5, r2 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r2, #16 +; CHECK-CORTEX-FIX-NEXT: uxth lr, r6 +; CHECK-CORTEX-FIX-NEXT: lsr r8, r6, #16 ; CHECK-CORTEX-FIX-NEXT: .LBB83_6: -; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r11, r11, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r4, lsl #16 ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r12, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r10, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r0, r2, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r10, r0, r10, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r0, r2, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, lr, r8, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r2, r3, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #24] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r6, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r3 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r11 -; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r9, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r6 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r4 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r5 +; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r2, r7, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r7, [sp, #20] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r2, r7, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r11, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r5 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r7 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r3 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r6 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r10 ; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] diff --git a/llvm/test/CodeGen/ARM/atomic-ops-v8.ll b/llvm/test/CodeGen/ARM/atomic-ops-v8.ll index 0a467c2b70acf2..bd20210a531632 100644 --- a/llvm/test/CodeGen/ARM/atomic-ops-v8.ll +++ b/llvm/test/CodeGen/ARM/atomic-ops-v8.ll @@ -1037,25 +1037,22 @@ 
define i8 @test_atomic_cmpxchg_i8(i8 zeroext %wanted, i8 zeroext %new) nounwind ; CHECK-NOT: mcr ; CHECK-DAG: movw r[[ADDR:[0-9]+]], :lower16:var8 ; CHECK-DAG: movt r[[ADDR]], :upper16:var8 -; CHECK-THUMB-DAG: mov r[[WANTED:[0-9]+]], r0 +; CHECK-DAG: mov r[[WANTED:[0-9]+]], r0 ; CHECK: .LBB{{[0-9]+}}_1: ; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]] ; r0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-ARM-NEXT: cmp r[[OLD]], r0 -; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] +; CHECK-NEXT: cmp r[[OLD]], r[[WANTED]] ; CHECK-NEXT: bne .LBB{{[0-9]+}}_{{[0-9]}} ; CHECK-NEXT: %bb.2: ; As above, r1 is a reasonable guess. ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 -; CHECK-ARM-NEXT: bne .LBB{{[0-9]+}}_{{[0-9]}} ; CHECK-THUMB-NEXT: it eq -; CHECK-THUMB-NEXT: bxeq lr -; CHECK-ARM: mov r0, r[[OLD]] -; CHECK-ARM: clrex -; CHECK: bx lr +; CHECK-NEXT: bxeq lr +; CHECK: clrex +; CHECK-NEXT: bx lr ; CHECK-NOT: dmb ; CHECK-NOT: mcr ret i8 %old @@ -1069,31 +1066,24 @@ define i16 @test_atomic_cmpxchg_i16(i16 zeroext %wanted, i16 zeroext %new) nounw ; CHECK-NOT: mcr ; CHECK-DAG: movw r[[ADDR:[0-9]+]], :lower16:var16 ; CHECK-DAG: movt r[[ADDR]], :upper16:var16 -; CHECK-THUMB-DAG: mov r[[WANTED:[0-9]+]], r0 +; CHECK-DAG: mov r[[WANTED:[0-9]+]], r0 ; CHECK: .LBB{{[0-9]+}}_1: ; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]] ; r0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-ARM-NEXT: cmp r[[OLD]], r0 -; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] +; CHECK-NEXT: cmp r[[OLD]], r[[WANTED]] ; CHECK-NEXT: bne .LBB{{[0-9]+}}_{{[0-9]}} ; CHECK-NEXT: %bb.2: ; As above, r1 is a reasonable guess. ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 -; CHECK-ARM-NEXT: bne .LBB{{[0-9]+}}_{{[0-9]}} ; CHECK-THUMB-NEXT: it eq -; CHECK-THUMB-NEXT: bxeq lr -; CHECK-ARM: mov r0, r[[OLD]] -; CHECK: bx lr -; CHECK-ARM-NEXT: .LBB{{[0-9]+}}_{{[0-9]}} -; CHECK-ARM-NEXT: clrex +; CHECK-NEXT: bxeq lr +; CHECK: clrex +; CHECK-NEXT: bx lr ; CHECK-NOT: dmb ; CHECK-NOT: mcr - -; CHECK-ARM: mov r0, r[[OLD]] -; CHECK-ARM-NEXT: bx lr ret i16 %old } diff --git a/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll index 243ec4deecdb84..14eb7a6ba27f63 100644 --- a/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll @@ -69,29 +69,28 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-LABEL: atomicrmw_uinc_wrap_i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: .save {r4, r6, r7, lr} +; CHECK-NEXT: push {r4, r6, r7, lr} +; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: dmb ish ; CHECK-NEXT: .LBB3_1: @ %atomicrmw.start ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrexd r4, r5, [r0] -; CHECK-NEXT: adds r6, r4, #1 -; CHECK-NEXT: adc r7, r5, #0 -; CHECK-NEXT: subs r1, r4, r2 -; CHECK-NEXT: sbcs r1, r5, r3 -; CHECK-NEXT: mov r1, #0 -; CHECK-NEXT: movwhs r1, #1 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: ldrexd r0, r1, [r12] +; CHECK-NEXT: adds r6, r0, #1 +; CHECK-NEXT: adc r7, r1, #0 +; CHECK-NEXT: subs r4, r0, r2 +; CHECK-NEXT: sbcs r4, r1, r3 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movwhs r4, #1 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: movwne r7, #0 ; CHECK-NEXT: movwne r6, #0 -; CHECK-NEXT: strexd r1, r6, r7, [r0] 
-; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: strexd r4, r6, r7, [r12] +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: bne .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %atomicrmw.end -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: dmb ish -; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc} +; CHECK-NEXT: pop {r4, r6, r7, pc} %result = atomicrmw uinc_wrap ptr %ptr, i64 %val seq_cst ret i64 %result } @@ -170,33 +169,32 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-LABEL: atomicrmw_udec_wrap_i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: .save {r4, r6, r7, lr} +; CHECK-NEXT: push {r4, r6, r7, lr} +; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: dmb ish ; CHECK-NEXT: .LBB7_1: @ %atomicrmw.start ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrexd r4, r5, [r0] -; CHECK-NEXT: mov r12, #0 -; CHECK-NEXT: subs r1, r2, r4 -; CHECK-NEXT: sbcs r1, r3, r5 -; CHECK-NEXT: orr r1, r4, r5 -; CHECK-NEXT: clz r1, r1 -; CHECK-NEXT: movwlo r12, #1 -; CHECK-NEXT: lsr r1, r1, #5 -; CHECK-NEXT: subs r6, r4, #1 -; CHECK-NEXT: sbc r7, r5, #0 -; CHECK-NEXT: orr r1, r1, r12 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: ldrexd r0, r1, [r12] +; CHECK-NEXT: orr r4, r0, r1 +; CHECK-NEXT: subs lr, r2, r0 +; CHECK-NEXT: clz r4, r4 +; CHECK-NEXT: sbcs lr, r3, r1 +; CHECK-NEXT: lsr r4, r4, #5 +; CHECK-NEXT: mov lr, #0 +; CHECK-NEXT: movwlo lr, #1 +; CHECK-NEXT: subs r6, r0, #1 +; CHECK-NEXT: orr r4, r4, lr +; CHECK-NEXT: sbc r7, r1, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: movne r7, r3 ; CHECK-NEXT: movne r6, r2 -; CHECK-NEXT: strexd r1, r6, r7, [r0] -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: strexd r4, r6, r7, [r12] +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %atomicrmw.end -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: dmb ish -; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc} +; CHECK-NEXT: pop {r4, r6, r7, pc} %result = atomicrmw udec_wrap ptr %ptr, i64 %val seq_cst ret i64 %result } diff --git a/llvm/test/CodeGen/ARM/bf16-shuffle.ll b/llvm/test/CodeGen/ARM/bf16-shuffle.ll index 9968e7887f4b37..656216ffd01186 100644 --- a/llvm/test/CodeGen/ARM/bf16-shuffle.ll +++ b/llvm/test/CodeGen/ARM/bf16-shuffle.ll @@ -325,49 +325,49 @@ entry: define arm_aapcs_vfpcc <8 x bfloat> @shuffle3step1_bf16(<32 x bfloat> %src) { ; CHECK-NOFP16-LABEL: shuffle3step1_bf16: ; CHECK-NOFP16: @ %bb.0: @ %entry -; CHECK-NOFP16-NEXT: vorr q3, q0, q0 -; CHECK-NOFP16-NEXT: vmov.u16 r1, d6[1] -; CHECK-NOFP16-NEXT: vmov r0, s14 -; CHECK-NOFP16-NEXT: vmov.16 d0[0], r1 -; CHECK-NOFP16-NEXT: vmov.16 d0[1], r0 -; CHECK-NOFP16-NEXT: vmov.u16 r0, d7[3] -; CHECK-NOFP16-NEXT: vmov.16 d0[2], r0 +; CHECK-NOFP16-NEXT: vmov.u16 r1, d0[1] +; CHECK-NOFP16-NEXT: vmov r0, s2 +; CHECK-NOFP16-NEXT: vmov.16 d16[0], r1 +; CHECK-NOFP16-NEXT: vmov.16 d16[1], r0 +; CHECK-NOFP16-NEXT: vmov.u16 r0, d1[3] +; CHECK-NOFP16-NEXT: vdup.16 q0, d3[1] +; CHECK-NOFP16-NEXT: vmov r1, s0 +; CHECK-NOFP16-NEXT: vmov.16 d16[2], r0 ; CHECK-NOFP16-NEXT: vmov r0, s5 -; CHECK-NOFP16-NEXT: vdup.16 q1, d3[1] -; CHECK-NOFP16-NEXT: vmov r1, s4 -; CHECK-NOFP16-NEXT: vmov.16 d0[3], r0 +; CHECK-NOFP16-NEXT: vmov.16 d16[3], r0 ; CHECK-NOFP16-NEXT: vmov r0, s8 -; CHECK-NOFP16-NEXT: vmov.16 d1[0], r1 -; CHECK-NOFP16-NEXT: vmov.16 d1[1], r0 +; CHECK-NOFP16-NEXT: vmov.16 d17[0], r1 +; CHECK-NOFP16-NEXT: vmov.16 d17[1], r0 ; CHECK-NOFP16-NEXT: vmov.u16 r0, d4[3] -; CHECK-NOFP16-NEXT: vmov.16 d1[2], r0 +; 
CHECK-NOFP16-NEXT: vmov.16 d17[2], r0 ; CHECK-NOFP16-NEXT: vmov r0, s11 -; CHECK-NOFP16-NEXT: vmov.16 d1[3], r0 +; CHECK-NOFP16-NEXT: vmov.16 d17[3], r0 +; CHECK-NOFP16-NEXT: vorr q0, q8, q8 ; CHECK-NOFP16-NEXT: bx lr ; ; CHECK-FP16-LABEL: shuffle3step1_bf16: ; CHECK-FP16: @ %bb.0: @ %entry -; CHECK-FP16-NEXT: vorr q3, q0, q0 -; CHECK-FP16-NEXT: vmovx.f16 s0, s12 -; CHECK-FP16-NEXT: vmovx.f16 s12, s15 +; CHECK-FP16-NEXT: vmovx.f16 s12, s0 +; CHECK-FP16-NEXT: vmov r0, s2 +; CHECK-FP16-NEXT: vmov r1, s12 +; CHECK-FP16-NEXT: vmovx.f16 s0, s3 +; CHECK-FP16-NEXT: vmov.16 d16[0], r1 +; CHECK-FP16-NEXT: vmov.16 d16[1], r0 +; CHECK-FP16-NEXT: vmov r0, s0 +; CHECK-FP16-NEXT: vdup.16 q0, d3[1] ; CHECK-FP16-NEXT: vmov r1, s0 -; CHECK-FP16-NEXT: vmov r0, s14 -; CHECK-FP16-NEXT: vmov.16 d0[0], r1 -; CHECK-FP16-NEXT: vmov.16 d0[1], r0 -; CHECK-FP16-NEXT: vmov r0, s12 -; CHECK-FP16-NEXT: vmov.16 d0[2], r0 +; CHECK-FP16-NEXT: vmovx.f16 s0, s9 +; CHECK-FP16-NEXT: vmov.16 d16[2], r0 ; CHECK-FP16-NEXT: vmov r0, s5 -; CHECK-FP16-NEXT: vdup.16 q1, d3[1] -; CHECK-FP16-NEXT: vmov r1, s4 -; CHECK-FP16-NEXT: vmovx.f16 s4, s9 -; CHECK-FP16-NEXT: vmov.16 d0[3], r0 +; CHECK-FP16-NEXT: vmov.16 d16[3], r0 ; CHECK-FP16-NEXT: vmov r0, s8 -; CHECK-FP16-NEXT: vmov.16 d1[0], r1 -; CHECK-FP16-NEXT: vmov.16 d1[1], r0 -; CHECK-FP16-NEXT: vmov r0, s4 -; CHECK-FP16-NEXT: vmov.16 d1[2], r0 +; CHECK-FP16-NEXT: vmov.16 d17[0], r1 +; CHECK-FP16-NEXT: vmov.16 d17[1], r0 +; CHECK-FP16-NEXT: vmov r0, s0 +; CHECK-FP16-NEXT: vmov.16 d17[2], r0 ; CHECK-FP16-NEXT: vmov r0, s11 -; CHECK-FP16-NEXT: vmov.16 d1[3], r0 +; CHECK-FP16-NEXT: vmov.16 d17[3], r0 +; CHECK-FP16-NEXT: vorr q0, q8, q8 ; CHECK-FP16-NEXT: bx lr entry: %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> diff --git a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll index 4026495a0f2b41..1bb31e7c30d71b 100644 --- a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll +++ b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll @@ -500,14 +500,15 @@ define void @conv_v8f16_to_i128( <8 x half> %a, ptr %store ) { ; CHECK-NEXT: vrev64.16 q8, q8 ; CHECK-NEXT: vadd.f16 q8, q9, q8 ; CHECK-NEXT: vrev32.16 q8, q8 -; CHECK-NEXT: vmov r12, r2, d17 -; CHECK-NEXT: vmov r3, r1, d16 +; CHECK-NEXT: vmov r1, r2, d17 +; CHECK-NEXT: vmov r12, r3, d16 ; CHECK-NEXT: subs lr, r2, #1 -; CHECK-NEXT: sbcs r2, r12, #0 +; CHECK-NEXT: str lr, [r0, #12] ; CHECK-NEXT: sbcs r1, r1, #0 -; CHECK-NEXT: sbc r3, r3, #0 -; CHECK-NEXT: str r3, [r0] -; CHECK-NEXT: stmib r0, {r1, r2, lr} +; CHECK-NEXT: str r1, [r0, #8] +; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: sbc r2, r12, #0 +; CHECK-NEXT: stm r0, {r2, r3} ; CHECK-NEXT: pop {r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll index d9663a1c148fc5..050e58d7ae39e7 100644 --- a/llvm/test/CodeGen/ARM/cttz.ll +++ b/llvm/test/CodeGen/ARM/cttz.ll @@ -223,11 +223,11 @@ define i64 @test_i64(i64 %a) { ; CHECK-6M: @ %bb.0: ; CHECK-6M-NEXT: .save {r4, r5, r7, lr} ; CHECK-6M-NEXT: push {r4, r5, r7, lr} -; CHECK-6M-NEXT: ldr r5, .LCPI3_0 -; CHECK-6M-NEXT: adr r4, .LCPI3_1 -; CHECK-6M-NEXT: movs r3, #32 +; CHECK-6M-NEXT: ldr r4, .LCPI3_0 +; CHECK-6M-NEXT: adr r3, .LCPI3_1 +; CHECK-6M-NEXT: movs r5, #32 ; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: mov r2, r3 +; CHECK-6M-NEXT: mov r2, r5 ; CHECK-6M-NEXT: bne .LBB3_5 ; CHECK-6M-NEXT: @ %bb.1: ; CHECK-6M-NEXT: cmp r1, #0 @@ -236,8 +236,8 @@ define i64 @test_i64(i64 %a) 
{ ; CHECK-6M-NEXT: cmp r0, #0 ; CHECK-6M-NEXT: bne .LBB3_4 ; CHECK-6M-NEXT: .LBB3_3: -; CHECK-6M-NEXT: adds r3, #32 -; CHECK-6M-NEXT: mov r2, r3 +; CHECK-6M-NEXT: adds r5, #32 +; CHECK-6M-NEXT: mov r2, r5 ; CHECK-6M-NEXT: .LBB3_4: ; CHECK-6M-NEXT: movs r1, #0 ; CHECK-6M-NEXT: mov r0, r2 @@ -245,17 +245,17 @@ define i64 @test_i64(i64 %a) { ; CHECK-6M-NEXT: .LBB3_5: ; CHECK-6M-NEXT: rsbs r2, r0, #0 ; CHECK-6M-NEXT: ands r2, r0 -; CHECK-6M-NEXT: muls r2, r5, r2 +; CHECK-6M-NEXT: muls r2, r4, r2 ; CHECK-6M-NEXT: lsrs r2, r2, #27 -; CHECK-6M-NEXT: ldrb r2, [r4, r2] +; CHECK-6M-NEXT: ldrb r2, [r3, r2] ; CHECK-6M-NEXT: cmp r1, #0 ; CHECK-6M-NEXT: beq .LBB3_2 ; CHECK-6M-NEXT: .LBB3_6: -; CHECK-6M-NEXT: rsbs r3, r1, #0 -; CHECK-6M-NEXT: ands r3, r1 -; CHECK-6M-NEXT: muls r5, r3, r5 -; CHECK-6M-NEXT: lsrs r1, r5, #27 -; CHECK-6M-NEXT: ldrb r3, [r4, r1] +; CHECK-6M-NEXT: rsbs r5, r1, #0 +; CHECK-6M-NEXT: ands r5, r1 +; CHECK-6M-NEXT: muls r4, r5, r4 +; CHECK-6M-NEXT: lsrs r1, r4, #27 +; CHECK-6M-NEXT: ldrb r5, [r3, r1] ; CHECK-6M-NEXT: cmp r0, #0 ; CHECK-6M-NEXT: beq .LBB3_3 ; CHECK-6M-NEXT: b .LBB3_4 @@ -270,19 +270,19 @@ define i64 @test_i64(i64 %a) { ; CHECK-8MBASE: @ %bb.0: ; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr} ; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr} -; CHECK-8MBASE-NEXT: movw r5, #46385 -; CHECK-8MBASE-NEXT: movt r5, #1916 -; CHECK-8MBASE-NEXT: adr r4, .LCPI3_0 -; CHECK-8MBASE-NEXT: movs r3, #32 -; CHECK-8MBASE-NEXT: mov r2, r3 +; CHECK-8MBASE-NEXT: movw r4, #46385 +; CHECK-8MBASE-NEXT: movt r4, #1916 +; CHECK-8MBASE-NEXT: adr r3, .LCPI3_0 +; CHECK-8MBASE-NEXT: movs r5, #32 +; CHECK-8MBASE-NEXT: mov r2, r5 ; CHECK-8MBASE-NEXT: cbnz r0, .LBB3_5 ; CHECK-8MBASE-NEXT: @ %bb.1: ; CHECK-8MBASE-NEXT: cbnz r1, .LBB3_6 ; CHECK-8MBASE-NEXT: .LBB3_2: ; CHECK-8MBASE-NEXT: cbnz r0, .LBB3_4 ; CHECK-8MBASE-NEXT: .LBB3_3: -; CHECK-8MBASE-NEXT: adds r3, #32 -; CHECK-8MBASE-NEXT: mov r2, r3 +; CHECK-8MBASE-NEXT: adds r5, #32 +; CHECK-8MBASE-NEXT: mov r2, r5 ; CHECK-8MBASE-NEXT: .LBB3_4: ; CHECK-8MBASE-NEXT: movs r1, #0 ; CHECK-8MBASE-NEXT: mov r0, r2 @@ -290,17 +290,17 @@ define i64 @test_i64(i64 %a) { ; CHECK-8MBASE-NEXT: .LBB3_5: ; CHECK-8MBASE-NEXT: rsbs r2, r0, #0 ; CHECK-8MBASE-NEXT: ands r2, r0 -; CHECK-8MBASE-NEXT: muls r2, r5, r2 +; CHECK-8MBASE-NEXT: muls r2, r4, r2 ; CHECK-8MBASE-NEXT: lsrs r2, r2, #27 -; CHECK-8MBASE-NEXT: ldrb r2, [r4, r2] +; CHECK-8MBASE-NEXT: ldrb r2, [r3, r2] ; CHECK-8MBASE-NEXT: cmp r1, #0 ; CHECK-8MBASE-NEXT: beq .LBB3_2 ; CHECK-8MBASE-NEXT: .LBB3_6: -; CHECK-8MBASE-NEXT: rsbs r3, r1, #0 -; CHECK-8MBASE-NEXT: ands r3, r1 -; CHECK-8MBASE-NEXT: muls r5, r3, r5 -; CHECK-8MBASE-NEXT: lsrs r1, r5, #27 -; CHECK-8MBASE-NEXT: ldrb r3, [r4, r1] +; CHECK-8MBASE-NEXT: rsbs r5, r1, #0 +; CHECK-8MBASE-NEXT: ands r5, r1 +; CHECK-8MBASE-NEXT: muls r4, r5, r4 +; CHECK-8MBASE-NEXT: lsrs r1, r4, #27 +; CHECK-8MBASE-NEXT: ldrb r5, [r3, r1] ; CHECK-8MBASE-NEXT: cmp r0, #0 ; CHECK-8MBASE-NEXT: beq .LBB3_3 ; CHECK-8MBASE-NEXT: b .LBB3_4 @@ -494,11 +494,11 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-6M: @ %bb.0: ; CHECK-6M-NEXT: .save {r4, r5, r7, lr} ; CHECK-6M-NEXT: push {r4, r5, r7, lr} -; CHECK-6M-NEXT: ldr r5, .LCPI7_0 -; CHECK-6M-NEXT: adr r4, .LCPI7_1 -; CHECK-6M-NEXT: movs r3, #32 +; CHECK-6M-NEXT: ldr r4, .LCPI7_0 +; CHECK-6M-NEXT: adr r3, .LCPI7_1 +; CHECK-6M-NEXT: movs r5, #32 ; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: mov r2, r3 +; CHECK-6M-NEXT: mov r2, r5 ; CHECK-6M-NEXT: bne .LBB7_5 ; CHECK-6M-NEXT: @ %bb.1: ; CHECK-6M-NEXT: cmp r1, #0 @@ -507,8 +507,8 @@ define i64 
@test_i64_zero_undef(i64 %a) { ; CHECK-6M-NEXT: cmp r0, #0 ; CHECK-6M-NEXT: bne .LBB7_4 ; CHECK-6M-NEXT: .LBB7_3: -; CHECK-6M-NEXT: adds r3, #32 -; CHECK-6M-NEXT: mov r2, r3 +; CHECK-6M-NEXT: adds r5, #32 +; CHECK-6M-NEXT: mov r2, r5 ; CHECK-6M-NEXT: .LBB7_4: ; CHECK-6M-NEXT: movs r1, #0 ; CHECK-6M-NEXT: mov r0, r2 @@ -516,17 +516,17 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-6M-NEXT: .LBB7_5: ; CHECK-6M-NEXT: rsbs r2, r0, #0 ; CHECK-6M-NEXT: ands r2, r0 -; CHECK-6M-NEXT: muls r2, r5, r2 +; CHECK-6M-NEXT: muls r2, r4, r2 ; CHECK-6M-NEXT: lsrs r2, r2, #27 -; CHECK-6M-NEXT: ldrb r2, [r4, r2] +; CHECK-6M-NEXT: ldrb r2, [r3, r2] ; CHECK-6M-NEXT: cmp r1, #0 ; CHECK-6M-NEXT: beq .LBB7_2 ; CHECK-6M-NEXT: .LBB7_6: -; CHECK-6M-NEXT: rsbs r3, r1, #0 -; CHECK-6M-NEXT: ands r3, r1 -; CHECK-6M-NEXT: muls r5, r3, r5 -; CHECK-6M-NEXT: lsrs r1, r5, #27 -; CHECK-6M-NEXT: ldrb r3, [r4, r1] +; CHECK-6M-NEXT: rsbs r5, r1, #0 +; CHECK-6M-NEXT: ands r5, r1 +; CHECK-6M-NEXT: muls r4, r5, r4 +; CHECK-6M-NEXT: lsrs r1, r4, #27 +; CHECK-6M-NEXT: ldrb r5, [r3, r1] ; CHECK-6M-NEXT: cmp r0, #0 ; CHECK-6M-NEXT: beq .LBB7_3 ; CHECK-6M-NEXT: b .LBB7_4 @@ -541,19 +541,19 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-8MBASE: @ %bb.0: ; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr} ; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr} -; CHECK-8MBASE-NEXT: movw r5, #46385 -; CHECK-8MBASE-NEXT: movt r5, #1916 -; CHECK-8MBASE-NEXT: adr r4, .LCPI7_0 -; CHECK-8MBASE-NEXT: movs r3, #32 -; CHECK-8MBASE-NEXT: mov r2, r3 +; CHECK-8MBASE-NEXT: movw r4, #46385 +; CHECK-8MBASE-NEXT: movt r4, #1916 +; CHECK-8MBASE-NEXT: adr r3, .LCPI7_0 +; CHECK-8MBASE-NEXT: movs r5, #32 +; CHECK-8MBASE-NEXT: mov r2, r5 ; CHECK-8MBASE-NEXT: cbnz r0, .LBB7_5 ; CHECK-8MBASE-NEXT: @ %bb.1: ; CHECK-8MBASE-NEXT: cbnz r1, .LBB7_6 ; CHECK-8MBASE-NEXT: .LBB7_2: ; CHECK-8MBASE-NEXT: cbnz r0, .LBB7_4 ; CHECK-8MBASE-NEXT: .LBB7_3: -; CHECK-8MBASE-NEXT: adds r3, #32 -; CHECK-8MBASE-NEXT: mov r2, r3 +; CHECK-8MBASE-NEXT: adds r5, #32 +; CHECK-8MBASE-NEXT: mov r2, r5 ; CHECK-8MBASE-NEXT: .LBB7_4: ; CHECK-8MBASE-NEXT: movs r1, #0 ; CHECK-8MBASE-NEXT: mov r0, r2 @@ -561,17 +561,17 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-8MBASE-NEXT: .LBB7_5: ; CHECK-8MBASE-NEXT: rsbs r2, r0, #0 ; CHECK-8MBASE-NEXT: ands r2, r0 -; CHECK-8MBASE-NEXT: muls r2, r5, r2 +; CHECK-8MBASE-NEXT: muls r2, r4, r2 ; CHECK-8MBASE-NEXT: lsrs r2, r2, #27 -; CHECK-8MBASE-NEXT: ldrb r2, [r4, r2] +; CHECK-8MBASE-NEXT: ldrb r2, [r3, r2] ; CHECK-8MBASE-NEXT: cmp r1, #0 ; CHECK-8MBASE-NEXT: beq .LBB7_2 ; CHECK-8MBASE-NEXT: .LBB7_6: -; CHECK-8MBASE-NEXT: rsbs r3, r1, #0 -; CHECK-8MBASE-NEXT: ands r3, r1 -; CHECK-8MBASE-NEXT: muls r5, r3, r5 -; CHECK-8MBASE-NEXT: lsrs r1, r5, #27 -; CHECK-8MBASE-NEXT: ldrb r3, [r4, r1] +; CHECK-8MBASE-NEXT: rsbs r5, r1, #0 +; CHECK-8MBASE-NEXT: ands r5, r1 +; CHECK-8MBASE-NEXT: muls r4, r5, r4 +; CHECK-8MBASE-NEXT: lsrs r1, r4, #27 +; CHECK-8MBASE-NEXT: ldrb r5, [r3, r1] ; CHECK-8MBASE-NEXT: cmp r0, #0 ; CHECK-8MBASE-NEXT: beq .LBB7_3 ; CHECK-8MBASE-NEXT: b .LBB7_4 diff --git a/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll b/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll index 656bce616ea048..7ce723b423b569 100644 --- a/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll +++ b/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll @@ -299,20 +299,20 @@ define <8 x half> @fadd_vselect_fneg_posk_v8f16(<8 x i32> %arg0, <8 x half> %x, ; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov.16 q2[1], r1 ; CHECK-NEXT: vcmp.i32 eq, q3, zr -; CHECK-NEXT: vpsel q1, q1, q0 
+; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov.16 q2[2], r4 -; CHECK-NEXT: vmov r3, r0, d2 +; CHECK-NEXT: vmov r3, r0, d0 ; CHECK-NEXT: vmov.16 q2[3], r5 ; CHECK-NEXT: vmov.16 q2[4], r3 -; CHECK-NEXT: vmov r6, lr, d3 +; CHECK-NEXT: vmov r6, lr, d1 ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vldrw.u32 q0, [r12] ; CHECK-NEXT: vmov.16 q2[6], r6 -; CHECK-NEXT: vmov.i16 q0, #0xc400 +; CHECK-NEXT: vmov.i16 q1, #0xc400 ; CHECK-NEXT: vmov.16 q2[7], lr ; CHECK-NEXT: add r0, sp, #48 ; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vsub.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 diff --git a/llvm/test/CodeGen/ARM/fpclamptosat.ll b/llvm/test/CodeGen/ARM/fpclamptosat.ll index 6c3c74a47ebf1e..031978a68bde5a 100644 --- a/llvm/test/CodeGen/ARM/fpclamptosat.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat.ll @@ -990,8 +990,8 @@ define i64 @stest_f64i64(double %x) { ; SOFT: @ %bb.0: @ %entry ; SOFT-NEXT: .save {r4, r5, r6, r7, lr} ; SOFT-NEXT: push {r4, r5, r6, r7, lr} -; SOFT-NEXT: .pad #12 -; SOFT-NEXT: sub sp, #12 +; SOFT-NEXT: .pad #4 +; SOFT-NEXT: sub sp, #4 ; SOFT-NEXT: bl __fixdfti ; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: movs r4, #1 @@ -1015,28 +1015,27 @@ define i64 @stest_f64i64(double %x) { ; SOFT-NEXT: .LBB18_3: @ %entry ; SOFT-NEXT: mov r2, r5 ; SOFT-NEXT: .LBB18_4: @ %entry -; SOFT-NEXT: str r2, [sp, #8] @ 4-byte Spill +; SOFT-NEXT: str r3, [sp] @ 4-byte Spill ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB18_6 ; SOFT-NEXT: @ %bb.5: @ %entry ; SOFT-NEXT: mov r1, r0 ; SOFT-NEXT: .LBB18_6: @ %entry -; SOFT-NEXT: str r3, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: mov r3, r2 ; SOFT-NEXT: mvns r0, r5 ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB18_8 ; SOFT-NEXT: @ %bb.7: @ %entry ; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: .LBB18_8: @ %entry -; SOFT-NEXT: lsls r3, r4, #31 +; SOFT-NEXT: lsls r2, r4, #31 ; SOFT-NEXT: rsbs r7, r6, #0 -; SOFT-NEXT: mov r7, r3 +; SOFT-NEXT: mov r7, r2 ; SOFT-NEXT: sbcs r7, r1 ; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; SOFT-NEXT: sbcs r7, r2 -; SOFT-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; SOFT-NEXT: sbcs r0, r2 +; SOFT-NEXT: sbcs r7, r3 +; SOFT-NEXT: ldr r3, [sp] @ 4-byte Reload +; SOFT-NEXT: sbcs r0, r3 ; SOFT-NEXT: bge .LBB18_15 ; SOFT-NEXT: @ %bb.9: @ %entry ; SOFT-NEXT: cmp r4, #0 @@ -1045,10 +1044,10 @@ define i64 @stest_f64i64(double %x) { ; SOFT-NEXT: cmp r4, #0 ; SOFT-NEXT: bne .LBB18_12 ; SOFT-NEXT: .LBB18_11: @ %entry -; SOFT-NEXT: mov r1, r3 +; SOFT-NEXT: mov r1, r2 ; SOFT-NEXT: .LBB18_12: @ %entry ; SOFT-NEXT: mov r0, r6 -; SOFT-NEXT: add sp, #12 +; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB18_13: @ %entry ; SOFT-NEXT: mov r7, r5 @@ -1356,8 +1355,8 @@ define i64 @stest_f32i64(float %x) { ; SOFT: @ %bb.0: @ %entry ; SOFT-NEXT: .save {r4, r5, r6, r7, lr} ; SOFT-NEXT: push {r4, r5, r6, r7, lr} -; SOFT-NEXT: .pad #12 -; SOFT-NEXT: sub sp, #12 +; SOFT-NEXT: .pad #4 +; SOFT-NEXT: sub sp, #4 ; SOFT-NEXT: bl __fixsfti ; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: movs r4, #1 @@ -1381,28 +1380,27 @@ define i64 @stest_f32i64(float %x) { ; SOFT-NEXT: .LBB21_3: @ %entry ; SOFT-NEXT: mov r2, r5 ; SOFT-NEXT: .LBB21_4: @ %entry -; SOFT-NEXT: str r2, [sp, #8] @ 4-byte Spill +; SOFT-NEXT: str r3, [sp] @ 4-byte Spill ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB21_6 ; SOFT-NEXT: @ %bb.5: @ %entry ; SOFT-NEXT: mov r1, r0 ; SOFT-NEXT: .LBB21_6: @ %entry -; SOFT-NEXT: str r3, [sp, #4] @ 4-byte Spill +; 
SOFT-NEXT: mov r3, r2 ; SOFT-NEXT: mvns r0, r5 ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB21_8 ; SOFT-NEXT: @ %bb.7: @ %entry ; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: .LBB21_8: @ %entry -; SOFT-NEXT: lsls r3, r4, #31 +; SOFT-NEXT: lsls r2, r4, #31 ; SOFT-NEXT: rsbs r7, r6, #0 -; SOFT-NEXT: mov r7, r3 +; SOFT-NEXT: mov r7, r2 ; SOFT-NEXT: sbcs r7, r1 ; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; SOFT-NEXT: sbcs r7, r2 -; SOFT-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; SOFT-NEXT: sbcs r0, r2 +; SOFT-NEXT: sbcs r7, r3 +; SOFT-NEXT: ldr r3, [sp] @ 4-byte Reload +; SOFT-NEXT: sbcs r0, r3 ; SOFT-NEXT: bge .LBB21_15 ; SOFT-NEXT: @ %bb.9: @ %entry ; SOFT-NEXT: cmp r4, #0 @@ -1411,10 +1409,10 @@ define i64 @stest_f32i64(float %x) { ; SOFT-NEXT: cmp r4, #0 ; SOFT-NEXT: bne .LBB21_12 ; SOFT-NEXT: .LBB21_11: @ %entry -; SOFT-NEXT: mov r1, r3 +; SOFT-NEXT: mov r1, r2 ; SOFT-NEXT: .LBB21_12: @ %entry ; SOFT-NEXT: mov r0, r6 -; SOFT-NEXT: add sp, #12 +; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB21_13: @ %entry ; SOFT-NEXT: mov r7, r5 @@ -1722,8 +1720,8 @@ define i64 @stest_f16i64(half %x) { ; SOFT: @ %bb.0: @ %entry ; SOFT-NEXT: .save {r4, r5, r6, r7, lr} ; SOFT-NEXT: push {r4, r5, r6, r7, lr} -; SOFT-NEXT: .pad #12 -; SOFT-NEXT: sub sp, #12 +; SOFT-NEXT: .pad #4 +; SOFT-NEXT: sub sp, #4 ; SOFT-NEXT: uxth r0, r0 ; SOFT-NEXT: bl __aeabi_h2f ; SOFT-NEXT: bl __fixsfti @@ -1749,28 +1747,27 @@ define i64 @stest_f16i64(half %x) { ; SOFT-NEXT: .LBB24_3: @ %entry ; SOFT-NEXT: mov r2, r5 ; SOFT-NEXT: .LBB24_4: @ %entry -; SOFT-NEXT: str r2, [sp, #8] @ 4-byte Spill +; SOFT-NEXT: str r3, [sp] @ 4-byte Spill ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB24_6 ; SOFT-NEXT: @ %bb.5: @ %entry ; SOFT-NEXT: mov r1, r0 ; SOFT-NEXT: .LBB24_6: @ %entry -; SOFT-NEXT: str r3, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: mov r3, r2 ; SOFT-NEXT: mvns r0, r5 ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB24_8 ; SOFT-NEXT: @ %bb.7: @ %entry ; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: .LBB24_8: @ %entry -; SOFT-NEXT: lsls r3, r4, #31 +; SOFT-NEXT: lsls r2, r4, #31 ; SOFT-NEXT: rsbs r7, r6, #0 -; SOFT-NEXT: mov r7, r3 +; SOFT-NEXT: mov r7, r2 ; SOFT-NEXT: sbcs r7, r1 ; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; SOFT-NEXT: sbcs r7, r2 -; SOFT-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; SOFT-NEXT: sbcs r0, r2 +; SOFT-NEXT: sbcs r7, r3 +; SOFT-NEXT: ldr r3, [sp] @ 4-byte Reload +; SOFT-NEXT: sbcs r0, r3 ; SOFT-NEXT: bge .LBB24_15 ; SOFT-NEXT: @ %bb.9: @ %entry ; SOFT-NEXT: cmp r4, #0 @@ -1779,10 +1776,10 @@ define i64 @stest_f16i64(half %x) { ; SOFT-NEXT: cmp r4, #0 ; SOFT-NEXT: bne .LBB24_12 ; SOFT-NEXT: .LBB24_11: @ %entry -; SOFT-NEXT: mov r1, r3 +; SOFT-NEXT: mov r1, r2 ; SOFT-NEXT: .LBB24_12: @ %entry ; SOFT-NEXT: mov r0, r6 -; SOFT-NEXT: add sp, #12 +; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB24_13: @ %entry ; SOFT-NEXT: mov r7, r5 @@ -2372,19 +2369,19 @@ define i32 @ustest_f32i32_mm(float %x) { ; SOFT-NEXT: .save {r7, lr} ; SOFT-NEXT: push {r7, lr} ; SOFT-NEXT: bl __aeabi_f2lz -; SOFT-NEXT: mov r2, r0 -; SOFT-NEXT: movs r0, #0 +; SOFT-NEXT: movs r2, #0 ; SOFT-NEXT: cmp r1, #1 ; SOFT-NEXT: blt .LBB32_2 ; SOFT-NEXT: @ %bb.1: @ %entry -; SOFT-NEXT: mvns r2, r0 +; SOFT-NEXT: mvns r0, r2 ; SOFT-NEXT: .LBB32_2: @ %entry ; SOFT-NEXT: asrs r3, r1, #31 ; SOFT-NEXT: ands r3, r1 ; SOFT-NEXT: bmi .LBB32_4 ; SOFT-NEXT: @ %bb.3: @ %entry -; SOFT-NEXT: mov r0, r2 +; SOFT-NEXT: mov r2, r0 ; SOFT-NEXT: .LBB32_4: @ %entry +; SOFT-NEXT: mov 
r0, r2 ; SOFT-NEXT: pop {r7, pc} ; ; VFP-LABEL: ustest_f32i32_mm: @@ -2519,19 +2516,19 @@ define i32 @ustest_f16i32_mm(half %x) { ; SOFT-NEXT: uxth r0, r0 ; SOFT-NEXT: bl __aeabi_h2f ; SOFT-NEXT: bl __aeabi_f2lz -; SOFT-NEXT: mov r2, r0 -; SOFT-NEXT: movs r0, #0 +; SOFT-NEXT: movs r2, #0 ; SOFT-NEXT: cmp r1, #1 ; SOFT-NEXT: blt .LBB35_2 ; SOFT-NEXT: @ %bb.1: @ %entry -; SOFT-NEXT: mvns r2, r0 +; SOFT-NEXT: mvns r0, r2 ; SOFT-NEXT: .LBB35_2: @ %entry ; SOFT-NEXT: asrs r3, r1, #31 ; SOFT-NEXT: ands r3, r1 ; SOFT-NEXT: bmi .LBB35_4 ; SOFT-NEXT: @ %bb.3: @ %entry -; SOFT-NEXT: mov r0, r2 +; SOFT-NEXT: mov r2, r0 ; SOFT-NEXT: .LBB35_4: @ %entry +; SOFT-NEXT: mov r0, r2 ; SOFT-NEXT: pop {r7, pc} ; ; VFP2-LABEL: ustest_f16i32_mm: @@ -3196,53 +3193,52 @@ define i64 @ustest_f64i64_mm(double %x) { ; SOFT-NEXT: .save {r4, r5, r7, lr} ; SOFT-NEXT: push {r4, r5, r7, lr} ; SOFT-NEXT: bl __fixdfti -; SOFT-NEXT: mov r4, r1 -; SOFT-NEXT: movs r1, #0 +; SOFT-NEXT: movs r5, #0 ; SOFT-NEXT: subs r2, r2, #1 ; SOFT-NEXT: mov r2, r3 -; SOFT-NEXT: sbcs r2, r1 +; SOFT-NEXT: sbcs r2, r5 ; SOFT-NEXT: blt .LBB47_2 ; SOFT-NEXT: @ %bb.1: @ %entry -; SOFT-NEXT: mov r5, r1 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: mov r2, r5 +; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: beq .LBB47_3 ; SOFT-NEXT: b .LBB47_4 ; SOFT-NEXT: .LBB47_2: -; SOFT-NEXT: movs r5, #1 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: movs r2, #1 +; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: bne .LBB47_4 ; SOFT-NEXT: .LBB47_3: @ %entry -; SOFT-NEXT: mov r3, r5 +; SOFT-NEXT: mov r3, r2 ; SOFT-NEXT: .LBB47_4: @ %entry -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: bne .LBB47_6 ; SOFT-NEXT: @ %bb.5: @ %entry -; SOFT-NEXT: mov r0, r5 +; SOFT-NEXT: mov r0, r2 ; SOFT-NEXT: .LBB47_6: @ %entry ; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: mov r2, r1 -; SOFT-NEXT: bpl .LBB47_10 +; SOFT-NEXT: mov r4, r5 +; SOFT-NEXT: bpl .LBB47_11 ; SOFT-NEXT: @ %bb.7: @ %entry -; SOFT-NEXT: cmp r5, #0 -; SOFT-NEXT: beq .LBB47_11 +; SOFT-NEXT: cmp r2, #0 +; SOFT-NEXT: beq .LBB47_12 ; SOFT-NEXT: .LBB47_8: @ %entry ; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: bpl .LBB47_12 +; SOFT-NEXT: bmi .LBB47_10 ; SOFT-NEXT: .LBB47_9: @ %entry -; SOFT-NEXT: mov r0, r2 -; SOFT-NEXT: pop {r4, r5, r7, pc} +; SOFT-NEXT: mov r5, r1 ; SOFT-NEXT: .LBB47_10: @ %entry -; SOFT-NEXT: mov r2, r0 -; SOFT-NEXT: cmp r5, #0 -; SOFT-NEXT: bne .LBB47_8 +; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: mov r1, r5 +; SOFT-NEXT: pop {r4, r5, r7, pc} ; SOFT-NEXT: .LBB47_11: @ %entry -; SOFT-NEXT: mov r4, r5 -; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: bmi .LBB47_9 +; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: cmp r2, #0 +; SOFT-NEXT: bne .LBB47_8 ; SOFT-NEXT: .LBB47_12: @ %entry -; SOFT-NEXT: mov r1, r4 -; SOFT-NEXT: mov r0, r2 -; SOFT-NEXT: pop {r4, r5, r7, pc} +; SOFT-NEXT: mov r1, r2 +; SOFT-NEXT: cmp r3, #0 +; SOFT-NEXT: bpl .LBB47_9 +; SOFT-NEXT: b .LBB47_10 ; ; VFP2-LABEL: ustest_f64i64_mm: ; VFP2: @ %bb.0: @ %entry @@ -3525,53 +3521,52 @@ define i64 @ustest_f32i64_mm(float %x) { ; SOFT-NEXT: .save {r4, r5, r7, lr} ; SOFT-NEXT: push {r4, r5, r7, lr} ; SOFT-NEXT: bl __fixsfti -; SOFT-NEXT: mov r4, r1 -; SOFT-NEXT: movs r1, #0 +; SOFT-NEXT: movs r5, #0 ; SOFT-NEXT: subs r2, r2, #1 ; SOFT-NEXT: mov r2, r3 -; SOFT-NEXT: sbcs r2, r1 +; SOFT-NEXT: sbcs r2, r5 ; SOFT-NEXT: blt .LBB50_2 ; SOFT-NEXT: @ %bb.1: @ %entry -; SOFT-NEXT: mov r5, r1 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: mov r2, r5 +; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: beq .LBB50_3 ; SOFT-NEXT: b .LBB50_4 ; SOFT-NEXT: .LBB50_2: -; SOFT-NEXT: movs r5, #1 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: movs r2, #1 +; 
SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: bne .LBB50_4 ; SOFT-NEXT: .LBB50_3: @ %entry -; SOFT-NEXT: mov r3, r5 +; SOFT-NEXT: mov r3, r2 ; SOFT-NEXT: .LBB50_4: @ %entry -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: bne .LBB50_6 ; SOFT-NEXT: @ %bb.5: @ %entry -; SOFT-NEXT: mov r0, r5 +; SOFT-NEXT: mov r0, r2 ; SOFT-NEXT: .LBB50_6: @ %entry ; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: mov r2, r1 -; SOFT-NEXT: bpl .LBB50_10 +; SOFT-NEXT: mov r4, r5 +; SOFT-NEXT: bpl .LBB50_11 ; SOFT-NEXT: @ %bb.7: @ %entry -; SOFT-NEXT: cmp r5, #0 -; SOFT-NEXT: beq .LBB50_11 +; SOFT-NEXT: cmp r2, #0 +; SOFT-NEXT: beq .LBB50_12 ; SOFT-NEXT: .LBB50_8: @ %entry ; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: bpl .LBB50_12 +; SOFT-NEXT: bmi .LBB50_10 ; SOFT-NEXT: .LBB50_9: @ %entry -; SOFT-NEXT: mov r0, r2 -; SOFT-NEXT: pop {r4, r5, r7, pc} +; SOFT-NEXT: mov r5, r1 ; SOFT-NEXT: .LBB50_10: @ %entry -; SOFT-NEXT: mov r2, r0 -; SOFT-NEXT: cmp r5, #0 -; SOFT-NEXT: bne .LBB50_8 +; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: mov r1, r5 +; SOFT-NEXT: pop {r4, r5, r7, pc} ; SOFT-NEXT: .LBB50_11: @ %entry -; SOFT-NEXT: mov r4, r5 -; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: bmi .LBB50_9 +; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: cmp r2, #0 +; SOFT-NEXT: bne .LBB50_8 ; SOFT-NEXT: .LBB50_12: @ %entry -; SOFT-NEXT: mov r1, r4 -; SOFT-NEXT: mov r0, r2 -; SOFT-NEXT: pop {r4, r5, r7, pc} +; SOFT-NEXT: mov r1, r2 +; SOFT-NEXT: cmp r3, #0 +; SOFT-NEXT: bpl .LBB50_9 +; SOFT-NEXT: b .LBB50_10 ; ; VFP2-LABEL: ustest_f32i64_mm: ; VFP2: @ %bb.0: @ %entry @@ -3870,53 +3865,52 @@ define i64 @ustest_f16i64_mm(half %x) { ; SOFT-NEXT: uxth r0, r0 ; SOFT-NEXT: bl __aeabi_h2f ; SOFT-NEXT: bl __fixsfti -; SOFT-NEXT: mov r4, r1 -; SOFT-NEXT: movs r1, #0 +; SOFT-NEXT: movs r5, #0 ; SOFT-NEXT: subs r2, r2, #1 ; SOFT-NEXT: mov r2, r3 -; SOFT-NEXT: sbcs r2, r1 +; SOFT-NEXT: sbcs r2, r5 ; SOFT-NEXT: blt .LBB53_2 ; SOFT-NEXT: @ %bb.1: @ %entry -; SOFT-NEXT: mov r5, r1 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: mov r2, r5 +; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: beq .LBB53_3 ; SOFT-NEXT: b .LBB53_4 ; SOFT-NEXT: .LBB53_2: -; SOFT-NEXT: movs r5, #1 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: movs r2, #1 +; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: bne .LBB53_4 ; SOFT-NEXT: .LBB53_3: @ %entry -; SOFT-NEXT: mov r3, r5 +; SOFT-NEXT: mov r3, r2 ; SOFT-NEXT: .LBB53_4: @ %entry -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: bne .LBB53_6 ; SOFT-NEXT: @ %bb.5: @ %entry -; SOFT-NEXT: mov r0, r5 +; SOFT-NEXT: mov r0, r2 ; SOFT-NEXT: .LBB53_6: @ %entry ; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: mov r2, r1 -; SOFT-NEXT: bpl .LBB53_10 +; SOFT-NEXT: mov r4, r5 +; SOFT-NEXT: bpl .LBB53_11 ; SOFT-NEXT: @ %bb.7: @ %entry -; SOFT-NEXT: cmp r5, #0 -; SOFT-NEXT: beq .LBB53_11 +; SOFT-NEXT: cmp r2, #0 +; SOFT-NEXT: beq .LBB53_12 ; SOFT-NEXT: .LBB53_8: @ %entry ; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: bpl .LBB53_12 +; SOFT-NEXT: bmi .LBB53_10 ; SOFT-NEXT: .LBB53_9: @ %entry -; SOFT-NEXT: mov r0, r2 -; SOFT-NEXT: pop {r4, r5, r7, pc} +; SOFT-NEXT: mov r5, r1 ; SOFT-NEXT: .LBB53_10: @ %entry -; SOFT-NEXT: mov r2, r0 -; SOFT-NEXT: cmp r5, #0 -; SOFT-NEXT: bne .LBB53_8 +; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: mov r1, r5 +; SOFT-NEXT: pop {r4, r5, r7, pc} ; SOFT-NEXT: .LBB53_11: @ %entry -; SOFT-NEXT: mov r4, r5 -; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: bmi .LBB53_9 +; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: cmp r2, #0 +; SOFT-NEXT: bne .LBB53_8 ; SOFT-NEXT: .LBB53_12: @ %entry -; SOFT-NEXT: mov r1, r4 -; SOFT-NEXT: mov r0, r2 -; SOFT-NEXT: pop {r4, r5, r7, pc} +; SOFT-NEXT: mov r1, r2 +; SOFT-NEXT: cmp r3, #0 +; 
SOFT-NEXT: bpl .LBB53_9 +; SOFT-NEXT: b .LBB53_10 ; ; VFP2-LABEL: ustest_f16i64_mm: ; VFP2: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll index 78090083a00264..9469c42e6c0381 100644 --- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll @@ -203,35 +203,36 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: mov r8, r1 -; CHECK-NEXT: vmov r6, s17 -; CHECK-NEXT: vmov r10, s19 +; CHECK-NEXT: vmov r5, s17 +; CHECK-NEXT: vmov r9, s19 ; CHECK-NEXT: vmov.32 d8[0], r7 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmov.32 d10[0], r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: vmov.32 d9[0], r0 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mvn r6, #-2147483648 -; CHECK-NEXT: subs r3, r7, r6 +; CHECK-NEXT: mvn r5, #-2147483648 +; CHECK-NEXT: subs r3, r7, r5 ; CHECK-NEXT: sbcs r3, r8, #0 ; CHECK-NEXT: vmov.32 d11[0], r0 ; CHECK-NEXT: mov r3, #0 ; CHECK-NEXT: adr r2, .LCPI3_0 ; CHECK-NEXT: movwlt r3, #1 -; CHECK-NEXT: subs r7, r5, r6 +; CHECK-NEXT: subs r7, r6, r5 ; CHECK-NEXT: sbcs r7, r4, #0 ; CHECK-NEXT: vmov.32 d11[1], r1 ; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: mvn r6, #0 ; CHECK-NEXT: movwlt r7, #1 ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: mvnne r7, #0 -; CHECK-NEXT: subs r0, r0, r6 +; CHECK-NEXT: subs r0, r0, r5 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128] ; CHECK-NEXT: mov r0, #0 @@ -241,17 +242,16 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: vmov.32 d10[1], r4 ; CHECK-NEXT: vdup.32 d17, r0 -; CHECK-NEXT: subs r0, r9, r6 -; CHECK-NEXT: sbcs r0, r11, #0 +; CHECK-NEXT: subs r0, r11, r5 +; CHECK-NEXT: sbcs r0, r10, #0 ; CHECK-NEXT: vdup.32 d16, r7 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: vbsl q8, q5, q9 ; CHECK-NEXT: movwlt r0, #1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov.32 d9[1], r11 +; CHECK-NEXT: vmov.32 d9[1], r10 ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mvn r6, #0 ; CHECK-NEXT: vdup.32 d21, r0 ; CHECK-NEXT: mvnne r3, #0 ; CHECK-NEXT: vmov.32 d8[1], r8 @@ -346,37 +346,37 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) { ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mvn r7, #0 -; CHECK-NEXT: subs r2, r5, r7 -; CHECK-NEXT: sbcs r2, r4, #0 +; CHECK-NEXT: mvn r3, #0 +; CHECK-NEXT: subs r7, r5, r3 +; CHECK-NEXT: sbcs r7, r4, #0 ; CHECK-NEXT: vmov.32 d10[0], r0 +; CHECK-NEXT: mov r7, #0 ; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: mov r3, #0 -; CHECK-NEXT: movwlo r2, #1 -; CHECK-NEXT: subs r0, r0, r7 +; CHECK-NEXT: movwlo r7, #1 +; CHECK-NEXT: subs r0, r0, r3 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movwlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: subs r1, r6, r7 +; CHECK-NEXT: subs r1, r6, r3 ; CHECK-NEXT: sbcs r1, r10, #0 ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movwlo r1, #1 -; CHECK-NEXT: subs r7, r9, r7 -; CHECK-NEXT: sbcs r7, r8, #0 -; CHECK-NEXT: movwlo r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mvnne r3, #0 +; CHECK-NEXT: subs r3, r9, r3 +; CHECK-NEXT: sbcs r3, r8, #0 +; CHECK-NEXT: movwlo r2, #1 +; CHECK-NEXT: cmp r2, #0 +; 
CHECK-NEXT: mvnne r2, #0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: vdup.32 d19, r1 -; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d17, r3 +; CHECK-NEXT: mvnne r7, #0 +; CHECK-NEXT: vdup.32 d17, r2 ; CHECK-NEXT: vdup.32 d18, r0 ; CHECK-NEXT: vand q10, q5, q9 -; CHECK-NEXT: vdup.32 d16, r2 +; CHECK-NEXT: vdup.32 d16, r7 ; CHECK-NEXT: vand q11, q4, q8 ; CHECK-NEXT: vorn q9, q10, q9 ; CHECK-NEXT: vorn q8, q11, q8 @@ -406,33 +406,33 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.32 d16[0], r2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov.32 d16[0], r0 ; CHECK-NEXT: mvn r4, #0 -; CHECK-NEXT: subs r2, r2, r4 -; CHECK-NEXT: vmov r8, s19 -; CHECK-NEXT: sbcs r2, r1, #0 +; CHECK-NEXT: subs r0, r0, r4 +; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: vmov.32 d17[0], r5 -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: vmov.i64 q5, #0xffffffff -; CHECK-NEXT: movwlt r2, #1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmov r8, s19 +; CHECK-NEXT: movwlt r0, #1 ; CHECK-NEXT: subs r3, r5, r4 ; CHECK-NEXT: sbcs r3, r6, #0 ; CHECK-NEXT: vmov.32 d17[1], r6 ; CHECK-NEXT: mov r3, #0 -; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: vmov.i64 q5, #0xffffffff ; CHECK-NEXT: movwlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: mvnne r3, #0 -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vdup.32 d19, r3 -; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d18, r2 +; CHECK-NEXT: mvnne r0, #0 +; CHECK-NEXT: vdup.32 d18, r0 +; CHECK-NEXT: mov r7, #0 ; CHECK-NEXT: vmov.32 d16[1], r1 ; CHECK-NEXT: vorr q4, q9, q9 ; CHECK-NEXT: vbsl q4, q8, q5 ; CHECK-NEXT: vmov r10, r9, d8 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmov.32 d12[0], r0 @@ -448,46 +448,46 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: vmov.32 d13[1], r1 ; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: vmov r5, r4, d9 ; CHECK-NEXT: movwlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: vmov.32 d12[1], r6 ; CHECK-NEXT: mvnne r2, #0 +; CHECK-NEXT: rsbs r6, r10, #0 ; CHECK-NEXT: vdup.32 d17, r0 -; CHECK-NEXT: rsbs r0, r10, #0 +; CHECK-NEXT: vmov r6, r5, d9 ; CHECK-NEXT: vdup.32 d16, r2 -; CHECK-NEXT: rscs r0, r9, #0 +; CHECK-NEXT: rscs r4, r9, #0 ; CHECK-NEXT: vbsl q8, q6, q5 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movwlt r4, #1 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: rscs r0, r1, #0 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movwlt r0, #1 -; CHECK-NEXT: vmov r1, r2, d16 -; CHECK-NEXT: vmov r3, r6, d17 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rscs r1, r2, #0 +; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: rscs r1, r3, #0 ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movwlt r1, #1 -; CHECK-NEXT: rsbs r2, r3, #0 -; CHECK-NEXT: rscs r2, r6, #0 -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: movwlt r2, #1 -; CHECK-NEXT: rsbs r3, r5, #0 -; CHECK-NEXT: rscs r3, r4, #0 +; CHECK-NEXT: rsbs r2, r6, #0 +; CHECK-NEXT: rscs r2, r5, #0 ; CHECK-NEXT: movwlt r7, #1 ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: mvnne r7, #0 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: mvnne r2, #0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: vmov.32 d21[0], r2 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov.32 d20[0], r1 ; CHECK-NEXT: mvnne r0, #0 +; CHECK-NEXT: 
vmov.32 d21[0], r1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: vmov.32 d20[0], r0 +; CHECK-NEXT: mvnne r4, #0 ; CHECK-NEXT: vmov.32 d19[0], r7 ; CHECK-NEXT: vand q8, q8, q10 -; CHECK-NEXT: vmov.32 d18[0], r0 +; CHECK-NEXT: vmov.32 d18[0], r4 ; CHECK-NEXT: vmovn.i64 d1, q8 ; CHECK-NEXT: vand q9, q4, q9 ; CHECK-NEXT: vmovn.i64 d0, q9 @@ -522,37 +522,37 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s18 ; CHECK-NEON-NEXT: vmov r10, s16 ; CHECK-NEON-NEXT: mov r8, r1 -; CHECK-NEON-NEXT: vmov r6, s20 +; CHECK-NEON-NEXT: vmov r4, s20 ; CHECK-NEON-NEXT: vmov.32 d8[0], r9 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: mov r5, r0 ; CHECK-NEON-NEXT: vmov.32 d10[0], r0 -; CHECK-NEON-NEXT: mov r0, r6 -; CHECK-NEON-NEXT: mov r4, r1 +; CHECK-NEON-NEXT: mov r0, r4 +; CHECK-NEON-NEXT: mov r6, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz -; CHECK-NEON-NEXT: mov r11, r0 +; CHECK-NEON-NEXT: mov r7, r0 ; CHECK-NEON-NEXT: vmov.32 d9[0], r0 ; CHECK-NEON-NEXT: mov r0, r10 -; CHECK-NEON-NEXT: mov r7, r1 +; CHECK-NEON-NEXT: mov r11, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz -; CHECK-NEON-NEXT: mvn r6, #-2147483648 -; CHECK-NEON-NEXT: subs r3, r9, r6 +; CHECK-NEON-NEXT: mvn r4, #-2147483648 +; CHECK-NEON-NEXT: subs r3, r9, r4 ; CHECK-NEON-NEXT: sbcs r3, r8, #0 ; CHECK-NEON-NEXT: vmov.32 d11[0], r0 ; CHECK-NEON-NEXT: mov r3, #0 ; CHECK-NEON-NEXT: adr r2, .LCPI6_0 ; CHECK-NEON-NEXT: movwlt r3, #1 -; CHECK-NEON-NEXT: subs r5, r5, r6 -; CHECK-NEON-NEXT: sbcs r5, r4, #0 +; CHECK-NEON-NEXT: subs r5, r5, r4 +; CHECK-NEON-NEXT: sbcs r5, r6, #0 ; CHECK-NEON-NEXT: vmov.32 d11[1], r1 ; CHECK-NEON-NEXT: mov r5, #0 ; CHECK-NEON-NEXT: movwlt r5, #1 ; CHECK-NEON-NEXT: cmp r5, #0 ; CHECK-NEON-NEXT: mvnne r5, #0 -; CHECK-NEON-NEXT: subs r0, r0, r6 +; CHECK-NEON-NEXT: subs r0, r0, r4 ; CHECK-NEON-NEXT: sbcs r0, r1, #0 ; CHECK-NEON-NEXT: vld1.64 {d18, d19}, [r2:128] ; CHECK-NEON-NEXT: mov r0, #0 @@ -560,19 +560,19 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: movwlt r0, #1 ; CHECK-NEON-NEXT: cmp r0, #0 ; CHECK-NEON-NEXT: mvnne r0, #0 -; CHECK-NEON-NEXT: vmov.32 d10[1], r4 +; CHECK-NEON-NEXT: vmov.32 d10[1], r6 +; CHECK-NEON-NEXT: mvn r6, #0 ; CHECK-NEON-NEXT: vdup.32 d17, r0 -; CHECK-NEON-NEXT: subs r0, r11, r6 -; CHECK-NEON-NEXT: sbcs r0, r7, #0 +; CHECK-NEON-NEXT: subs r0, r7, r4 +; CHECK-NEON-NEXT: sbcs r0, r11, #0 ; CHECK-NEON-NEXT: vdup.32 d16, r5 ; CHECK-NEON-NEXT: mov r0, #0 ; CHECK-NEON-NEXT: vbsl q8, q5, q9 ; CHECK-NEON-NEXT: movwlt r0, #1 ; CHECK-NEON-NEXT: cmp r0, #0 -; CHECK-NEON-NEXT: vmov.32 d9[1], r7 +; CHECK-NEON-NEXT: vmov.32 d9[1], r11 ; CHECK-NEON-NEXT: mvnne r0, #0 ; CHECK-NEON-NEXT: cmp r3, #0 -; CHECK-NEON-NEXT: mvn r6, #0 ; CHECK-NEON-NEXT: vdup.32 d21, r0 ; CHECK-NEON-NEXT: mvnne r3, #0 ; CHECK-NEON-NEXT: vmov.32 d8[1], r8 @@ -644,15 +644,15 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-FP16-NEXT: vmov.u16 r6, d0[1] ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: mov r4, r0 +; CHECK-FP16-NEXT: mov r7, r0 ; CHECK-FP16-NEXT: vmov.u16 r0, d8[2] ; CHECK-FP16-NEXT: mov r8, r1 -; CHECK-FP16-NEXT: vmov.32 d10[0], r4 +; CHECK-FP16-NEXT: vmov.32 d10[0], r7 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi ; CHECK-FP16-NEXT: vmov s0, r6 ; CHECK-FP16-NEXT: mov r5, r0 -; CHECK-FP16-NEXT: mov r7, r1 +; CHECK-FP16-NEXT: mov r4, r1 ; CHECK-FP16-NEXT: vmov.32 d12[0], r0 ; CHECK-FP16-NEXT: bl __fixhfdi ; 
CHECK-FP16-NEXT: mov r9, r0 @@ -662,19 +662,19 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi ; CHECK-FP16-NEXT: mvn r6, #-2147483648 -; CHECK-FP16-NEXT: subs r3, r4, r6 +; CHECK-FP16-NEXT: subs r3, r7, r6 ; CHECK-FP16-NEXT: sbcs r3, r8, #0 ; CHECK-FP16-NEXT: vmov.32 d13[0], r0 ; CHECK-FP16-NEXT: mov r3, #0 ; CHECK-FP16-NEXT: adr r2, .LCPI6_0 ; CHECK-FP16-NEXT: movwlt r3, #1 -; CHECK-FP16-NEXT: subs r5, r5, r6 -; CHECK-FP16-NEXT: sbcs r5, r7, #0 +; CHECK-FP16-NEXT: subs r7, r5, r6 +; CHECK-FP16-NEXT: sbcs r7, r4, #0 ; CHECK-FP16-NEXT: vmov.32 d13[1], r1 -; CHECK-FP16-NEXT: mov r5, #0 -; CHECK-FP16-NEXT: movwlt r5, #1 -; CHECK-FP16-NEXT: cmp r5, #0 -; CHECK-FP16-NEXT: mvnne r5, #0 +; CHECK-FP16-NEXT: mov r7, #0 +; CHECK-FP16-NEXT: movwlt r7, #1 +; CHECK-FP16-NEXT: cmp r7, #0 +; CHECK-FP16-NEXT: mvnne r7, #0 ; CHECK-FP16-NEXT: subs r0, r0, r6 ; CHECK-FP16-NEXT: sbcs r0, r1, #0 ; CHECK-FP16-NEXT: vld1.64 {d18, d19}, [r2:128] @@ -683,11 +683,11 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-FP16-NEXT: movwlt r0, #1 ; CHECK-FP16-NEXT: cmp r0, #0 ; CHECK-FP16-NEXT: mvnne r0, #0 -; CHECK-FP16-NEXT: vmov.32 d12[1], r7 +; CHECK-FP16-NEXT: vmov.32 d12[1], r4 ; CHECK-FP16-NEXT: vdup.32 d17, r0 ; CHECK-FP16-NEXT: subs r0, r9, r6 ; CHECK-FP16-NEXT: sbcs r0, r10, #0 -; CHECK-FP16-NEXT: vdup.32 d16, r5 +; CHECK-FP16-NEXT: vdup.32 d16, r7 ; CHECK-FP16-NEXT: mov r0, #0 ; CHECK-FP16-NEXT: vbsl q8, q6, q9 ; CHECK-FP16-NEXT: movwlt r0, #1 @@ -766,8 +766,8 @@ entry: define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NEON-LABEL: utesth_f16i32: ; CHECK-NEON: @ %bb.0: @ %entry -; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-NEON-NEXT: .vsave {d12, d13} ; CHECK-NEON-NEXT: vpush {d12, d13} ; CHECK-NEON-NEXT: .vsave {d8, d9, d10} @@ -778,7 +778,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: vmov.f32 s20, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: mov r10, r0 +; CHECK-NEON-NEXT: mov r4, r0 ; CHECK-NEON-NEXT: vmov r0, s18 ; CHECK-NEON-NEXT: mov r8, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f @@ -794,39 +794,39 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: mov r7, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f -; CHECK-NEON-NEXT: vmov.32 d9[0], r10 +; CHECK-NEON-NEXT: vmov.32 d9[0], r4 ; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: mvn r4, #0 -; CHECK-NEON-NEXT: subs r2, r5, r4 -; CHECK-NEON-NEXT: sbcs r2, r7, #0 +; CHECK-NEON-NEXT: mvn r3, #0 +; CHECK-NEON-NEXT: subs r5, r5, r3 +; CHECK-NEON-NEXT: sbcs r7, r7, #0 ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 +; CHECK-NEON-NEXT: mov r7, #0 ; CHECK-NEON-NEXT: mov r2, #0 -; CHECK-NEON-NEXT: mov r3, #0 -; CHECK-NEON-NEXT: movwlo r2, #1 -; CHECK-NEON-NEXT: subs r0, r0, r4 +; CHECK-NEON-NEXT: movwlo r7, #1 +; CHECK-NEON-NEXT: subs r0, r0, r3 ; CHECK-NEON-NEXT: sbcs r0, r1, #0 ; CHECK-NEON-NEXT: mov r0, #0 ; CHECK-NEON-NEXT: movwlo r0, #1 ; CHECK-NEON-NEXT: cmp r0, #0 ; CHECK-NEON-NEXT: mvnne r0, #0 -; CHECK-NEON-NEXT: subs r1, r10, r4 +; CHECK-NEON-NEXT: subs r1, r4, r3 ; CHECK-NEON-NEXT: sbcs r1, r8, #0 ; CHECK-NEON-NEXT: mov r1, #0 ; CHECK-NEON-NEXT: movwlo r1, #1 -; CHECK-NEON-NEXT: subs r7, r6, r4 -; CHECK-NEON-NEXT: sbcs r7, r9, #0 -; CHECK-NEON-NEXT: movwlo r3, #1 -; 
CHECK-NEON-NEXT: cmp r3, #0 -; CHECK-NEON-NEXT: mvnne r3, #0 +; CHECK-NEON-NEXT: subs r3, r6, r3 +; CHECK-NEON-NEXT: sbcs r3, r9, #0 +; CHECK-NEON-NEXT: movwlo r2, #1 +; CHECK-NEON-NEXT: cmp r2, #0 +; CHECK-NEON-NEXT: mvnne r2, #0 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: mvnne r1, #0 -; CHECK-NEON-NEXT: cmp r2, #0 +; CHECK-NEON-NEXT: cmp r7, #0 ; CHECK-NEON-NEXT: vdup.32 d19, r1 -; CHECK-NEON-NEXT: mvnne r2, #0 -; CHECK-NEON-NEXT: vdup.32 d17, r3 +; CHECK-NEON-NEXT: mvnne r7, #0 +; CHECK-NEON-NEXT: vdup.32 d17, r2 ; CHECK-NEON-NEXT: vdup.32 d18, r0 ; CHECK-NEON-NEXT: vand q10, q4, q9 -; CHECK-NEON-NEXT: vdup.32 d16, r2 +; CHECK-NEON-NEXT: vdup.32 d16, r7 ; CHECK-NEON-NEXT: vand q11, q6, q8 ; CHECK-NEON-NEXT: vorn q9, q10, q9 ; CHECK-NEON-NEXT: vorn q8, q11, q8 @@ -834,12 +834,12 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: vmovn.i64 d0, q8 ; CHECK-NEON-NEXT: vpop {d8, d9, d10} ; CHECK-NEON-NEXT: vpop {d12, d13} -; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} ; ; CHECK-FP16-LABEL: utesth_f16i32: ; CHECK-FP16: @ %bb.0: @ %entry -; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-FP16-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-FP16-NEXT: vpush {d8, d9, d10, d11} ; CHECK-FP16-NEXT: vmov.u16 r0, d0[1] @@ -847,10 +847,10 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-FP16-NEXT: vmov.u16 r5, d0[3] ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfdi -; CHECK-FP16-NEXT: mov r10, r0 +; CHECK-FP16-NEXT: mov r4, r0 ; CHECK-FP16-NEXT: vmov.u16 r0, d8[0] ; CHECK-FP16-NEXT: mov r8, r1 -; CHECK-FP16-NEXT: vmov.32 d11[0], r10 +; CHECK-FP16-NEXT: vmov.32 d11[0], r4 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfdi ; CHECK-FP16-NEXT: vmov s0, r5 @@ -864,44 +864,44 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-FP16-NEXT: vmov.32 d9[0], r5 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfdi -; CHECK-FP16-NEXT: mvn r4, #0 -; CHECK-FP16-NEXT: subs r2, r6, r4 -; CHECK-FP16-NEXT: sbcs r2, r7, #0 +; CHECK-FP16-NEXT: mvn r3, #0 +; CHECK-FP16-NEXT: subs r6, r6, r3 +; CHECK-FP16-NEXT: sbcs r7, r7, #0 ; CHECK-FP16-NEXT: vmov.32 d8[0], r0 +; CHECK-FP16-NEXT: mov r7, #0 ; CHECK-FP16-NEXT: mov r2, #0 -; CHECK-FP16-NEXT: mov r3, #0 -; CHECK-FP16-NEXT: movwlo r2, #1 -; CHECK-FP16-NEXT: subs r0, r0, r4 +; CHECK-FP16-NEXT: movwlo r7, #1 +; CHECK-FP16-NEXT: subs r0, r0, r3 ; CHECK-FP16-NEXT: sbcs r0, r1, #0 ; CHECK-FP16-NEXT: mov r0, #0 ; CHECK-FP16-NEXT: movwlo r0, #1 ; CHECK-FP16-NEXT: cmp r0, #0 ; CHECK-FP16-NEXT: mvnne r0, #0 -; CHECK-FP16-NEXT: subs r1, r5, r4 +; CHECK-FP16-NEXT: subs r1, r5, r3 ; CHECK-FP16-NEXT: sbcs r1, r9, #0 ; CHECK-FP16-NEXT: mov r1, #0 ; CHECK-FP16-NEXT: movwlo r1, #1 -; CHECK-FP16-NEXT: subs r7, r10, r4 -; CHECK-FP16-NEXT: sbcs r7, r8, #0 -; CHECK-FP16-NEXT: movwlo r3, #1 -; CHECK-FP16-NEXT: cmp r3, #0 -; CHECK-FP16-NEXT: mvnne r3, #0 +; CHECK-FP16-NEXT: subs r3, r4, r3 +; CHECK-FP16-NEXT: sbcs r3, r8, #0 +; CHECK-FP16-NEXT: movwlo r2, #1 +; CHECK-FP16-NEXT: cmp r2, #0 +; CHECK-FP16-NEXT: mvnne r2, #0 ; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: mvnne r1, #0 -; CHECK-FP16-NEXT: cmp r2, #0 +; CHECK-FP16-NEXT: cmp r7, #0 ; CHECK-FP16-NEXT: vdup.32 d19, r1 -; CHECK-FP16-NEXT: mvnne r2, #0 -; CHECK-FP16-NEXT: vdup.32 d17, r3 +; 
CHECK-FP16-NEXT: mvnne r7, #0 +; CHECK-FP16-NEXT: vdup.32 d17, r2 ; CHECK-FP16-NEXT: vdup.32 d18, r0 ; CHECK-FP16-NEXT: vand q10, q4, q9 -; CHECK-FP16-NEXT: vdup.32 d16, r2 +; CHECK-FP16-NEXT: vdup.32 d16, r7 ; CHECK-FP16-NEXT: vand q11, q5, q8 ; CHECK-FP16-NEXT: vorn q9, q10, q9 ; CHECK-FP16-NEXT: vorn q8, q11, q8 ; CHECK-FP16-NEXT: vmovn.i64 d1, q9 ; CHECK-FP16-NEXT: vmovn.i64 d0, q8 ; CHECK-FP16-NEXT: vpop {d8, d9, d10, d11} -; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} entry: %conv = fptoui <4 x half> %x to <4 x i64> %0 = icmp ult <4 x i64> %conv, @@ -975,42 +975,42 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: cmp r0, #0 ; CHECK-NEON-NEXT: mvnne r0, #0 ; CHECK-NEON-NEXT: cmp r2, #0 -; CHECK-NEON-NEXT: vmov.32 d12[1], r6 ; CHECK-NEON-NEXT: mvnne r2, #0 ; CHECK-NEON-NEXT: vdup.32 d17, r0 -; CHECK-NEON-NEXT: rsbs r0, r7, #0 +; CHECK-NEON-NEXT: vmov.32 d12[1], r6 +; CHECK-NEON-NEXT: rsbs r7, r7, #0 +; CHECK-NEON-NEXT: vmov r7, r6, d9 +; CHECK-NEON-NEXT: rscs r5, r10, #0 ; CHECK-NEON-NEXT: vdup.32 d16, r2 -; CHECK-NEON-NEXT: vmov r7, r5, d9 +; CHECK-NEON-NEXT: mov r5, #0 ; CHECK-NEON-NEXT: vbsl q8, q6, q5 -; CHECK-NEON-NEXT: rscs r0, r10, #0 +; CHECK-NEON-NEXT: movwlt r5, #1 +; CHECK-NEON-NEXT: vmov r0, r1, d16 +; CHECK-NEON-NEXT: vmov r2, r3, d17 +; CHECK-NEON-NEXT: rsbs r0, r0, #0 +; CHECK-NEON-NEXT: rscs r0, r1, #0 ; CHECK-NEON-NEXT: mov r0, #0 ; CHECK-NEON-NEXT: movwlt r0, #1 -; CHECK-NEON-NEXT: vmov r1, r2, d16 -; CHECK-NEON-NEXT: vmov r3, r6, d17 -; CHECK-NEON-NEXT: rsbs r1, r1, #0 -; CHECK-NEON-NEXT: rscs r1, r2, #0 +; CHECK-NEON-NEXT: rsbs r1, r2, #0 +; CHECK-NEON-NEXT: rscs r1, r3, #0 ; CHECK-NEON-NEXT: mov r1, #0 ; CHECK-NEON-NEXT: movwlt r1, #1 -; CHECK-NEON-NEXT: rsbs r2, r3, #0 +; CHECK-NEON-NEXT: rsbs r2, r7, #0 ; CHECK-NEON-NEXT: rscs r2, r6, #0 -; CHECK-NEON-NEXT: mov r2, #0 -; CHECK-NEON-NEXT: movwlt r2, #1 -; CHECK-NEON-NEXT: rsbs r3, r7, #0 -; CHECK-NEON-NEXT: rscs r3, r5, #0 ; CHECK-NEON-NEXT: movwlt r4, #1 ; CHECK-NEON-NEXT: cmp r4, #0 ; CHECK-NEON-NEXT: mvnne r4, #0 -; CHECK-NEON-NEXT: cmp r2, #0 -; CHECK-NEON-NEXT: mvnne r2, #0 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: mvnne r1, #0 -; CHECK-NEON-NEXT: vmov.32 d21[0], r2 ; CHECK-NEON-NEXT: cmp r0, #0 -; CHECK-NEON-NEXT: vmov.32 d20[0], r1 ; CHECK-NEON-NEXT: mvnne r0, #0 +; CHECK-NEON-NEXT: vmov.32 d21[0], r1 +; CHECK-NEON-NEXT: cmp r5, #0 +; CHECK-NEON-NEXT: vmov.32 d20[0], r0 +; CHECK-NEON-NEXT: mvnne r5, #0 ; CHECK-NEON-NEXT: vmov.32 d19[0], r4 ; CHECK-NEON-NEXT: vand q8, q8, q10 -; CHECK-NEON-NEXT: vmov.32 d18[0], r0 +; CHECK-NEON-NEXT: vmov.32 d18[0], r5 ; CHECK-NEON-NEXT: vmovn.i64 d1, q8 ; CHECK-NEON-NEXT: vand q9, q4, q9 ; CHECK-NEON-NEXT: vmovn.i64 d0, q9 @@ -1080,42 +1080,42 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-FP16-NEXT: cmp r0, #0 ; CHECK-FP16-NEXT: mvnne r0, #0 ; CHECK-FP16-NEXT: cmp r2, #0 -; CHECK-FP16-NEXT: vmov.32 d14[1], r5 ; CHECK-FP16-NEXT: mvnne r2, #0 -; CHECK-FP16-NEXT: vmov r5, r4, d11 ; CHECK-FP16-NEXT: vdup.32 d17, r0 -; CHECK-FP16-NEXT: rsbs r0, r9, #0 +; CHECK-FP16-NEXT: vmov.32 d14[1], r5 +; CHECK-FP16-NEXT: rsbs r7, r9, #0 +; CHECK-FP16-NEXT: vmov r7, r5, d11 +; CHECK-FP16-NEXT: rscs r4, r8, #0 ; CHECK-FP16-NEXT: vdup.32 d16, r2 -; CHECK-FP16-NEXT: rscs r0, r8, #0 +; CHECK-FP16-NEXT: mov r4, #0 ; CHECK-FP16-NEXT: vbsl q8, q7, q6 +; CHECK-FP16-NEXT: movwlt r4, #1 +; CHECK-FP16-NEXT: vmov r0, r1, d16 +; CHECK-FP16-NEXT: vmov r2, r3, d17 +; CHECK-FP16-NEXT: rsbs r0, r0, 
#0 +; CHECK-FP16-NEXT: rscs r0, r1, #0 ; CHECK-FP16-NEXT: mov r0, #0 ; CHECK-FP16-NEXT: movwlt r0, #1 -; CHECK-FP16-NEXT: vmov r1, r2, d16 -; CHECK-FP16-NEXT: vmov r3, r7, d17 -; CHECK-FP16-NEXT: rsbs r1, r1, #0 -; CHECK-FP16-NEXT: rscs r1, r2, #0 +; CHECK-FP16-NEXT: rsbs r1, r2, #0 +; CHECK-FP16-NEXT: rscs r1, r3, #0 ; CHECK-FP16-NEXT: mov r1, #0 ; CHECK-FP16-NEXT: movwlt r1, #1 -; CHECK-FP16-NEXT: rsbs r2, r3, #0 -; CHECK-FP16-NEXT: rscs r2, r7, #0 -; CHECK-FP16-NEXT: mov r2, #0 -; CHECK-FP16-NEXT: movwlt r2, #1 -; CHECK-FP16-NEXT: rsbs r3, r5, #0 -; CHECK-FP16-NEXT: rscs r3, r4, #0 +; CHECK-FP16-NEXT: rsbs r2, r7, #0 +; CHECK-FP16-NEXT: rscs r2, r5, #0 ; CHECK-FP16-NEXT: movwlt r6, #1 ; CHECK-FP16-NEXT: cmp r6, #0 ; CHECK-FP16-NEXT: mvnne r6, #0 -; CHECK-FP16-NEXT: cmp r2, #0 -; CHECK-FP16-NEXT: mvnne r2, #0 ; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: mvnne r1, #0 -; CHECK-FP16-NEXT: vmov.32 d21[0], r2 ; CHECK-FP16-NEXT: cmp r0, #0 -; CHECK-FP16-NEXT: vmov.32 d20[0], r1 ; CHECK-FP16-NEXT: mvnne r0, #0 +; CHECK-FP16-NEXT: vmov.32 d21[0], r1 +; CHECK-FP16-NEXT: cmp r4, #0 +; CHECK-FP16-NEXT: vmov.32 d20[0], r0 +; CHECK-FP16-NEXT: mvnne r4, #0 ; CHECK-FP16-NEXT: vmov.32 d19[0], r6 ; CHECK-FP16-NEXT: vand q8, q8, q10 -; CHECK-FP16-NEXT: vmov.32 d18[0], r0 +; CHECK-FP16-NEXT: vmov.32 d18[0], r4 ; CHECK-FP16-NEXT: vmovn.i64 d1, q8 ; CHECK-FP16-NEXT: vand q9, q5, q9 ; CHECK-FP16-NEXT: vmovn.i64 d0, q9 @@ -1263,8 +1263,8 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NEON: @ %bb.0: @ %entry ; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 @@ -1293,9 +1293,9 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov s22, r7 +; CHECK-NEON-NEXT: vmov s16, r7 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 -; CHECK-NEON-NEXT: vmov s30, r6 +; CHECK-NEON-NEXT: vmov s18, r6 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d13[1], r0 ; CHECK-NEON-NEXT: vmov r0, s28 @@ -1304,36 +1304,36 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vmov r0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 -; CHECK-NEON-NEXT: vmov.32 d8[0], r0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s18 +; CHECK-NEON-NEXT: vcvt.s32.f32 s18, s2 +; CHECK-NEON-NEXT: vmov.32 d10[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 ; CHECK-NEON-NEXT: mov r0, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov r0, s20 +; CHECK-NEON-NEXT: vmov r0, s18 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r4 ; CHECK-NEON-NEXT: vmov.i32 q8, #0x7fff ; CHECK-NEON-NEXT: vcvt.s32.f32 s2, s2 ; CHECK-NEON-NEXT: vmvn.i32 q9, #0x7fff -; CHECK-NEON-NEXT: vmov.32 d9[0], r0 +; CHECK-NEON-NEXT: vmov.32 d11[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s22 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s16 ; CHECK-NEON-NEXT: vmov.32 d12[1], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; 
CHECK-NEON-NEXT: vmin.s32 q10, q6, q8 ; CHECK-NEON-NEXT: vmax.s32 q10, q10, q9 -; CHECK-NEON-NEXT: vmov.32 d9[1], r0 +; CHECK-NEON-NEXT: vmov.32 d11[1], r0 ; CHECK-NEON-NEXT: vmov r0, s2 ; CHECK-NEON-NEXT: vmovn.i32 d1, q10 -; CHECK-NEON-NEXT: vmov.32 d8[1], r0 -; CHECK-NEON-NEXT: vmin.s32 q8, q4, q8 +; CHECK-NEON-NEXT: vmov.32 d10[1], r0 +; CHECK-NEON-NEXT: vmin.s32 q8, q5, q8 ; CHECK-NEON-NEXT: vmax.s32 q8, q8, q9 ; CHECK-NEON-NEXT: vmovn.i32 d0, q8 -; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} ; ; CHECK-FP16-LABEL: stest_f16i16: @@ -1509,8 +1509,8 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NEON: @ %bb.0: @ %entry ; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 @@ -1539,9 +1539,9 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov s22, r7 +; CHECK-NEON-NEXT: vmov s16, r7 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 -; CHECK-NEON-NEXT: vmov s30, r6 +; CHECK-NEON-NEXT: vmov s18, r6 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d13[1], r0 ; CHECK-NEON-NEXT: vmov r0, s28 @@ -1550,36 +1550,36 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vmov r0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 -; CHECK-NEON-NEXT: vmov.32 d8[0], r0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s18 +; CHECK-NEON-NEXT: vcvt.s32.f32 s18, s2 +; CHECK-NEON-NEXT: vmov.32 d10[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 ; CHECK-NEON-NEXT: mov r0, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov r0, s20 +; CHECK-NEON-NEXT: vmov r0, s18 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r4 ; CHECK-NEON-NEXT: vmov.i32 q8, #0xffff ; CHECK-NEON-NEXT: vcvt.s32.f32 s2, s2 ; CHECK-NEON-NEXT: vmov.i32 q9, #0x0 -; CHECK-NEON-NEXT: vmov.32 d9[0], r0 +; CHECK-NEON-NEXT: vmov.32 d11[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s22 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s16 ; CHECK-NEON-NEXT: vmov.32 d12[1], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmin.s32 q10, q6, q8 ; CHECK-NEON-NEXT: vmax.s32 q10, q10, q9 -; CHECK-NEON-NEXT: vmov.32 d9[1], r0 +; CHECK-NEON-NEXT: vmov.32 d11[1], r0 ; CHECK-NEON-NEXT: vmov r0, s2 ; CHECK-NEON-NEXT: vmovn.i32 d1, q10 -; CHECK-NEON-NEXT: vmov.32 d8[1], r0 -; CHECK-NEON-NEXT: vmin.s32 q8, q4, q8 +; CHECK-NEON-NEXT: vmov.32 d10[1], r0 +; CHECK-NEON-NEXT: vmin.s32 q8, q5, q8 ; CHECK-NEON-NEXT: vmax.s32 q8, q8, q9 ; CHECK-NEON-NEXT: vmovn.i32 d0, q8 -; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} ; ; CHECK-FP16-LABEL: ustest_f16i16: @@ -1718,25 +1718,25 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) { ; CHECK-NEXT: vorr q4, q0, 
q0 ; CHECK-NEXT: vorr d0, d9, d9 ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: subs r0, r2, #1 ; CHECK-NEXT: vorr d0, d8, d8 ; CHECK-NEXT: sbcs r0, r3, #0 ; CHECK-NEXT: mov r7, #0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: movwlo r7, #1 ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: moveq r5, r7 +; CHECK-NEXT: moveq r4, r7 ; CHECK-NEXT: bl __fixunsdfti ; CHECK-NEXT: subs r2, r2, #1 -; CHECK-NEXT: vmov.32 d1[0], r5 +; CHECK-NEXT: vmov.32 d1[0], r4 ; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: movwlo r6, #1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: moveq r0, r6 ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: movne r7, r4 +; CHECK-NEXT: movne r7, r5 ; CHECK-NEXT: vmov.32 d0[0], r0 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: vmov.32 d1[1], r7 @@ -1906,24 +1906,24 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) { ; CHECK-NEXT: vmov.f32 s0, s17 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vmov.f32 s0, s16 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: subs r0, r2, #1 ; CHECK-NEXT: mov r7, #0 ; CHECK-NEXT: sbcs r0, r3, #0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: movwlo r7, #1 ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: moveq r5, r7 +; CHECK-NEXT: moveq r4, r7 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: subs r2, r2, #1 -; CHECK-NEXT: vmov.32 d1[0], r5 +; CHECK-NEXT: vmov.32 d1[0], r4 ; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: movwlo r6, #1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: moveq r0, r6 ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: movne r7, r4 +; CHECK-NEXT: movne r7, r5 ; CHECK-NEXT: vmov.32 d0[0], r0 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: vmov.32 d1[1], r7 @@ -2163,33 +2163,33 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.f32 s16, s1 ; CHECK-NEON-NEXT: bl __aeabi_h2f -; CHECK-NEON-NEXT: mov r5, r0 +; CHECK-NEON-NEXT: mov r4, r0 ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: bl __fixunssfti -; CHECK-NEON-NEXT: mov r6, r0 +; CHECK-NEON-NEXT: mov r5, r0 ; CHECK-NEON-NEXT: subs r0, r2, #1 -; CHECK-NEON-NEXT: vmov s0, r5 +; CHECK-NEON-NEXT: vmov s0, r4 ; CHECK-NEON-NEXT: sbcs r0, r3, #0 -; CHECK-NEON-NEXT: mov r5, #0 -; CHECK-NEON-NEXT: mov r4, r1 -; CHECK-NEON-NEXT: movwlo r5, #1 -; CHECK-NEON-NEXT: cmp r5, #0 +; CHECK-NEON-NEXT: mov r4, #0 +; CHECK-NEON-NEXT: mov r6, r1 +; CHECK-NEON-NEXT: movwlo r4, #1 +; CHECK-NEON-NEXT: cmp r4, #0 ; CHECK-NEON-NEXT: mov r7, #0 -; CHECK-NEON-NEXT: moveq r6, r5 +; CHECK-NEON-NEXT: moveq r5, r4 ; CHECK-NEON-NEXT: bl __fixunssfti ; CHECK-NEON-NEXT: subs r2, r2, #1 -; CHECK-NEON-NEXT: vmov.32 d1[0], r6 +; CHECK-NEON-NEXT: vmov.32 d1[0], r5 ; CHECK-NEON-NEXT: sbcs r2, r3, #0 ; CHECK-NEON-NEXT: movwlo r7, #1 ; CHECK-NEON-NEXT: cmp r7, #0 ; CHECK-NEON-NEXT: moveq r0, r7 -; CHECK-NEON-NEXT: cmp r5, #0 -; CHECK-NEON-NEXT: movne r5, r4 +; CHECK-NEON-NEXT: cmp r4, #0 +; CHECK-NEON-NEXT: movne r4, r6 ; CHECK-NEON-NEXT: vmov.32 d0[0], r0 ; CHECK-NEON-NEXT: cmp r7, #0 -; CHECK-NEON-NEXT: vmov.32 d1[1], r5 +; CHECK-NEON-NEXT: vmov.32 d1[1], r4 ; CHECK-NEON-NEXT: movne r7, r1 ; CHECK-NEON-NEXT: vmov.32 d0[1], r7 ; CHECK-NEON-NEXT: vpop {d8} @@ -2200,33 +2200,33 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) { ; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-FP16-NEXT: vmov.u16 r0, d0[1] -; CHECK-FP16-NEXT: vmov.u16 r7, d0[0] +; CHECK-FP16-NEXT: vmov.u16 
r6, d0[0] ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfti -; CHECK-FP16-NEXT: mov r5, r0 +; CHECK-FP16-NEXT: mov r4, r0 ; CHECK-FP16-NEXT: subs r0, r2, #1 -; CHECK-FP16-NEXT: vmov s0, r7 +; CHECK-FP16-NEXT: vmov s0, r6 ; CHECK-FP16-NEXT: sbcs r0, r3, #0 -; CHECK-FP16-NEXT: mov r7, #0 -; CHECK-FP16-NEXT: mov r4, r1 -; CHECK-FP16-NEXT: movwlo r7, #1 -; CHECK-FP16-NEXT: cmp r7, #0 ; CHECK-FP16-NEXT: mov r6, #0 -; CHECK-FP16-NEXT: moveq r5, r7 +; CHECK-FP16-NEXT: mov r5, r1 +; CHECK-FP16-NEXT: movwlo r6, #1 +; CHECK-FP16-NEXT: cmp r6, #0 +; CHECK-FP16-NEXT: mov r7, #0 +; CHECK-FP16-NEXT: moveq r4, r6 ; CHECK-FP16-NEXT: bl __fixunshfti ; CHECK-FP16-NEXT: subs r2, r2, #1 -; CHECK-FP16-NEXT: vmov.32 d1[0], r5 +; CHECK-FP16-NEXT: vmov.32 d1[0], r4 ; CHECK-FP16-NEXT: sbcs r2, r3, #0 -; CHECK-FP16-NEXT: movwlo r6, #1 -; CHECK-FP16-NEXT: cmp r6, #0 -; CHECK-FP16-NEXT: moveq r0, r6 +; CHECK-FP16-NEXT: movwlo r7, #1 ; CHECK-FP16-NEXT: cmp r7, #0 -; CHECK-FP16-NEXT: movne r7, r4 -; CHECK-FP16-NEXT: vmov.32 d0[0], r0 +; CHECK-FP16-NEXT: moveq r0, r7 ; CHECK-FP16-NEXT: cmp r6, #0 -; CHECK-FP16-NEXT: vmov.32 d1[1], r7 -; CHECK-FP16-NEXT: movne r6, r1 -; CHECK-FP16-NEXT: vmov.32 d0[1], r6 +; CHECK-FP16-NEXT: movne r6, r5 +; CHECK-FP16-NEXT: vmov.32 d0[0], r0 +; CHECK-FP16-NEXT: cmp r7, #0 +; CHECK-FP16-NEXT: vmov.32 d1[1], r6 +; CHECK-FP16-NEXT: movne r7, r1 +; CHECK-FP16-NEXT: vmov.32 d0[1], r7 ; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r11, pc} entry: %conv = fptoui <2 x half> %x to <2 x i128> @@ -2881,27 +2881,27 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-NEON-NEXT: bl __aeabi_f2ulz ; CHECK-NEON-NEXT: mov r4, r1 ; CHECK-NEON-NEXT: vmov r1, s18 -; CHECK-NEON-NEXT: vmov r6, s16 +; CHECK-NEON-NEXT: vmov r5, s16 ; CHECK-NEON-NEXT: vmov.32 d9[0], r0 -; CHECK-NEON-NEXT: vmov r7, s20 +; CHECK-NEON-NEXT: vmov r6, s20 ; CHECK-NEON-NEXT: mov r0, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2ulz ; CHECK-NEON-NEXT: vmov.32 d10[0], r0 -; CHECK-NEON-NEXT: mov r0, r6 -; CHECK-NEON-NEXT: mov r5, r1 +; CHECK-NEON-NEXT: mov r0, r5 +; CHECK-NEON-NEXT: mov r7, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2ulz ; CHECK-NEON-NEXT: vmov.32 d11[0], r0 -; CHECK-NEON-NEXT: mov r0, r7 -; CHECK-NEON-NEXT: mov r6, r1 +; CHECK-NEON-NEXT: mov r0, r6 +; CHECK-NEON-NEXT: mov r5, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2ulz ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: vmov.i64 q8, #0xffffffff -; CHECK-NEON-NEXT: vmov.32 d11[1], r6 +; CHECK-NEON-NEXT: vmov.32 d11[1], r5 ; CHECK-NEON-NEXT: vmov.32 d9[1], r4 -; CHECK-NEON-NEXT: vmov.32 d10[1], r5 +; CHECK-NEON-NEXT: vmov.32 d10[1], r7 ; CHECK-NEON-NEXT: vmov.32 d8[1], r1 ; CHECK-NEON-NEXT: vqsub.u64 q9, q5, q8 ; CHECK-NEON-NEXT: vqsub.u64 q8, q4, q8 @@ -3255,8 +3255,8 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON: @ %bb.0: @ %entry ; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 @@ -3285,9 +3285,9 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; 
CHECK-NEON-NEXT: vmov s22, r7 +; CHECK-NEON-NEXT: vmov s16, r7 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 -; CHECK-NEON-NEXT: vmov s30, r6 +; CHECK-NEON-NEXT: vmov s18, r6 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d13[1], r0 ; CHECK-NEON-NEXT: vmov r0, s28 @@ -3296,36 +3296,36 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vmov r0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 -; CHECK-NEON-NEXT: vmov.32 d8[0], r0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s18 +; CHECK-NEON-NEXT: vcvt.s32.f32 s18, s2 +; CHECK-NEON-NEXT: vmov.32 d10[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 ; CHECK-NEON-NEXT: mov r0, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov r0, s20 +; CHECK-NEON-NEXT: vmov r0, s18 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r4 ; CHECK-NEON-NEXT: vmov.i32 q8, #0x7fff ; CHECK-NEON-NEXT: vcvt.s32.f32 s2, s2 ; CHECK-NEON-NEXT: vmvn.i32 q9, #0x7fff -; CHECK-NEON-NEXT: vmov.32 d9[0], r0 +; CHECK-NEON-NEXT: vmov.32 d11[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s22 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s16 ; CHECK-NEON-NEXT: vmov.32 d12[1], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmin.s32 q10, q6, q8 ; CHECK-NEON-NEXT: vmax.s32 q10, q10, q9 -; CHECK-NEON-NEXT: vmov.32 d9[1], r0 +; CHECK-NEON-NEXT: vmov.32 d11[1], r0 ; CHECK-NEON-NEXT: vmov r0, s2 ; CHECK-NEON-NEXT: vmovn.i32 d1, q10 -; CHECK-NEON-NEXT: vmov.32 d8[1], r0 -; CHECK-NEON-NEXT: vmin.s32 q8, q4, q8 +; CHECK-NEON-NEXT: vmov.32 d10[1], r0 +; CHECK-NEON-NEXT: vmin.s32 q8, q5, q8 ; CHECK-NEON-NEXT: vmax.s32 q8, q8, q9 ; CHECK-NEON-NEXT: vmovn.i32 d0, q8 -; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} ; ; CHECK-FP16-LABEL: stest_f16i16_mm: @@ -3498,8 +3498,8 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON: @ %bb.0: @ %entry ; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 @@ -3528,9 +3528,9 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov s22, r7 +; CHECK-NEON-NEXT: vmov s16, r7 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 -; CHECK-NEON-NEXT: vmov s30, r6 +; CHECK-NEON-NEXT: vmov s18, r6 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d13[1], r0 ; CHECK-NEON-NEXT: vmov r0, s28 @@ -3539,36 +3539,36 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vmov r0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 -; CHECK-NEON-NEXT: vmov.32 d8[0], r0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s18 +; CHECK-NEON-NEXT: vcvt.s32.f32 s18, s2 +; CHECK-NEON-NEXT: vmov.32 d10[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; 
CHECK-NEON-NEXT: vmov.32 d12[0], r0 ; CHECK-NEON-NEXT: mov r0, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov r0, s20 +; CHECK-NEON-NEXT: vmov r0, s18 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r4 ; CHECK-NEON-NEXT: vmov.i32 q8, #0xffff ; CHECK-NEON-NEXT: vcvt.s32.f32 s2, s2 ; CHECK-NEON-NEXT: vmov.i32 q9, #0x0 -; CHECK-NEON-NEXT: vmov.32 d9[0], r0 +; CHECK-NEON-NEXT: vmov.32 d11[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s22 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s16 ; CHECK-NEON-NEXT: vmov.32 d12[1], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmin.s32 q10, q6, q8 ; CHECK-NEON-NEXT: vmax.s32 q10, q10, q9 -; CHECK-NEON-NEXT: vmov.32 d9[1], r0 +; CHECK-NEON-NEXT: vmov.32 d11[1], r0 ; CHECK-NEON-NEXT: vmov r0, s2 ; CHECK-NEON-NEXT: vmovn.i32 d1, q10 -; CHECK-NEON-NEXT: vmov.32 d8[1], r0 -; CHECK-NEON-NEXT: vmin.s32 q8, q4, q8 +; CHECK-NEON-NEXT: vmov.32 d10[1], r0 +; CHECK-NEON-NEXT: vmin.s32 q8, q5, q8 ; CHECK-NEON-NEXT: vmax.s32 q8, q8, q9 ; CHECK-NEON-NEXT: vmovn.i32 d0, q8 -; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} ; ; CHECK-FP16-LABEL: ustest_f16i16_mm: @@ -3703,25 +3703,25 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) { ; CHECK-NEXT: vorr q4, q0, q0 ; CHECK-NEXT: vorr d0, d9, d9 ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: subs r0, r2, #1 ; CHECK-NEXT: vorr d0, d8, d8 ; CHECK-NEXT: sbcs r0, r3, #0 ; CHECK-NEXT: mov r7, #0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: movwlo r7, #1 ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: moveq r5, r7 +; CHECK-NEXT: moveq r4, r7 ; CHECK-NEXT: bl __fixunsdfti ; CHECK-NEXT: subs r2, r2, #1 -; CHECK-NEXT: vmov.32 d1[0], r5 +; CHECK-NEXT: vmov.32 d1[0], r4 ; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: movwlo r6, #1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: moveq r0, r6 ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: movne r7, r4 +; CHECK-NEXT: movne r7, r5 ; CHECK-NEXT: vmov.32 d0[0], r0 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: vmov.32 d1[1], r7 @@ -3874,24 +3874,24 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) { ; CHECK-NEXT: vmov.f32 s0, s17 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vmov.f32 s0, s16 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: subs r0, r2, #1 ; CHECK-NEXT: mov r7, #0 ; CHECK-NEXT: sbcs r0, r3, #0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: movwlo r7, #1 ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: moveq r5, r7 +; CHECK-NEXT: moveq r4, r7 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: subs r2, r2, #1 -; CHECK-NEXT: vmov.32 d1[0], r5 +; CHECK-NEXT: vmov.32 d1[0], r4 ; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: movwlo r6, #1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: moveq r0, r6 ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: movne r7, r4 +; CHECK-NEXT: movne r7, r5 ; CHECK-NEXT: vmov.32 d0[0], r0 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: vmov.32 d1[1], r7 @@ -4114,33 +4114,33 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.f32 s16, s1 ; CHECK-NEON-NEXT: bl __aeabi_h2f -; CHECK-NEON-NEXT: mov r5, r0 +; CHECK-NEON-NEXT: mov r4, r0 ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: bl __fixunssfti -; CHECK-NEON-NEXT: mov r6, r0 +; 
CHECK-NEON-NEXT: mov r5, r0 ; CHECK-NEON-NEXT: subs r0, r2, #1 -; CHECK-NEON-NEXT: vmov s0, r5 +; CHECK-NEON-NEXT: vmov s0, r4 ; CHECK-NEON-NEXT: sbcs r0, r3, #0 -; CHECK-NEON-NEXT: mov r5, #0 -; CHECK-NEON-NEXT: mov r4, r1 -; CHECK-NEON-NEXT: movwlo r5, #1 -; CHECK-NEON-NEXT: cmp r5, #0 +; CHECK-NEON-NEXT: mov r4, #0 +; CHECK-NEON-NEXT: mov r6, r1 +; CHECK-NEON-NEXT: movwlo r4, #1 +; CHECK-NEON-NEXT: cmp r4, #0 ; CHECK-NEON-NEXT: mov r7, #0 -; CHECK-NEON-NEXT: moveq r6, r5 +; CHECK-NEON-NEXT: moveq r5, r4 ; CHECK-NEON-NEXT: bl __fixunssfti ; CHECK-NEON-NEXT: subs r2, r2, #1 -; CHECK-NEON-NEXT: vmov.32 d1[0], r6 +; CHECK-NEON-NEXT: vmov.32 d1[0], r5 ; CHECK-NEON-NEXT: sbcs r2, r3, #0 ; CHECK-NEON-NEXT: movwlo r7, #1 ; CHECK-NEON-NEXT: cmp r7, #0 ; CHECK-NEON-NEXT: moveq r0, r7 -; CHECK-NEON-NEXT: cmp r5, #0 -; CHECK-NEON-NEXT: movne r5, r4 +; CHECK-NEON-NEXT: cmp r4, #0 +; CHECK-NEON-NEXT: movne r4, r6 ; CHECK-NEON-NEXT: vmov.32 d0[0], r0 ; CHECK-NEON-NEXT: cmp r7, #0 -; CHECK-NEON-NEXT: vmov.32 d1[1], r5 +; CHECK-NEON-NEXT: vmov.32 d1[1], r4 ; CHECK-NEON-NEXT: movne r7, r1 ; CHECK-NEON-NEXT: vmov.32 d0[1], r7 ; CHECK-NEON-NEXT: vpop {d8} @@ -4151,33 +4151,33 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) { ; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-FP16-NEXT: vmov.u16 r0, d0[1] -; CHECK-FP16-NEXT: vmov.u16 r7, d0[0] +; CHECK-FP16-NEXT: vmov.u16 r6, d0[0] ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfti -; CHECK-FP16-NEXT: mov r5, r0 +; CHECK-FP16-NEXT: mov r4, r0 ; CHECK-FP16-NEXT: subs r0, r2, #1 -; CHECK-FP16-NEXT: vmov s0, r7 +; CHECK-FP16-NEXT: vmov s0, r6 ; CHECK-FP16-NEXT: sbcs r0, r3, #0 -; CHECK-FP16-NEXT: mov r7, #0 -; CHECK-FP16-NEXT: mov r4, r1 -; CHECK-FP16-NEXT: movwlo r7, #1 -; CHECK-FP16-NEXT: cmp r7, #0 ; CHECK-FP16-NEXT: mov r6, #0 -; CHECK-FP16-NEXT: moveq r5, r7 +; CHECK-FP16-NEXT: mov r5, r1 +; CHECK-FP16-NEXT: movwlo r6, #1 +; CHECK-FP16-NEXT: cmp r6, #0 +; CHECK-FP16-NEXT: mov r7, #0 +; CHECK-FP16-NEXT: moveq r4, r6 ; CHECK-FP16-NEXT: bl __fixunshfti ; CHECK-FP16-NEXT: subs r2, r2, #1 -; CHECK-FP16-NEXT: vmov.32 d1[0], r5 +; CHECK-FP16-NEXT: vmov.32 d1[0], r4 ; CHECK-FP16-NEXT: sbcs r2, r3, #0 -; CHECK-FP16-NEXT: movwlo r6, #1 -; CHECK-FP16-NEXT: cmp r6, #0 -; CHECK-FP16-NEXT: moveq r0, r6 +; CHECK-FP16-NEXT: movwlo r7, #1 ; CHECK-FP16-NEXT: cmp r7, #0 -; CHECK-FP16-NEXT: movne r7, r4 -; CHECK-FP16-NEXT: vmov.32 d0[0], r0 +; CHECK-FP16-NEXT: moveq r0, r7 ; CHECK-FP16-NEXT: cmp r6, #0 -; CHECK-FP16-NEXT: vmov.32 d1[1], r7 -; CHECK-FP16-NEXT: movne r6, r1 -; CHECK-FP16-NEXT: vmov.32 d0[1], r6 +; CHECK-FP16-NEXT: movne r6, r5 +; CHECK-FP16-NEXT: vmov.32 d0[0], r0 +; CHECK-FP16-NEXT: cmp r7, #0 +; CHECK-FP16-NEXT: vmov.32 d1[1], r6 +; CHECK-FP16-NEXT: movne r7, r1 +; CHECK-FP16-NEXT: vmov.32 d0[1], r7 ; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r11, pc} entry: %conv = fptoui <2 x half> %x to <2 x i128> @@ -4196,28 +4196,28 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f -; CHECK-NEON-NEXT: mov r7, r0 +; CHECK-NEON-NEXT: mov r6, r0 ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: bl __fixsfti -; CHECK-NEON-NEXT: mov r6, r0 +; CHECK-NEON-NEXT: mov r7, r0 ; CHECK-NEON-NEXT: subs r0, r2, #1 -; CHECK-NEON-NEXT: vmov s0, r7 +; CHECK-NEON-NEXT: vmov s0, r6 ; CHECK-NEON-NEXT: sbcs r0, r3, #0 -; CHECK-NEON-NEXT: mov r7, #0 
+; CHECK-NEON-NEXT: mov r6, #0 ; CHECK-NEON-NEXT: mov r5, r3 -; CHECK-NEON-NEXT: movwlt r7, #1 -; CHECK-NEON-NEXT: cmp r7, #0 -; CHECK-NEON-NEXT: moveq r6, r7 -; CHECK-NEON-NEXT: moveq r5, r7 +; CHECK-NEON-NEXT: movwlt r6, #1 +; CHECK-NEON-NEXT: cmp r6, #0 +; CHECK-NEON-NEXT: moveq r7, r6 +; CHECK-NEON-NEXT: moveq r5, r6 ; CHECK-NEON-NEXT: cmp r5, #0 ; CHECK-NEON-NEXT: mov r8, r1 ; CHECK-NEON-NEXT: mov r4, #0 -; CHECK-NEON-NEXT: movwmi r6, #0 +; CHECK-NEON-NEXT: movwmi r7, #0 ; CHECK-NEON-NEXT: bl __fixsfti ; CHECK-NEON-NEXT: subs r2, r2, #1 -; CHECK-NEON-NEXT: vmov.32 d0[0], r6 +; CHECK-NEON-NEXT: vmov.32 d0[0], r7 ; CHECK-NEON-NEXT: sbcs r2, r3, #0 ; CHECK-NEON-NEXT: movwlt r4, #1 ; CHECK-NEON-NEXT: cmp r4, #0 @@ -4230,12 +4230,12 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) { ; CHECK-NEON-NEXT: cmp r3, #0 ; CHECK-NEON-NEXT: vmov.32 d1[0], r0 ; CHECK-NEON-NEXT: movwmi r4, #0 -; CHECK-NEON-NEXT: cmp r7, #0 -; CHECK-NEON-NEXT: movne r7, r8 +; CHECK-NEON-NEXT: cmp r6, #0 +; CHECK-NEON-NEXT: movne r6, r8 ; CHECK-NEON-NEXT: cmp r5, #0 ; CHECK-NEON-NEXT: vmov.32 d1[1], r4 -; CHECK-NEON-NEXT: movwmi r7, #0 -; CHECK-NEON-NEXT: vmov.32 d0[1], r7 +; CHECK-NEON-NEXT: movwmi r6, #0 +; CHECK-NEON-NEXT: vmov.32 d0[1], r6 ; CHECK-NEON-NEXT: vpop {d8} ; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, pc} ; diff --git a/llvm/test/CodeGen/ARM/fptoi-sat-store.ll b/llvm/test/CodeGen/ARM/fptoi-sat-store.ll index 67edf9855f372f..08065e45b67949 100644 --- a/llvm/test/CodeGen/ARM/fptoi-sat-store.ll +++ b/llvm/test/CodeGen/ARM/fptoi-sat-store.ll @@ -97,36 +97,39 @@ define void @test_signed_i32_f64(ptr %d, double %f) nounwind { ; SOFT-NEXT: mov r1, r5 ; SOFT-NEXT: bl __aeabi_d2iz ; SOFT-NEXT: cmp r4, #0 -; SOFT-NEXT: bne .LBB1_2 +; SOFT-NEXT: beq .LBB1_2 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: movs r0, #1 -; SOFT-NEXT: lsls r0, r0, #31 +; SOFT-NEXT: mov r1, r0 +; SOFT-NEXT: b .LBB1_3 ; SOFT-NEXT: .LBB1_2: -; SOFT-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; SOFT-NEXT: cmp r1, #0 -; SOFT-NEXT: bne .LBB1_4 -; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: mov r4, r0 -; SOFT-NEXT: b .LBB1_5 -; SOFT-NEXT: .LBB1_4: -; SOFT-NEXT: ldr r4, .LCPI1_3 +; SOFT-NEXT: movs r0, #1 +; SOFT-NEXT: lsls r1, r0, #31 +; SOFT-NEXT: .LBB1_3: +; SOFT-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; SOFT-NEXT: cmp r0, #0 +; SOFT-NEXT: bne .LBB1_5 +; SOFT-NEXT: @ %bb.4: +; SOFT-NEXT: mov r4, r1 +; SOFT-NEXT: b .LBB1_6 ; SOFT-NEXT: .LBB1_5: +; SOFT-NEXT: ldr r4, .LCPI1_3 +; SOFT-NEXT: .LBB1_6: ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r5 ; SOFT-NEXT: mov r2, r6 ; SOFT-NEXT: mov r3, r5 ; SOFT-NEXT: bl __aeabi_dcmpun ; SOFT-NEXT: cmp r0, #0 -; SOFT-NEXT: bne .LBB1_7 -; SOFT-NEXT: @ %bb.6: +; SOFT-NEXT: bne .LBB1_8 +; SOFT-NEXT: @ %bb.7: ; SOFT-NEXT: mov r7, r4 -; SOFT-NEXT: .LBB1_7: +; SOFT-NEXT: .LBB1_8: ; SOFT-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; SOFT-NEXT: str r7, [r0] ; SOFT-NEXT: add sp, #12 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .p2align 2 -; SOFT-NEXT: @ %bb.8: +; SOFT-NEXT: @ %bb.9: ; SOFT-NEXT: .LCPI1_0: ; SOFT-NEXT: .long 4290772992 @ 0xffc00000 ; SOFT-NEXT: .LCPI1_1: @@ -212,34 +215,34 @@ define void @test_unsigned_i32_f64(ptr %d, double %f) nounwind { ; SOFT-NEXT: .pad #12 ; SOFT-NEXT: sub sp, #12 ; SOFT-NEXT: mov r5, r3 -; SOFT-NEXT: mov r4, r2 +; SOFT-NEXT: mov r6, r2 ; SOFT-NEXT: str r0, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: ldr r2, .LCPI3_0 ; SOFT-NEXT: ldr r3, .LCPI3_1 -; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r5 ; SOFT-NEXT: bl __aeabi_dcmpgt ; SOFT-NEXT: str r0, [sp, #4] @ 4-byte Spill -; SOFT-NEXT: 
movs r6, #0 -; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: movs r7, #0 +; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r5 -; SOFT-NEXT: mov r2, r6 -; SOFT-NEXT: mov r3, r6 +; SOFT-NEXT: mov r2, r7 +; SOFT-NEXT: mov r3, r7 ; SOFT-NEXT: bl __aeabi_dcmpge -; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r5 ; SOFT-NEXT: bl __aeabi_d2uiz -; SOFT-NEXT: cmp r7, #0 +; SOFT-NEXT: cmp r4, #0 ; SOFT-NEXT: bne .LBB3_2 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: mov r0, r7 +; SOFT-NEXT: mov r0, r4 ; SOFT-NEXT: .LBB3_2: ; SOFT-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; SOFT-NEXT: cmp r1, #0 ; SOFT-NEXT: beq .LBB3_4 ; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: mvns r0, r6 +; SOFT-NEXT: mvns r0, r7 ; SOFT-NEXT: .LBB3_4: ; SOFT-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; SOFT-NEXT: str r0, [r1] diff --git a/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll b/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll index 4b27e804e6df9a..1b560192768845 100644 --- a/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll @@ -808,7 +808,7 @@ define i100 @test_signed_i100_f32(float %f) nounwind { ; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: ldr r1, .LCPI8_0 ; SOFT-NEXT: bl __aeabi_fcmpgt -; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: str r0, [sp, #16] @ 4-byte Spill ; SOFT-NEXT: movs r0, #241 ; SOFT-NEXT: lsls r1, r0, #24 ; SOFT-NEXT: mov r0, r6 @@ -816,24 +816,24 @@ define i100 @test_signed_i100_f32(float %f) nounwind { ; SOFT-NEXT: mov r7, r0 ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: bl __fixsfti -; SOFT-NEXT: str r1, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: str r1, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: str r2, [sp, #12] @ 4-byte Spill -; SOFT-NEXT: str r3, [sp] @ 4-byte Spill +; SOFT-NEXT: str r3, [sp, #4] @ 4-byte Spill ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB8_2 ; SOFT-NEXT: @ %bb.1: ; SOFT-NEXT: mov r0, r7 ; SOFT-NEXT: .LBB8_2: ; SOFT-NEXT: movs r5, #0 -; SOFT-NEXT: mvns r1, r5 -; SOFT-NEXT: str r4, [sp, #16] @ 4-byte Spill -; SOFT-NEXT: cmp r4, #0 -; SOFT-NEXT: str r1, [sp, #8] @ 4-byte Spill -; SOFT-NEXT: mov r4, r1 +; SOFT-NEXT: mvns r4, r5 +; SOFT-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; SOFT-NEXT: cmp r1, #0 +; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bne .LBB8_4 ; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: mov r1, r0 ; SOFT-NEXT: .LBB8_4: +; SOFT-NEXT: str r1, [sp] @ 4-byte Spill ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r6 ; SOFT-NEXT: bl __aeabi_fcmpun @@ -842,20 +842,19 @@ define i100 @test_signed_i100_f32(float %f) nounwind { ; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: bne .LBB8_6 ; SOFT-NEXT: @ %bb.5: -; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: ldr r0, [sp] @ 4-byte Reload ; SOFT-NEXT: .LBB8_6: ; SOFT-NEXT: cmp r7, #0 -; SOFT-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; SOFT-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: bne .LBB8_8 ; SOFT-NEXT: @ %bb.7: -; SOFT-NEXT: str r7, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: str r7, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: .LBB8_8: -; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; SOFT-NEXT: cmp r4, #0 -; SOFT-NEXT: mov r2, r6 +; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: bne .LBB8_10 ; SOFT-NEXT: @ %bb.9: -; SOFT-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; SOFT-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; SOFT-NEXT: .LBB8_10: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: mov r1, r5 @@ -864,51 +863,51 @@ define i100 @test_signed_i100_f32(float %f) nounwind { ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB8_19 ; SOFT-NEXT: .LBB8_12: -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB8_14 ; SOFT-NEXT: 
.LBB8_13: -; SOFT-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; SOFT-NEXT: ldr r4, [sp, #12] @ 4-byte Reload ; SOFT-NEXT: .LBB8_14: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: mov r2, r5 ; SOFT-NEXT: bne .LBB8_16 ; SOFT-NEXT: @ %bb.15: -; SOFT-NEXT: mov r2, r6 +; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: .LBB8_16: ; SOFT-NEXT: movs r4, #7 ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB8_20 ; SOFT-NEXT: @ %bb.17: -; SOFT-NEXT: ldr r7, [sp] @ 4-byte Reload -; SOFT-NEXT: b .LBB8_21 +; SOFT-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: beq .LBB8_21 +; SOFT-NEXT: b .LBB8_22 ; SOFT-NEXT: .LBB8_18: ; SOFT-NEXT: mov r1, r2 ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB8_12 ; SOFT-NEXT: .LBB8_19: ; SOFT-NEXT: str r7, [sp, #12] @ 4-byte Spill -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB8_13 ; SOFT-NEXT: b .LBB8_14 ; SOFT-NEXT: .LBB8_20: ; SOFT-NEXT: mvns r7, r4 -; SOFT-NEXT: .LBB8_21: -; SOFT-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: cmp r6, #0 -; SOFT-NEXT: bne .LBB8_23 -; SOFT-NEXT: @ %bb.22: +; SOFT-NEXT: bne .LBB8_22 +; SOFT-NEXT: .LBB8_21: ; SOFT-NEXT: mov r4, r7 -; SOFT-NEXT: .LBB8_23: +; SOFT-NEXT: .LBB8_22: ; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: bne .LBB8_25 -; SOFT-NEXT: @ %bb.24: +; SOFT-NEXT: bne .LBB8_24 +; SOFT-NEXT: @ %bb.23: ; SOFT-NEXT: mov r5, r4 -; SOFT-NEXT: .LBB8_25: +; SOFT-NEXT: .LBB8_24: ; SOFT-NEXT: mov r3, r5 ; SOFT-NEXT: add sp, #20 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .p2align 2 -; SOFT-NEXT: @ %bb.26: +; SOFT-NEXT: @ %bb.25: ; SOFT-NEXT: .LCPI8_0: ; SOFT-NEXT: .long 1895825407 @ 0x70ffffff ; @@ -990,7 +989,7 @@ define i128 @test_signed_i128_f32(float %f) nounwind { ; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: ldr r1, .LCPI9_0 ; SOFT-NEXT: bl __aeabi_fcmpgt -; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: str r0, [sp, #16] @ 4-byte Spill ; SOFT-NEXT: movs r0, #255 ; SOFT-NEXT: lsls r1, r0, #24 ; SOFT-NEXT: mov r0, r6 @@ -998,24 +997,24 @@ define i128 @test_signed_i128_f32(float %f) nounwind { ; SOFT-NEXT: mov r7, r0 ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: bl __fixsfti -; SOFT-NEXT: str r1, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: str r1, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: str r2, [sp, #12] @ 4-byte Spill -; SOFT-NEXT: str r3, [sp] @ 4-byte Spill +; SOFT-NEXT: str r3, [sp, #4] @ 4-byte Spill ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB9_2 ; SOFT-NEXT: @ %bb.1: ; SOFT-NEXT: mov r0, r7 ; SOFT-NEXT: .LBB9_2: ; SOFT-NEXT: movs r5, #0 -; SOFT-NEXT: mvns r1, r5 -; SOFT-NEXT: str r4, [sp, #16] @ 4-byte Spill -; SOFT-NEXT: cmp r4, #0 -; SOFT-NEXT: str r1, [sp, #8] @ 4-byte Spill -; SOFT-NEXT: mov r4, r1 +; SOFT-NEXT: mvns r4, r5 +; SOFT-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; SOFT-NEXT: cmp r1, #0 +; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bne .LBB9_4 ; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: mov r1, r0 ; SOFT-NEXT: .LBB9_4: +; SOFT-NEXT: str r1, [sp] @ 4-byte Spill ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r6 ; SOFT-NEXT: bl __aeabi_fcmpun @@ -1024,20 +1023,19 @@ define i128 @test_signed_i128_f32(float %f) nounwind { ; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: bne .LBB9_6 ; SOFT-NEXT: @ %bb.5: -; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: ldr r0, [sp] @ 4-byte Reload ; SOFT-NEXT: .LBB9_6: ; SOFT-NEXT: cmp r7, #0 -; SOFT-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; SOFT-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: bne .LBB9_8 ; SOFT-NEXT: @ %bb.7: -; SOFT-NEXT: str r7, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: str r7, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: .LBB9_8: -; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; SOFT-NEXT: cmp r4, 
#0 -; SOFT-NEXT: mov r2, r6 +; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: bne .LBB9_10 ; SOFT-NEXT: @ %bb.9: -; SOFT-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; SOFT-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; SOFT-NEXT: .LBB9_10: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: mov r1, r5 @@ -1046,22 +1044,22 @@ define i128 @test_signed_i128_f32(float %f) nounwind { ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB9_19 ; SOFT-NEXT: .LBB9_12: -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB9_14 ; SOFT-NEXT: .LBB9_13: -; SOFT-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; SOFT-NEXT: ldr r4, [sp, #12] @ 4-byte Reload ; SOFT-NEXT: .LBB9_14: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: mov r2, r5 ; SOFT-NEXT: bne .LBB9_16 ; SOFT-NEXT: @ %bb.15: -; SOFT-NEXT: mov r2, r6 +; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: .LBB9_16: ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB9_20 ; SOFT-NEXT: @ %bb.17: -; SOFT-NEXT: ldr r6, [sp] @ 4-byte Reload -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB9_21 ; SOFT-NEXT: b .LBB9_22 ; SOFT-NEXT: .LBB9_18: @@ -1070,22 +1068,21 @@ define i128 @test_signed_i128_f32(float %f) nounwind { ; SOFT-NEXT: bne .LBB9_12 ; SOFT-NEXT: .LBB9_19: ; SOFT-NEXT: str r7, [sp, #12] @ 4-byte Spill -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB9_13 ; SOFT-NEXT: b .LBB9_14 ; SOFT-NEXT: .LBB9_20: ; SOFT-NEXT: movs r4, #1 -; SOFT-NEXT: lsls r6, r4, #31 -; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: lsls r4, r4, #31 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB9_22 ; SOFT-NEXT: .LBB9_21: -; SOFT-NEXT: ldr r6, .LCPI9_1 +; SOFT-NEXT: ldr r4, .LCPI9_1 ; SOFT-NEXT: .LBB9_22: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: bne .LBB9_24 ; SOFT-NEXT: @ %bb.23: -; SOFT-NEXT: mov r5, r6 +; SOFT-NEXT: mov r5, r4 ; SOFT-NEXT: .LBB9_24: ; SOFT-NEXT: mov r3, r5 ; SOFT-NEXT: add sp, #20 @@ -1298,19 +1295,19 @@ define i8 @test_signed_i8_f64(double %f) nounwind { ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r5 ; SOFT-NEXT: bl __aeabi_d2iz -; SOFT-NEXT: movs r1, #127 +; SOFT-NEXT: movs r2, #127 ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB11_2 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: mvns r0, r1 +; SOFT-NEXT: mvns r0, r2 ; SOFT-NEXT: .LBB11_2: -; SOFT-NEXT: ldr r2, [sp] @ 4-byte Reload -; SOFT-NEXT: cmp r2, #0 +; SOFT-NEXT: ldr r1, [sp] @ 4-byte Reload +; SOFT-NEXT: cmp r1, #0 ; SOFT-NEXT: bne .LBB11_4 ; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: mov r1, r0 +; SOFT-NEXT: mov r2, r0 ; SOFT-NEXT: .LBB11_4: -; SOFT-NEXT: mov r7, r1 +; SOFT-NEXT: mov r7, r2 ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r5 ; SOFT-NEXT: mov r2, r6 @@ -1762,35 +1759,38 @@ define i32 @test_signed_i32_f64(double %f) nounwind { ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bl __aeabi_d2iz ; SOFT-NEXT: cmp r7, #0 -; SOFT-NEXT: bne .LBB15_2 +; SOFT-NEXT: beq .LBB15_2 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: movs r0, #1 -; SOFT-NEXT: lsls r0, r0, #31 +; SOFT-NEXT: mov r1, r0 +; SOFT-NEXT: b .LBB15_3 ; SOFT-NEXT: .LBB15_2: -; SOFT-NEXT: ldr r1, [sp] @ 4-byte Reload -; SOFT-NEXT: cmp r1, #0 -; SOFT-NEXT: bne .LBB15_4 -; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: b .LBB15_5 -; SOFT-NEXT: .LBB15_4: -; SOFT-NEXT: ldr r7, .LCPI15_3 +; SOFT-NEXT: movs r0, #1 +; SOFT-NEXT: lsls r1, r0, #31 +; SOFT-NEXT: .LBB15_3: +; SOFT-NEXT: ldr r0, [sp] @ 4-byte Reload +; SOFT-NEXT: cmp r0, #0 +; SOFT-NEXT: bne .LBB15_5 +; SOFT-NEXT: @ %bb.4: +; SOFT-NEXT: mov r7, r1 +; SOFT-NEXT: b .LBB15_6 ; SOFT-NEXT: .LBB15_5: +; SOFT-NEXT: ldr r7, .LCPI15_3 +; 
SOFT-NEXT: .LBB15_6: ; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: mov r2, r5 ; SOFT-NEXT: mov r3, r4 ; SOFT-NEXT: bl __aeabi_dcmpun ; SOFT-NEXT: cmp r0, #0 -; SOFT-NEXT: bne .LBB15_7 -; SOFT-NEXT: @ %bb.6: +; SOFT-NEXT: bne .LBB15_8 +; SOFT-NEXT: @ %bb.7: ; SOFT-NEXT: mov r6, r7 -; SOFT-NEXT: .LBB15_7: +; SOFT-NEXT: .LBB15_8: ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .p2align 2 -; SOFT-NEXT: @ %bb.8: +; SOFT-NEXT: @ %bb.9: ; SOFT-NEXT: .LCPI15_0: ; SOFT-NEXT: .long 4290772992 @ 0xffc00000 ; SOFT-NEXT: .LCPI15_1: @@ -1824,24 +1824,24 @@ define i50 @test_signed_i50_f64(double %f) nounwind { ; SOFT-NEXT: push {r4, r5, r6, r7, lr} ; SOFT-NEXT: .pad #12 ; SOFT-NEXT: sub sp, #12 -; SOFT-NEXT: mov r7, r1 -; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r6, r1 +; SOFT-NEXT: mov r7, r0 ; SOFT-NEXT: movs r0, #15 ; SOFT-NEXT: mvns r2, r0 ; SOFT-NEXT: ldr r3, .LCPI16_0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r0, r7 ; SOFT-NEXT: bl __aeabi_dcmpgt ; SOFT-NEXT: str r0, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: movs r0, #195 ; SOFT-NEXT: lsls r3, r0, #24 ; SOFT-NEXT: movs r4, #0 -; SOFT-NEXT: mov r0, r6 -; SOFT-NEXT: mov r1, r7 +; SOFT-NEXT: mov r0, r7 +; SOFT-NEXT: mov r1, r6 ; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: bl __aeabi_dcmpge ; SOFT-NEXT: mov r5, r0 -; SOFT-NEXT: mov r0, r6 -; SOFT-NEXT: mov r1, r7 +; SOFT-NEXT: mov r0, r7 +; SOFT-NEXT: mov r1, r6 ; SOFT-NEXT: bl __aeabi_d2lz ; SOFT-NEXT: mov r2, r0 ; SOFT-NEXT: str r1, [sp, #4] @ 4-byte Spill @@ -1860,10 +1860,10 @@ define i50 @test_signed_i50_f64(double %f) nounwind { ; SOFT-NEXT: mvns r0, r4 ; SOFT-NEXT: str r0, [sp] @ 4-byte Spill ; SOFT-NEXT: .LBB16_5: -; SOFT-NEXT: mov r0, r6 -; SOFT-NEXT: mov r1, r7 -; SOFT-NEXT: mov r2, r6 -; SOFT-NEXT: mov r3, r7 +; SOFT-NEXT: mov r0, r7 +; SOFT-NEXT: mov r1, r6 +; SOFT-NEXT: mov r2, r7 +; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: bl __aeabi_dcmpun ; SOFT-NEXT: mov r1, r0 ; SOFT-NEXT: cmp r0, #0 @@ -1989,7 +1989,7 @@ define i64 @test_signed_i64_f64(double %f) nounwind { ; SOFT-NEXT: push {r4, r5, r6, r7, lr} ; SOFT-NEXT: .pad #12 ; SOFT-NEXT: sub sp, #12 -; SOFT-NEXT: mov r6, r1 +; SOFT-NEXT: mov r7, r1 ; SOFT-NEXT: mov r5, r0 ; SOFT-NEXT: movs r4, #0 ; SOFT-NEXT: mvns r2, r4 @@ -1999,18 +1999,18 @@ define i64 @test_signed_i64_f64(double %f) nounwind { ; SOFT-NEXT: str r0, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: ldr r3, .LCPI17_1 ; SOFT-NEXT: mov r0, r5 -; SOFT-NEXT: mov r1, r6 +; SOFT-NEXT: mov r1, r7 ; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: bl __aeabi_dcmpge -; SOFT-NEXT: mov r7, r0 +; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: mov r0, r5 -; SOFT-NEXT: mov r1, r6 +; SOFT-NEXT: mov r1, r7 ; SOFT-NEXT: bl __aeabi_d2lz ; SOFT-NEXT: str r1, [sp] @ 4-byte Spill -; SOFT-NEXT: cmp r7, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB17_2 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: mov r0, r7 +; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: .LBB17_2: ; SOFT-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; SOFT-NEXT: cmp r1, #0 @@ -2019,9 +2019,9 @@ define i64 @test_signed_i64_f64(double %f) nounwind { ; SOFT-NEXT: str r0, [sp, #4] @ 4-byte Spill ; SOFT-NEXT: .LBB17_4: ; SOFT-NEXT: mov r0, r5 -; SOFT-NEXT: mov r1, r6 +; SOFT-NEXT: mov r1, r7 ; SOFT-NEXT: mov r2, r5 -; SOFT-NEXT: mov r3, r6 +; SOFT-NEXT: mov r3, r7 ; SOFT-NEXT: bl __aeabi_dcmpun ; SOFT-NEXT: mov r1, r0 ; SOFT-NEXT: cmp r0, #0 @@ -2030,7 +2030,7 @@ define i64 @test_signed_i64_f64(double %f) nounwind { ; SOFT-NEXT: @ %bb.5: ; SOFT-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; SOFT-NEXT: .LBB17_6: -; SOFT-NEXT: cmp r7, #0 +; SOFT-NEXT: cmp 
r6, #0 ; SOFT-NEXT: beq .LBB17_8 ; SOFT-NEXT: @ %bb.7: ; SOFT-NEXT: ldr r3, [sp] @ 4-byte Reload @@ -2350,14 +2350,14 @@ define i100 @test_signed_i100_f64(double %f) nounwind { ; FP16-NEXT: mov r4, r1 ; FP16-NEXT: mov r5, r0 ; FP16-NEXT: bl __fixdfti -; FP16-NEXT: vldr d2, .LCPI18_0 +; FP16-NEXT: vldr d1, .LCPI18_0 ; FP16-NEXT: vmov d0, r5, r4 -; FP16-NEXT: vldr d1, .LCPI18_1 -; FP16-NEXT: vcmp.f64 d0, d2 +; FP16-NEXT: vldr d2, .LCPI18_1 +; FP16-NEXT: vcmp.f64 d0, d1 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it lt ; FP16-NEXT: movlt r0, #0 -; FP16-NEXT: vcmp.f64 d0, d1 +; FP16-NEXT: vcmp.f64 d0, d2 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it gt ; FP16-NEXT: movgt.w r0, #-1 @@ -2365,11 +2365,11 @@ define i100 @test_signed_i100_f64(double %f) nounwind { ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it vs ; FP16-NEXT: movvs r0, #0 -; FP16-NEXT: vcmp.f64 d0, d2 +; FP16-NEXT: vcmp.f64 d0, d1 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it lt ; FP16-NEXT: movlt r1, #0 -; FP16-NEXT: vcmp.f64 d0, d1 +; FP16-NEXT: vcmp.f64 d0, d2 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it gt ; FP16-NEXT: movgt.w r1, #-1 @@ -2377,11 +2377,11 @@ define i100 @test_signed_i100_f64(double %f) nounwind { ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it vs ; FP16-NEXT: movvs r1, #0 -; FP16-NEXT: vcmp.f64 d0, d2 +; FP16-NEXT: vcmp.f64 d0, d1 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it lt ; FP16-NEXT: movlt r2, #0 -; FP16-NEXT: vcmp.f64 d0, d1 +; FP16-NEXT: vcmp.f64 d0, d2 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it gt ; FP16-NEXT: movgt.w r2, #-1 @@ -2389,11 +2389,11 @@ define i100 @test_signed_i100_f64(double %f) nounwind { ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it vs ; FP16-NEXT: movvs r2, #0 -; FP16-NEXT: vcmp.f64 d0, d2 +; FP16-NEXT: vcmp.f64 d0, d1 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it lt ; FP16-NEXT: mvnlt r3, #7 -; FP16-NEXT: vcmp.f64 d0, d1 +; FP16-NEXT: vcmp.f64 d0, d2 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it gt ; FP16-NEXT: movgt r3, #7 @@ -2613,14 +2613,14 @@ define i128 @test_signed_i128_f64(double %f) nounwind { ; FP16-NEXT: mov r4, r1 ; FP16-NEXT: mov r5, r0 ; FP16-NEXT: bl __fixdfti -; FP16-NEXT: vldr d2, .LCPI19_0 +; FP16-NEXT: vldr d1, .LCPI19_0 ; FP16-NEXT: vmov d0, r5, r4 -; FP16-NEXT: vldr d1, .LCPI19_1 -; FP16-NEXT: vcmp.f64 d0, d2 +; FP16-NEXT: vldr d2, .LCPI19_1 +; FP16-NEXT: vcmp.f64 d0, d1 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it lt ; FP16-NEXT: movlt r0, #0 -; FP16-NEXT: vcmp.f64 d0, d1 +; FP16-NEXT: vcmp.f64 d0, d2 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it gt ; FP16-NEXT: movgt.w r0, #-1 @@ -2628,11 +2628,11 @@ define i128 @test_signed_i128_f64(double %f) nounwind { ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it vs ; FP16-NEXT: movvs r0, #0 -; FP16-NEXT: vcmp.f64 d0, d2 +; FP16-NEXT: vcmp.f64 d0, d1 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it lt ; FP16-NEXT: movlt r1, #0 -; FP16-NEXT: vcmp.f64 d0, d1 +; FP16-NEXT: vcmp.f64 d0, d2 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it gt ; FP16-NEXT: movgt.w r1, #-1 @@ -2640,11 +2640,11 @@ define i128 @test_signed_i128_f64(double %f) nounwind { ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it vs ; FP16-NEXT: movvs r1, #0 -; FP16-NEXT: vcmp.f64 d0, d2 +; FP16-NEXT: vcmp.f64 d0, d1 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it lt ; FP16-NEXT: movlt r2, #0 -; FP16-NEXT: vcmp.f64 d0, d1 +; FP16-NEXT: vcmp.f64 d0, d2 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it gt ; FP16-NEXT: movgt.w r2, #-1 @@ -2652,11 +2652,11 @@ define i128 
@test_signed_i128_f64(double %f) nounwind { ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it vs ; FP16-NEXT: movvs r2, #0 -; FP16-NEXT: vcmp.f64 d0, d2 +; FP16-NEXT: vcmp.f64 d0, d1 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it lt ; FP16-NEXT: movlt.w r3, #-2147483648 -; FP16-NEXT: vcmp.f64 d0, d1 +; FP16-NEXT: vcmp.f64 d0, d2 ; FP16-NEXT: vmrs APSR_nzcv, fpscr ; FP16-NEXT: it gt ; FP16-NEXT: mvngt r3, #-2147483648 @@ -3621,7 +3621,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind { ; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: ldr r1, .LCPI28_0 ; SOFT-NEXT: bl __aeabi_fcmpgt -; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: str r0, [sp, #16] @ 4-byte Spill ; SOFT-NEXT: movs r0, #241 ; SOFT-NEXT: lsls r1, r0, #24 ; SOFT-NEXT: mov r0, r6 @@ -3629,24 +3629,24 @@ define i100 @test_signed_i100_f16(half %f) nounwind { ; SOFT-NEXT: mov r7, r0 ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: bl __fixsfti -; SOFT-NEXT: str r1, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: str r1, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: str r2, [sp, #12] @ 4-byte Spill -; SOFT-NEXT: str r3, [sp] @ 4-byte Spill +; SOFT-NEXT: str r3, [sp, #4] @ 4-byte Spill ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB28_2 ; SOFT-NEXT: @ %bb.1: ; SOFT-NEXT: mov r0, r7 ; SOFT-NEXT: .LBB28_2: ; SOFT-NEXT: movs r5, #0 -; SOFT-NEXT: mvns r1, r5 -; SOFT-NEXT: str r4, [sp, #16] @ 4-byte Spill -; SOFT-NEXT: cmp r4, #0 -; SOFT-NEXT: str r1, [sp, #8] @ 4-byte Spill -; SOFT-NEXT: mov r4, r1 +; SOFT-NEXT: mvns r4, r5 +; SOFT-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; SOFT-NEXT: cmp r1, #0 +; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bne .LBB28_4 ; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: mov r1, r0 ; SOFT-NEXT: .LBB28_4: +; SOFT-NEXT: str r1, [sp] @ 4-byte Spill ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r6 ; SOFT-NEXT: bl __aeabi_fcmpun @@ -3655,20 +3655,19 @@ define i100 @test_signed_i100_f16(half %f) nounwind { ; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: bne .LBB28_6 ; SOFT-NEXT: @ %bb.5: -; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: ldr r0, [sp] @ 4-byte Reload ; SOFT-NEXT: .LBB28_6: ; SOFT-NEXT: cmp r7, #0 -; SOFT-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; SOFT-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: bne .LBB28_8 ; SOFT-NEXT: @ %bb.7: -; SOFT-NEXT: str r7, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: str r7, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: .LBB28_8: -; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; SOFT-NEXT: cmp r4, #0 -; SOFT-NEXT: mov r2, r6 +; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: bne .LBB28_10 ; SOFT-NEXT: @ %bb.9: -; SOFT-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; SOFT-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; SOFT-NEXT: .LBB28_10: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: mov r1, r5 @@ -3677,51 +3676,51 @@ define i100 @test_signed_i100_f16(half %f) nounwind { ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB28_19 ; SOFT-NEXT: .LBB28_12: -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB28_14 ; SOFT-NEXT: .LBB28_13: -; SOFT-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; SOFT-NEXT: ldr r4, [sp, #12] @ 4-byte Reload ; SOFT-NEXT: .LBB28_14: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: mov r2, r5 ; SOFT-NEXT: bne .LBB28_16 ; SOFT-NEXT: @ %bb.15: -; SOFT-NEXT: mov r2, r6 +; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: .LBB28_16: ; SOFT-NEXT: movs r4, #7 ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB28_20 ; SOFT-NEXT: @ %bb.17: -; SOFT-NEXT: ldr r7, [sp] @ 4-byte Reload -; SOFT-NEXT: b .LBB28_21 +; SOFT-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: beq .LBB28_21 +; SOFT-NEXT: b .LBB28_22 ; SOFT-NEXT: .LBB28_18: ; 
SOFT-NEXT: mov r1, r2 ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB28_12 ; SOFT-NEXT: .LBB28_19: ; SOFT-NEXT: str r7, [sp, #12] @ 4-byte Spill -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB28_13 ; SOFT-NEXT: b .LBB28_14 ; SOFT-NEXT: .LBB28_20: ; SOFT-NEXT: mvns r7, r4 -; SOFT-NEXT: .LBB28_21: -; SOFT-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: cmp r6, #0 -; SOFT-NEXT: bne .LBB28_23 -; SOFT-NEXT: @ %bb.22: +; SOFT-NEXT: bne .LBB28_22 +; SOFT-NEXT: .LBB28_21: ; SOFT-NEXT: mov r4, r7 -; SOFT-NEXT: .LBB28_23: +; SOFT-NEXT: .LBB28_22: ; SOFT-NEXT: cmp r3, #0 -; SOFT-NEXT: bne .LBB28_25 -; SOFT-NEXT: @ %bb.24: +; SOFT-NEXT: bne .LBB28_24 +; SOFT-NEXT: @ %bb.23: ; SOFT-NEXT: mov r5, r4 -; SOFT-NEXT: .LBB28_25: +; SOFT-NEXT: .LBB28_24: ; SOFT-NEXT: mov r3, r5 ; SOFT-NEXT: add sp, #20 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .p2align 2 -; SOFT-NEXT: @ %bb.26: +; SOFT-NEXT: @ %bb.25: ; SOFT-NEXT: .LCPI28_0: ; SOFT-NEXT: .long 1895825407 @ 0x70ffffff ; @@ -3875,7 +3874,7 @@ define i128 @test_signed_i128_f16(half %f) nounwind { ; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: ldr r1, .LCPI29_0 ; SOFT-NEXT: bl __aeabi_fcmpgt -; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: str r0, [sp, #16] @ 4-byte Spill ; SOFT-NEXT: movs r0, #255 ; SOFT-NEXT: lsls r1, r0, #24 ; SOFT-NEXT: mov r0, r6 @@ -3883,24 +3882,24 @@ define i128 @test_signed_i128_f16(half %f) nounwind { ; SOFT-NEXT: mov r7, r0 ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: bl __fixsfti -; SOFT-NEXT: str r1, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: str r1, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: str r2, [sp, #12] @ 4-byte Spill -; SOFT-NEXT: str r3, [sp] @ 4-byte Spill +; SOFT-NEXT: str r3, [sp, #4] @ 4-byte Spill ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB29_2 ; SOFT-NEXT: @ %bb.1: ; SOFT-NEXT: mov r0, r7 ; SOFT-NEXT: .LBB29_2: ; SOFT-NEXT: movs r5, #0 -; SOFT-NEXT: mvns r1, r5 -; SOFT-NEXT: str r4, [sp, #16] @ 4-byte Spill -; SOFT-NEXT: cmp r4, #0 -; SOFT-NEXT: str r1, [sp, #8] @ 4-byte Spill -; SOFT-NEXT: mov r4, r1 +; SOFT-NEXT: mvns r4, r5 +; SOFT-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; SOFT-NEXT: cmp r1, #0 +; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bne .LBB29_4 ; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: mov r4, r0 +; SOFT-NEXT: mov r1, r0 ; SOFT-NEXT: .LBB29_4: +; SOFT-NEXT: str r1, [sp] @ 4-byte Spill ; SOFT-NEXT: mov r0, r6 ; SOFT-NEXT: mov r1, r6 ; SOFT-NEXT: bl __aeabi_fcmpun @@ -3909,20 +3908,19 @@ define i128 @test_signed_i128_f16(half %f) nounwind { ; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: bne .LBB29_6 ; SOFT-NEXT: @ %bb.5: -; SOFT-NEXT: mov r0, r4 +; SOFT-NEXT: ldr r0, [sp] @ 4-byte Reload ; SOFT-NEXT: .LBB29_6: ; SOFT-NEXT: cmp r7, #0 -; SOFT-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; SOFT-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: bne .LBB29_8 ; SOFT-NEXT: @ %bb.7: -; SOFT-NEXT: str r7, [sp, #4] @ 4-byte Spill +; SOFT-NEXT: str r7, [sp, #8] @ 4-byte Spill ; SOFT-NEXT: .LBB29_8: -; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; SOFT-NEXT: cmp r4, #0 -; SOFT-NEXT: mov r2, r6 +; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: bne .LBB29_10 ; SOFT-NEXT: @ %bb.9: -; SOFT-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; SOFT-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; SOFT-NEXT: .LBB29_10: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: mov r1, r5 @@ -3931,22 +3929,22 @@ define i128 @test_signed_i128_f16(half %f) nounwind { ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB29_19 ; SOFT-NEXT: .LBB29_12: -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB29_14 ; SOFT-NEXT: .LBB29_13: -; SOFT-NEXT: ldr r6, [sp, #12] @ 4-byte 
Reload +; SOFT-NEXT: ldr r4, [sp, #12] @ 4-byte Reload ; SOFT-NEXT: .LBB29_14: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: mov r2, r5 ; SOFT-NEXT: bne .LBB29_16 ; SOFT-NEXT: @ %bb.15: -; SOFT-NEXT: mov r2, r6 +; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: .LBB29_16: ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB29_20 ; SOFT-NEXT: @ %bb.17: -; SOFT-NEXT: ldr r6, [sp] @ 4-byte Reload -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB29_21 ; SOFT-NEXT: b .LBB29_22 ; SOFT-NEXT: .LBB29_18: @@ -3955,22 +3953,21 @@ define i128 @test_signed_i128_f16(half %f) nounwind { ; SOFT-NEXT: bne .LBB29_12 ; SOFT-NEXT: .LBB29_19: ; SOFT-NEXT: str r7, [sp, #12] @ 4-byte Spill -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB29_13 ; SOFT-NEXT: b .LBB29_14 ; SOFT-NEXT: .LBB29_20: ; SOFT-NEXT: movs r4, #1 -; SOFT-NEXT: lsls r6, r4, #31 -; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; SOFT-NEXT: cmp r4, #0 +; SOFT-NEXT: lsls r4, r4, #31 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB29_22 ; SOFT-NEXT: .LBB29_21: -; SOFT-NEXT: ldr r6, .LCPI29_1 +; SOFT-NEXT: ldr r4, .LCPI29_1 ; SOFT-NEXT: .LBB29_22: ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: bne .LBB29_24 ; SOFT-NEXT: @ %bb.23: -; SOFT-NEXT: mov r5, r6 +; SOFT-NEXT: mov r5, r4 ; SOFT-NEXT: .LBB29_24: ; SOFT-NEXT: mov r3, r5 ; SOFT-NEXT: add sp, #20 diff --git a/llvm/test/CodeGen/ARM/fptoui-sat-scalar.ll b/llvm/test/CodeGen/ARM/fptoui-sat-scalar.ll index 3438fb113015cb..6eb174f0e4d771 100644 --- a/llvm/test/CodeGen/ARM/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/ARM/fptoui-sat-scalar.ll @@ -894,32 +894,32 @@ define i1 @test_signed_i1_f64(double %f) nounwind { ; SOFT-NEXT: .pad #4 ; SOFT-NEXT: sub sp, #4 ; SOFT-NEXT: mov r4, r1 -; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r5, r0 ; SOFT-NEXT: movs r7, #0 ; SOFT-NEXT: ldr r3, .LCPI10_0 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: bl __aeabi_dcmpgt -; SOFT-NEXT: mov r5, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: mov r3, r7 ; SOFT-NEXT: bl __aeabi_dcmpge ; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bl __aeabi_d2uiz ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB10_3 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB10_4 ; SOFT-NEXT: .LBB10_2: ; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB10_3: ; SOFT-NEXT: mov r0, r7 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB10_2 ; SOFT-NEXT: .LBB10_4: ; SOFT-NEXT: movs r0, #1 @@ -973,32 +973,32 @@ define i8 @test_signed_i8_f64(double %f) nounwind { ; SOFT-NEXT: .pad #4 ; SOFT-NEXT: sub sp, #4 ; SOFT-NEXT: mov r4, r1 -; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r5, r0 ; SOFT-NEXT: movs r7, #0 ; SOFT-NEXT: ldr r3, .LCPI11_0 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: bl __aeabi_dcmpgt -; SOFT-NEXT: mov r5, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: mov r3, r7 ; SOFT-NEXT: bl __aeabi_dcmpge ; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bl __aeabi_d2uiz ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB11_3 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB11_4 ; SOFT-NEXT: .LBB11_2: ; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB11_3: ; SOFT-NEXT: mov 
r0, r7 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB11_2 ; SOFT-NEXT: .LBB11_4: ; SOFT-NEXT: movs r0, #255 @@ -1060,32 +1060,32 @@ define i13 @test_signed_i13_f64(double %f) nounwind { ; SOFT-NEXT: .pad #4 ; SOFT-NEXT: sub sp, #4 ; SOFT-NEXT: mov r4, r1 -; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r5, r0 ; SOFT-NEXT: movs r7, #0 ; SOFT-NEXT: ldr r3, .LCPI12_0 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: bl __aeabi_dcmpgt -; SOFT-NEXT: mov r5, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: mov r3, r7 ; SOFT-NEXT: bl __aeabi_dcmpge ; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bl __aeabi_d2uiz ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB12_3 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB12_4 ; SOFT-NEXT: .LBB12_2: ; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB12_3: ; SOFT-NEXT: mov r0, r7 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB12_2 ; SOFT-NEXT: .LBB12_4: ; SOFT-NEXT: ldr r0, .LCPI12_1 @@ -1149,32 +1149,32 @@ define i16 @test_signed_i16_f64(double %f) nounwind { ; SOFT-NEXT: .pad #4 ; SOFT-NEXT: sub sp, #4 ; SOFT-NEXT: mov r4, r1 -; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r5, r0 ; SOFT-NEXT: movs r7, #0 ; SOFT-NEXT: ldr r3, .LCPI13_0 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: bl __aeabi_dcmpgt -; SOFT-NEXT: mov r5, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: mov r3, r7 ; SOFT-NEXT: bl __aeabi_dcmpge ; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bl __aeabi_d2uiz ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB13_3 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB13_4 ; SOFT-NEXT: .LBB13_2: ; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB13_3: ; SOFT-NEXT: mov r0, r7 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB13_2 ; SOFT-NEXT: .LBB13_4: ; SOFT-NEXT: ldr r0, .LCPI13_1 @@ -1238,32 +1238,32 @@ define i19 @test_signed_i19_f64(double %f) nounwind { ; SOFT-NEXT: .pad #4 ; SOFT-NEXT: sub sp, #4 ; SOFT-NEXT: mov r4, r1 -; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r5, r0 ; SOFT-NEXT: movs r7, #0 ; SOFT-NEXT: ldr r3, .LCPI14_0 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: bl __aeabi_dcmpgt -; SOFT-NEXT: mov r5, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: mov r3, r7 ; SOFT-NEXT: bl __aeabi_dcmpge ; SOFT-NEXT: mov r7, r0 -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r0, r5 ; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bl __aeabi_d2uiz ; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB14_3 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: bne .LBB14_4 ; SOFT-NEXT: .LBB14_2: ; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB14_3: ; SOFT-NEXT: mov r0, r7 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r6, #0 ; SOFT-NEXT: beq .LBB14_2 ; SOFT-NEXT: .LBB14_4: ; SOFT-NEXT: ldr r0, .LCPI14_1 @@ -1327,32 +1327,32 @@ define i32 @test_signed_i32_f64(double %f) nounwind { ; SOFT-NEXT: push {r4, r5, r6, r7, lr} ; SOFT-NEXT: .pad #4 ; SOFT-NEXT: sub sp, #4 -; SOFT-NEXT: mov r5, r1 -; SOFT-NEXT: mov r7, r0 +; SOFT-NEXT: mov r4, r1 +; SOFT-NEXT: mov r6, r0 ; SOFT-NEXT: 
ldr r2, .LCPI15_0 ; SOFT-NEXT: ldr r3, .LCPI15_1 ; SOFT-NEXT: bl __aeabi_dcmpgt ; SOFT-NEXT: str r0, [sp] @ 4-byte Spill -; SOFT-NEXT: movs r4, #0 -; SOFT-NEXT: mov r0, r7 -; SOFT-NEXT: mov r1, r5 -; SOFT-NEXT: mov r2, r4 -; SOFT-NEXT: mov r3, r4 +; SOFT-NEXT: movs r5, #0 +; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r1, r4 +; SOFT-NEXT: mov r2, r5 +; SOFT-NEXT: mov r3, r5 ; SOFT-NEXT: bl __aeabi_dcmpge -; SOFT-NEXT: mov r6, r0 -; SOFT-NEXT: mov r0, r7 -; SOFT-NEXT: mov r1, r5 +; SOFT-NEXT: mov r7, r0 +; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r1, r4 ; SOFT-NEXT: bl __aeabi_d2uiz -; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB15_2 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: mov r0, r6 +; SOFT-NEXT: mov r0, r7 ; SOFT-NEXT: .LBB15_2: ; SOFT-NEXT: ldr r1, [sp] @ 4-byte Reload ; SOFT-NEXT: cmp r1, #0 ; SOFT-NEXT: beq .LBB15_4 ; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: mvns r0, r4 +; SOFT-NEXT: mvns r0, r5 ; SOFT-NEXT: .LBB15_4: ; SOFT-NEXT: add sp, #4 ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} @@ -1387,7 +1387,7 @@ define i50 @test_signed_i50_f64(double %f) nounwind { ; SOFT-NEXT: push {r4, r5, r6, r7, lr} ; SOFT-NEXT: .pad #4 ; SOFT-NEXT: sub sp, #4 -; SOFT-NEXT: mov r7, r1 +; SOFT-NEXT: mov r5, r1 ; SOFT-NEXT: mov r4, r0 ; SOFT-NEXT: movs r0, #7 ; SOFT-NEXT: mvns r2, r0 @@ -1397,24 +1397,24 @@ define i50 @test_signed_i50_f64(double %f) nounwind { ; SOFT-NEXT: str r0, [sp] @ 4-byte Spill ; SOFT-NEXT: movs r6, #0 ; SOFT-NEXT: mov r0, r4 -; SOFT-NEXT: mov r1, r7 +; SOFT-NEXT: mov r1, r5 ; SOFT-NEXT: mov r2, r6 ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: bl __aeabi_dcmpge -; SOFT-NEXT: mov r5, r0 +; SOFT-NEXT: mov r7, r0 ; SOFT-NEXT: mov r0, r4 -; SOFT-NEXT: mov r1, r7 +; SOFT-NEXT: mov r1, r5 ; SOFT-NEXT: bl __aeabi_d2ulz -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB16_2 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: mov r0, r5 +; SOFT-NEXT: mov r0, r7 ; SOFT-NEXT: .LBB16_2: ; SOFT-NEXT: ldr r2, [sp] @ 4-byte Reload ; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: bne .LBB16_6 ; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB16_7 ; SOFT-NEXT: .LBB16_4: ; SOFT-NEXT: cmp r2, #0 @@ -1424,10 +1424,10 @@ define i50 @test_signed_i50_f64(double %f) nounwind { ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB16_6: ; SOFT-NEXT: mvns r0, r6 -; SOFT-NEXT: cmp r5, #0 +; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB16_4 ; SOFT-NEXT: .LBB16_7: -; SOFT-NEXT: mov r1, r5 +; SOFT-NEXT: mov r1, r7 ; SOFT-NEXT: cmp r2, #0 ; SOFT-NEXT: beq .LBB16_5 ; SOFT-NEXT: .LBB16_8: @@ -1500,35 +1500,35 @@ define i64 @test_signed_i64_f64(double %f) nounwind { ; SOFT-NEXT: push {r4, r5, r6, r7, lr} ; SOFT-NEXT: .pad #4 ; SOFT-NEXT: sub sp, #4 -; SOFT-NEXT: mov r7, r1 +; SOFT-NEXT: mov r6, r1 ; SOFT-NEXT: mov r5, r0 -; SOFT-NEXT: movs r6, #0 -; SOFT-NEXT: mvns r4, r6 +; SOFT-NEXT: movs r7, #0 +; SOFT-NEXT: mvns r4, r7 ; SOFT-NEXT: ldr r3, .LCPI17_0 ; SOFT-NEXT: mov r2, r4 ; SOFT-NEXT: bl __aeabi_dcmpgt ; SOFT-NEXT: str r0, [sp] @ 4-byte Spill ; SOFT-NEXT: mov r0, r5 -; SOFT-NEXT: mov r1, r7 -; SOFT-NEXT: mov r2, r6 -; SOFT-NEXT: mov r3, r6 +; SOFT-NEXT: mov r1, r6 +; SOFT-NEXT: mov r2, r7 +; SOFT-NEXT: mov r3, r7 ; SOFT-NEXT: bl __aeabi_dcmpge -; SOFT-NEXT: mov r6, r0 +; SOFT-NEXT: mov r7, r0 ; SOFT-NEXT: mov r0, r5 -; SOFT-NEXT: mov r1, r7 +; SOFT-NEXT: mov r1, r6 ; SOFT-NEXT: bl __aeabi_d2ulz ; SOFT-NEXT: mov r2, r0 -; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB17_2 ; SOFT-NEXT: @ %bb.1: -; SOFT-NEXT: mov r2, r6 +; SOFT-NEXT: mov r2, r7 ; SOFT-NEXT: 
.LBB17_2: ; SOFT-NEXT: ldr r3, [sp] @ 4-byte Reload ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: mov r0, r4 ; SOFT-NEXT: beq .LBB17_7 ; SOFT-NEXT: @ %bb.3: -; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: beq .LBB17_8 ; SOFT-NEXT: .LBB17_4: ; SOFT-NEXT: cmp r3, #0 @@ -1541,10 +1541,10 @@ define i64 @test_signed_i64_f64(double %f) nounwind { ; SOFT-NEXT: pop {r4, r5, r6, r7, pc} ; SOFT-NEXT: .LBB17_7: ; SOFT-NEXT: mov r0, r2 -; SOFT-NEXT: cmp r6, #0 +; SOFT-NEXT: cmp r7, #0 ; SOFT-NEXT: bne .LBB17_4 ; SOFT-NEXT: .LBB17_8: -; SOFT-NEXT: mov r1, r6 +; SOFT-NEXT: mov r1, r7 ; SOFT-NEXT: cmp r3, #0 ; SOFT-NEXT: beq .LBB17_5 ; SOFT-NEXT: b .LBB17_6 diff --git a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll index 5a7c4384428e1a..d56c2f3a40ce2f 100644 --- a/llvm/test/CodeGen/ARM/funnel-shift.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift.ll @@ -47,67 +47,67 @@ declare i37 @llvm.fshl.i37(i37, i37, i37) define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) { ; SCALAR-LABEL: fshl_i37: ; SCALAR: @ %bb.0: -; SCALAR-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; SCALAR-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; SCALAR-NEXT: .save {r4, r5, r6, r7, r8, lr} +; SCALAR-NEXT: push {r4, r5, r6, r7, r8, lr} ; SCALAR-NEXT: mov r8, r0 -; SCALAR-NEXT: ldr r0, [sp, #36] -; SCALAR-NEXT: mov r4, r1 -; SCALAR-NEXT: mov r6, r3 +; SCALAR-NEXT: ldr r0, [sp, #28] +; SCALAR-NEXT: mov r6, r1 +; SCALAR-NEXT: mov r4, r3 ; SCALAR-NEXT: and r1, r0, #31 -; SCALAR-NEXT: ldr r0, [sp, #32] -; SCALAR-NEXT: mov r9, r2 +; SCALAR-NEXT: ldr r0, [sp, #24] +; SCALAR-NEXT: mov r5, r2 ; SCALAR-NEXT: mov r2, #37 ; SCALAR-NEXT: mov r3, #0 ; SCALAR-NEXT: bl __aeabi_uldivmod -; SCALAR-NEXT: lsl r1, r6, #27 -; SCALAR-NEXT: ands r0, r2, #32 -; SCALAR-NEXT: orr r1, r1, r9, lsr #5 +; SCALAR-NEXT: lsl r1, r4, #27 +; SCALAR-NEXT: ands r12, r2, #32 +; SCALAR-NEXT: orr r1, r1, r5, lsr #5 ; SCALAR-NEXT: mov r3, r8 -; SCALAR-NEXT: and r6, r2, #31 +; SCALAR-NEXT: and r4, r2, #31 ; SCALAR-NEXT: mov r7, #31 ; SCALAR-NEXT: movne r3, r1 -; SCALAR-NEXT: cmp r0, #0 -; SCALAR-NEXT: lslne r1, r9, #27 +; SCALAR-NEXT: cmp r12, #0 +; SCALAR-NEXT: lslne r1, r5, #27 ; SCALAR-NEXT: bic r2, r7, r2 -; SCALAR-NEXT: movne r4, r8 -; SCALAR-NEXT: lsl r5, r3, r6 -; SCALAR-NEXT: lsr r0, r1, #1 -; SCALAR-NEXT: lsl r1, r4, r6 +; SCALAR-NEXT: lsl r0, r3, r4 +; SCALAR-NEXT: lsr r1, r1, #1 +; SCALAR-NEXT: movne r6, r8 +; SCALAR-NEXT: orr r0, r0, r1, lsr r2 +; SCALAR-NEXT: lsl r1, r6, r4 ; SCALAR-NEXT: lsr r3, r3, #1 -; SCALAR-NEXT: orr r0, r5, r0, lsr r2 ; SCALAR-NEXT: orr r1, r1, r3, lsr r2 -; SCALAR-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; SCALAR-NEXT: pop {r4, r5, r6, r7, r8, pc} ; ; NEON-LABEL: fshl_i37: ; NEON: @ %bb.0: ; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} ; NEON-NEXT: push {r4, r5, r6, r7, r11, lr} -; NEON-NEXT: mov r4, r1 +; NEON-NEXT: mov r6, r1 ; NEON-NEXT: ldr r1, [sp, #28] -; NEON-NEXT: mov r6, r0 +; NEON-NEXT: mov r4, r0 ; NEON-NEXT: ldr r0, [sp, #24] ; NEON-NEXT: and r1, r1, #31 -; NEON-NEXT: mov r5, r3 -; NEON-NEXT: mov r7, r2 +; NEON-NEXT: mov r7, r3 +; NEON-NEXT: mov r5, r2 ; NEON-NEXT: mov r2, #37 ; NEON-NEXT: mov r3, #0 ; NEON-NEXT: bl __aeabi_uldivmod ; NEON-NEXT: mov r0, #31 ; NEON-NEXT: bic r1, r0, r2 -; NEON-NEXT: lsl r0, r5, #27 +; NEON-NEXT: lsl r0, r7, #27 ; NEON-NEXT: ands r12, r2, #32 -; NEON-NEXT: orr r0, r0, r7, lsr #5 -; NEON-NEXT: mov r5, r6 +; NEON-NEXT: orr r0, r0, r5, lsr #5 +; NEON-NEXT: mov r7, r4 ; NEON-NEXT: and r2, r2, #31 -; NEON-NEXT: movne r5, r0 -; NEON-NEXT: lslne r0, r7, #27 +; NEON-NEXT: 
movne r7, r0 +; NEON-NEXT: lslne r0, r5, #27 ; NEON-NEXT: cmp r12, #0 -; NEON-NEXT: lsl r3, r5, r2 +; NEON-NEXT: lsl r3, r7, r2 ; NEON-NEXT: lsr r0, r0, #1 -; NEON-NEXT: movne r4, r6 +; NEON-NEXT: movne r6, r4 ; NEON-NEXT: orr r0, r3, r0, lsr r1 -; NEON-NEXT: lsr r3, r5, #1 -; NEON-NEXT: lsl r2, r4, r2 +; NEON-NEXT: lsr r3, r7, #1 +; NEON-NEXT: lsl r2, r6, r2 ; NEON-NEXT: orr r1, r2, r3, lsr r1 ; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z) @@ -237,71 +237,71 @@ declare i37 @llvm.fshr.i37(i37, i37, i37) define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; SCALAR-LABEL: fshr_i37: ; SCALAR: @ %bb.0: -; SCALAR-NEXT: .save {r4, r5, r6, r7, r8, lr} -; SCALAR-NEXT: push {r4, r5, r6, r7, r8, lr} -; SCALAR-NEXT: mov r8, r0 +; SCALAR-NEXT: .save {r4, r5, r6, r7, r11, lr} +; SCALAR-NEXT: push {r4, r5, r6, r7, r11, lr} +; SCALAR-NEXT: mov r4, r0 ; SCALAR-NEXT: ldr r0, [sp, #28] -; SCALAR-NEXT: mov r4, r1 -; SCALAR-NEXT: mov r5, r3 +; SCALAR-NEXT: mov r6, r1 +; SCALAR-NEXT: mov r7, r3 ; SCALAR-NEXT: and r1, r0, #31 ; SCALAR-NEXT: ldr r0, [sp, #24] -; SCALAR-NEXT: mov r7, r2 +; SCALAR-NEXT: mov r5, r2 ; SCALAR-NEXT: mov r2, #37 ; SCALAR-NEXT: mov r3, #0 ; SCALAR-NEXT: bl __aeabi_uldivmod -; SCALAR-NEXT: lsl r3, r5, #27 +; SCALAR-NEXT: lsl r3, r7, #27 ; SCALAR-NEXT: add r0, r2, #27 -; SCALAR-NEXT: orr r3, r3, r7, lsr #5 -; SCALAR-NEXT: ands r2, r0, #32 -; SCALAR-NEXT: mov r5, r8 +; SCALAR-NEXT: orr r3, r3, r5, lsr #5 ; SCALAR-NEXT: mov r1, #31 -; SCALAR-NEXT: moveq r5, r3 -; SCALAR-NEXT: lsleq r3, r7, #27 -; SCALAR-NEXT: cmp r2, #0 +; SCALAR-NEXT: ands r12, r0, #32 +; SCALAR-NEXT: mov r7, r4 +; SCALAR-NEXT: moveq r7, r3 ; SCALAR-NEXT: bic r1, r1, r0 -; SCALAR-NEXT: moveq r4, r8 -; SCALAR-NEXT: lsl r6, r5, #1 -; SCALAR-NEXT: and r7, r0, #31 -; SCALAR-NEXT: lsl r2, r4, #1 -; SCALAR-NEXT: lsl r6, r6, r1 +; SCALAR-NEXT: lsl r2, r7, #1 +; SCALAR-NEXT: lsleq r3, r5, #27 +; SCALAR-NEXT: cmp r12, #0 +; SCALAR-NEXT: and r5, r0, #31 +; SCALAR-NEXT: lsl r2, r2, r1 +; SCALAR-NEXT: moveq r6, r4 +; SCALAR-NEXT: orr r0, r2, r3, lsr r5 +; SCALAR-NEXT: lsl r2, r6, #1 ; SCALAR-NEXT: lsl r1, r2, r1 -; SCALAR-NEXT: orr r0, r6, r3, lsr r7 -; SCALAR-NEXT: orr r1, r1, r5, lsr r7 -; SCALAR-NEXT: pop {r4, r5, r6, r7, r8, pc} +; SCALAR-NEXT: orr r1, r1, r7, lsr r5 +; SCALAR-NEXT: pop {r4, r5, r6, r7, r11, pc} ; ; NEON-LABEL: fshr_i37: ; NEON: @ %bb.0: -; NEON-NEXT: .save {r4, r5, r6, r7, r8, lr} -; NEON-NEXT: push {r4, r5, r6, r7, r8, lr} -; NEON-NEXT: mov r4, r1 +; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} +; NEON-NEXT: push {r4, r5, r6, r7, r11, lr} +; NEON-NEXT: mov r6, r1 ; NEON-NEXT: ldr r1, [sp, #28] -; NEON-NEXT: mov r8, r0 +; NEON-NEXT: mov r4, r0 ; NEON-NEXT: ldr r0, [sp, #24] ; NEON-NEXT: and r1, r1, #31 -; NEON-NEXT: mov r5, r3 -; NEON-NEXT: mov r7, r2 +; NEON-NEXT: mov r7, r3 +; NEON-NEXT: mov r5, r2 ; NEON-NEXT: mov r2, #37 ; NEON-NEXT: mov r3, #0 ; NEON-NEXT: bl __aeabi_uldivmod -; NEON-NEXT: lsl r3, r5, #27 +; NEON-NEXT: lsl r3, r7, #27 ; NEON-NEXT: add r0, r2, #27 -; NEON-NEXT: orr r3, r3, r7, lsr #5 -; NEON-NEXT: ands r2, r0, #32 -; NEON-NEXT: mov r5, r8 +; NEON-NEXT: orr r3, r3, r5, lsr #5 ; NEON-NEXT: mov r1, #31 -; NEON-NEXT: moveq r5, r3 -; NEON-NEXT: lsleq r3, r7, #27 -; NEON-NEXT: cmp r2, #0 +; NEON-NEXT: ands r12, r0, #32 +; NEON-NEXT: mov r7, r4 +; NEON-NEXT: moveq r7, r3 ; NEON-NEXT: bic r1, r1, r0 -; NEON-NEXT: moveq r4, r8 -; NEON-NEXT: lsl r6, r5, #1 -; NEON-NEXT: and r7, r0, #31 -; NEON-NEXT: lsl r2, r4, #1 -; NEON-NEXT: lsl r6, r6, r1 +; NEON-NEXT: lsl 
r2, r7, #1 +; NEON-NEXT: lsleq r3, r5, #27 +; NEON-NEXT: cmp r12, #0 +; NEON-NEXT: and r5, r0, #31 +; NEON-NEXT: lsl r2, r2, r1 +; NEON-NEXT: moveq r6, r4 +; NEON-NEXT: orr r0, r2, r3, lsr r5 +; NEON-NEXT: lsl r2, r6, #1 ; NEON-NEXT: lsl r1, r2, r1 -; NEON-NEXT: orr r0, r6, r3, lsr r7 -; NEON-NEXT: orr r1, r1, r5, lsr r7 -; NEON-NEXT: pop {r4, r5, r6, r7, r8, pc} +; NEON-NEXT: orr r1, r1, r7, lsr r5 +; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z) ret i37 %f } diff --git a/llvm/test/CodeGen/ARM/machine-cse-cmp.ll b/llvm/test/CodeGen/ARM/machine-cse-cmp.ll index 6e891a04808148..750500ee83154e 100644 --- a/llvm/test/CodeGen/ARM/machine-cse-cmp.ll +++ b/llvm/test/CodeGen/ARM/machine-cse-cmp.ll @@ -70,18 +70,18 @@ declare void @llvm.memset.p0.i32(ptr nocapture, i8, i32, i1) nounwind define ptr @f3(ptr %base, ptr nocapture %offset, i32 %size) nounwind { ; CHECK-LABEL: f3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r3, [r1] -; CHECK-NEXT: mov r9, #0 -; CHECK-NEXT: cmp r3, r2 +; CHECK-NEXT: ldr r9, [r1] +; CHECK-NEXT: mov r3, #0 +; CHECK-NEXT: cmp r9, r2 ; CHECK-NEXT: blt LBB2_2 ; CHECK-NEXT: @ %bb.1: @ %if.end -; CHECK-NEXT: sub r3, r3, r2 -; CHECK-NEXT: add r9, r0, r3 +; CHECK-NEXT: sub r3, r9, r2 ; CHECK-NEXT: sub r2, r2, r3 ; CHECK-NEXT: add r2, r3, r2 +; CHECK-NEXT: add r3, r0, r3 ; CHECK-NEXT: str r2, [r1] ; CHECK-NEXT: LBB2_2: @ %return -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r0, r3 ; CHECK-NEXT: bx lr entry: %0 = load i32, ptr %offset, align 4 diff --git a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll index be741f536ac757..a4d370eab12d84 100644 --- a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll +++ b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll @@ -499,23 +499,23 @@ define <4 x float> @fminnumv432_intrinsic(<4 x float> %x, <4 x float> %y) { ; ARMV7-LABEL: fminnumv432_intrinsic: ; ARMV7: @ %bb.0: ; ARMV7-NEXT: mov r12, sp -; ARMV7-NEXT: vld1.64 {d0, d1}, [r12] -; ARMV7-NEXT: vmov d3, r2, r3 -; ARMV7-NEXT: vmov d2, r0, r1 -; ARMV7-NEXT: vcmp.f32 s7, s3 +; ARMV7-NEXT: vld1.64 {d2, d3}, [r12] +; ARMV7-NEXT: vmov d1, r2, r3 +; ARMV7-NEXT: vmov d0, r0, r1 +; ARMV7-NEXT: vcmp.f32 s3, s7 ; ARMV7-NEXT: vmrs APSR_nzcv, fpscr -; ARMV7-NEXT: vcmp.f32 s6, s2 -; ARMV7-NEXT: vmovlt.f32 s3, s7 +; ARMV7-NEXT: vcmp.f32 s2, s6 +; ARMV7-NEXT: vmovlt.f32 s7, s3 ; ARMV7-NEXT: vmrs APSR_nzcv, fpscr -; ARMV7-NEXT: vcmp.f32 s5, s1 -; ARMV7-NEXT: vmovlt.f32 s2, s6 +; ARMV7-NEXT: vcmp.f32 s1, s5 +; ARMV7-NEXT: vmovlt.f32 s6, s2 ; ARMV7-NEXT: vmrs APSR_nzcv, fpscr -; ARMV7-NEXT: vcmp.f32 s4, s0 -; ARMV7-NEXT: vmovlt.f32 s1, s5 +; ARMV7-NEXT: vcmp.f32 s0, s4 +; ARMV7-NEXT: vmovlt.f32 s5, s1 ; ARMV7-NEXT: vmrs APSR_nzcv, fpscr -; ARMV7-NEXT: vmovlt.f32 s0, s4 -; ARMV7-NEXT: vmov r2, r3, d1 -; ARMV7-NEXT: vmov r0, r1, d0 +; ARMV7-NEXT: vmovlt.f32 s4, s0 +; ARMV7-NEXT: vmov r2, r3, d3 +; ARMV7-NEXT: vmov r0, r1, d2 ; ARMV7-NEXT: bx lr ; ; ARMV8-LABEL: fminnumv432_intrinsic: @@ -686,23 +686,23 @@ define <4 x float> @fmaxnumv432_intrinsic(<4 x float> %x, <4 x float> %y) { ; ARMV7-LABEL: fmaxnumv432_intrinsic: ; ARMV7: @ %bb.0: ; ARMV7-NEXT: mov r12, sp -; ARMV7-NEXT: vld1.64 {d0, d1}, [r12] -; ARMV7-NEXT: vmov d3, r2, r3 -; ARMV7-NEXT: vmov d2, r0, r1 -; ARMV7-NEXT: vcmp.f32 s7, s3 +; ARMV7-NEXT: vld1.64 {d2, d3}, [r12] +; ARMV7-NEXT: vmov d1, r2, r3 +; ARMV7-NEXT: vmov d0, r0, r1 +; ARMV7-NEXT: vcmp.f32 s3, s7 ; ARMV7-NEXT: vmrs APSR_nzcv, fpscr -; ARMV7-NEXT: vcmp.f32 s6, s2 -; ARMV7-NEXT: vmovgt.f32 
s3, s7 +; ARMV7-NEXT: vcmp.f32 s2, s6 +; ARMV7-NEXT: vmovgt.f32 s7, s3 ; ARMV7-NEXT: vmrs APSR_nzcv, fpscr -; ARMV7-NEXT: vcmp.f32 s5, s1 -; ARMV7-NEXT: vmovgt.f32 s2, s6 +; ARMV7-NEXT: vcmp.f32 s1, s5 +; ARMV7-NEXT: vmovgt.f32 s6, s2 ; ARMV7-NEXT: vmrs APSR_nzcv, fpscr -; ARMV7-NEXT: vcmp.f32 s4, s0 -; ARMV7-NEXT: vmovgt.f32 s1, s5 +; ARMV7-NEXT: vcmp.f32 s0, s4 +; ARMV7-NEXT: vmovgt.f32 s5, s1 ; ARMV7-NEXT: vmrs APSR_nzcv, fpscr -; ARMV7-NEXT: vmovgt.f32 s0, s4 -; ARMV7-NEXT: vmov r2, r3, d1 -; ARMV7-NEXT: vmov r0, r1, d0 +; ARMV7-NEXT: vmovgt.f32 s4, s0 +; ARMV7-NEXT: vmov r2, r3, d3 +; ARMV7-NEXT: vmov r0, r1, d2 ; ARMV7-NEXT: bx lr ; ; ARMV8-LABEL: fmaxnumv432_intrinsic: diff --git a/llvm/test/CodeGen/ARM/neon-copy.ll b/llvm/test/CodeGen/ARM/neon-copy.ll index e356b7e2181da0..fcb94f268b16f8 100644 --- a/llvm/test/CodeGen/ARM/neon-copy.ll +++ b/llvm/test/CodeGen/ARM/neon-copy.ll @@ -1301,17 +1301,17 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) { ; CHECK-NEXT: .pad #28 ; CHECK-NEXT: sub sp, sp, #28 ; CHECK-NEXT: bfc sp, #0, #4 -; CHECK-NEXT: vmov.u16 r1, d0[1] +; CHECK-NEXT: vmov.u16 r12, d0[1] ; CHECK-NEXT: and r0, r0, #7 ; CHECK-NEXT: vmov.u16 r2, d0[2] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vmov.u16 r12, d0[3] +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmov.u16 r3, d0[3] ; CHECK-NEXT: lsl r0, r0, #1 -; CHECK-NEXT: vst1.64 {d0, d1}, [r3:128], r0 -; CHECK-NEXT: vld1.16 {d0[0]}, [r3:16] -; CHECK-NEXT: vmov.16 d0[1], r1 +; CHECK-NEXT: vst1.64 {d0, d1}, [r1:128], r0 +; CHECK-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-NEXT: vmov.16 d0[1], r12 ; CHECK-NEXT: vmov.16 d0[2], r2 -; CHECK-NEXT: vmov.16 d0[3], r12 +; CHECK-NEXT: vmov.16 d0[3], r3 ; CHECK-NEXT: mov sp, r11 ; CHECK-NEXT: pop {r11} ; CHECK-NEXT: bx lr @@ -1331,17 +1331,17 @@ define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) { ; CHECK: @ %bb.0: ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, sp, #8 -; CHECK-NEXT: vmov.u16 r1, d0[1] +; CHECK-NEXT: vmov.u16 r12, d0[1] ; CHECK-NEXT: and r0, r0, #3 ; CHECK-NEXT: vmov.u16 r2, d0[2] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vmov.u16 r12, d0[3] -; CHECK-NEXT: orr r0, r3, r0, lsl #1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmov.u16 r3, d0[3] +; CHECK-NEXT: orr r0, r1, r0, lsl #1 ; CHECK-NEXT: vst1.16 {d0[0]}, [r0:16] ; CHECK-NEXT: vldr d0, [sp] -; CHECK-NEXT: vmov.16 d0[1], r1 +; CHECK-NEXT: vmov.16 d0[1], r12 ; CHECK-NEXT: vmov.16 d0[2], r2 -; CHECK-NEXT: vmov.16 d0[3], r12 +; CHECK-NEXT: vmov.16 d0[3], r3 ; CHECK-NEXT: add sp, sp, #8 ; CHECK-NEXT: bx lr %tmp = extractelement <8 x i16> %x, i32 0 diff --git a/llvm/test/CodeGen/ARM/select_const.ll b/llvm/test/CodeGen/ARM/select_const.ll index e12dd02f16c2fa..26f07e3e89a52f 100644 --- a/llvm/test/CodeGen/ARM/select_const.ll +++ b/llvm/test/CodeGen/ARM/select_const.ll @@ -671,22 +671,22 @@ define i64 @opaque_constant1(i1 %cond, i64 %x) { ; THUMB-NEXT: cmp r0, #0 ; THUMB-NEXT: bne .LBB24_2 ; THUMB-NEXT: @ %bb.1: -; THUMB-NEXT: movs r5, #23 +; THUMB-NEXT: movs r6, #23 ; THUMB-NEXT: b .LBB24_3 ; THUMB-NEXT: .LBB24_2: ; THUMB-NEXT: movs r0, #3 -; THUMB-NEXT: mvns r5, r0 +; THUMB-NEXT: mvns r6, r0 ; THUMB-NEXT: .LBB24_3: ; THUMB-NEXT: ldr r0, .LCPI24_0 -; THUMB-NEXT: ands r5, r0 -; THUMB-NEXT: movs r6, #0 -; THUMB-NEXT: subs r0, r5, #1 +; THUMB-NEXT: ands r6, r0 +; THUMB-NEXT: movs r5, #0 +; THUMB-NEXT: subs r0, r6, #1 ; THUMB-NEXT: push {r4} ; THUMB-NEXT: pop {r1} -; THUMB-NEXT: sbcs r1, r6 +; THUMB-NEXT: sbcs r1, r5 ; THUMB-NEXT: eors r3, r7 -; THUMB-NEXT: ldr r6, .LCPI24_0 -; THUMB-NEXT: 
eors r2, r6 +; THUMB-NEXT: ldr r5, .LCPI24_0 +; THUMB-NEXT: eors r2, r5 ; THUMB-NEXT: orrs r2, r3 ; THUMB-NEXT: beq .LBB24_5 ; THUMB-NEXT: @ %bb.4: @@ -695,7 +695,7 @@ define i64 @opaque_constant1(i1 %cond, i64 %x) { ; THUMB-NEXT: cmp r2, #0 ; THUMB-NEXT: beq .LBB24_7 ; THUMB-NEXT: @ %bb.6: -; THUMB-NEXT: movs r0, r5 +; THUMB-NEXT: movs r0, r6 ; THUMB-NEXT: .LBB24_7: ; THUMB-NEXT: pop {r4, r5, r6, r7} ; THUMB-NEXT: pop {r2} diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll index a4e081d5384e5e..73665360a10d7f 100644 --- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll @@ -362,40 +362,40 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; ARM7: @ %bb.0: ; ARM7-NEXT: push {r4, r5, r6, r7, r11, lr} ; ARM7-NEXT: vpush {d8, d9} -; ARM7-NEXT: mov r6, r0 +; ARM7-NEXT: mov r5, r0 ; ARM7-NEXT: and r0, r3, #1 -; ARM7-NEXT: mov r5, r1 +; ARM7-NEXT: mov r4, r1 ; ARM7-NEXT: rsb r1, r0, #0 ; ARM7-NEXT: mov r0, r2 ; ARM7-NEXT: mov r2, #9 ; ARM7-NEXT: mov r3, #0 ; ARM7-NEXT: bl __moddi3 -; ARM7-NEXT: mov r7, r0 -; ARM7-NEXT: and r0, r5, #1 -; ARM7-NEXT: mov r4, r1 +; ARM7-NEXT: mov r6, r0 +; ARM7-NEXT: and r0, r4, #1 +; ARM7-NEXT: mov r7, r1 ; ARM7-NEXT: rsb r1, r0, #0 -; ARM7-NEXT: mov r0, r6 +; ARM7-NEXT: mov r0, r5 ; ARM7-NEXT: mov r2, #9 ; ARM7-NEXT: mov r3, #0 ; ARM7-NEXT: bl __moddi3 ; ARM7-NEXT: vmov.32 d8[0], r0 ; ARM7-NEXT: ldr r0, [sp, #44] ; ARM7-NEXT: ldr r2, [sp, #40] -; ARM7-NEXT: mov r5, r1 +; ARM7-NEXT: mov r4, r1 ; ARM7-NEXT: and r0, r0, #1 ; ARM7-NEXT: mvn r3, #0 ; ARM7-NEXT: rsb r1, r0, #0 -; ARM7-NEXT: vmov.32 d9[0], r7 +; ARM7-NEXT: vmov.32 d9[0], r6 ; ARM7-NEXT: mov r0, r2 ; ARM7-NEXT: mvn r2, #8 ; ARM7-NEXT: bl __moddi3 ; ARM7-NEXT: vmov.32 d16[0], r0 ; ARM7-NEXT: adr r0, .LCPI3_0 -; ARM7-NEXT: vmov.32 d9[1], r4 +; ARM7-NEXT: vmov.32 d9[1], r7 ; ARM7-NEXT: vld1.64 {d18, d19}, [r0:128] ; ARM7-NEXT: adr r0, .LCPI3_1 ; ARM7-NEXT: vmov.32 d16[1], r1 -; ARM7-NEXT: vmov.32 d8[1], r5 +; ARM7-NEXT: vmov.32 d8[1], r4 ; ARM7-NEXT: vand q8, q8, q9 ; ARM7-NEXT: vld1.64 {d20, d21}, [r0:128] ; ARM7-NEXT: adr r0, .LCPI3_2 @@ -438,40 +438,40 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; ARM8: @ %bb.0: ; ARM8-NEXT: push {r4, r5, r6, r7, r11, lr} ; ARM8-NEXT: vpush {d8, d9} -; ARM8-NEXT: mov r6, r0 +; ARM8-NEXT: mov r5, r0 ; ARM8-NEXT: and r0, r3, #1 -; ARM8-NEXT: mov r5, r1 +; ARM8-NEXT: mov r4, r1 ; ARM8-NEXT: rsb r1, r0, #0 ; ARM8-NEXT: mov r0, r2 ; ARM8-NEXT: mov r2, #9 ; ARM8-NEXT: mov r3, #0 ; ARM8-NEXT: bl __moddi3 -; ARM8-NEXT: mov r7, r0 -; ARM8-NEXT: and r0, r5, #1 -; ARM8-NEXT: mov r4, r1 +; ARM8-NEXT: mov r6, r0 +; ARM8-NEXT: and r0, r4, #1 +; ARM8-NEXT: mov r7, r1 ; ARM8-NEXT: rsb r1, r0, #0 -; ARM8-NEXT: mov r0, r6 +; ARM8-NEXT: mov r0, r5 ; ARM8-NEXT: mov r2, #9 ; ARM8-NEXT: mov r3, #0 ; ARM8-NEXT: bl __moddi3 ; ARM8-NEXT: vmov.32 d8[0], r0 ; ARM8-NEXT: ldr r0, [sp, #44] ; ARM8-NEXT: ldr r2, [sp, #40] -; ARM8-NEXT: mov r5, r1 +; ARM8-NEXT: mov r4, r1 ; ARM8-NEXT: and r0, r0, #1 ; ARM8-NEXT: mvn r3, #0 ; ARM8-NEXT: rsb r1, r0, #0 -; ARM8-NEXT: vmov.32 d9[0], r7 +; ARM8-NEXT: vmov.32 d9[0], r6 ; ARM8-NEXT: mov r0, r2 ; ARM8-NEXT: mvn r2, #8 ; ARM8-NEXT: bl __moddi3 ; ARM8-NEXT: vmov.32 d16[0], r0 ; ARM8-NEXT: adr r0, .LCPI3_0 -; ARM8-NEXT: vmov.32 d9[1], r4 +; ARM8-NEXT: vmov.32 d9[1], r7 ; ARM8-NEXT: vld1.64 {d18, d19}, [r0:128] ; ARM8-NEXT: adr r0, .LCPI3_1 ; ARM8-NEXT: vmov.32 d16[1], r1 -; ARM8-NEXT: vmov.32 d8[1], r5 +; ARM8-NEXT: 
vmov.32 d8[1], r4 ; ARM8-NEXT: vand q8, q8, q9 ; ARM8-NEXT: vld1.64 {d20, d21}, [r0:128] ; ARM8-NEXT: adr r0, .LCPI3_2 @@ -514,40 +514,40 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; NEON7: @ %bb.0: ; NEON7-NEXT: push {r4, r5, r6, r7, r11, lr} ; NEON7-NEXT: vpush {d8, d9} -; NEON7-NEXT: mov r6, r0 +; NEON7-NEXT: mov r5, r0 ; NEON7-NEXT: and r0, r3, #1 -; NEON7-NEXT: mov r5, r1 +; NEON7-NEXT: mov r4, r1 ; NEON7-NEXT: rsb r1, r0, #0 ; NEON7-NEXT: mov r0, r2 ; NEON7-NEXT: mov r2, #9 ; NEON7-NEXT: mov r3, #0 ; NEON7-NEXT: bl __moddi3 -; NEON7-NEXT: mov r7, r0 -; NEON7-NEXT: and r0, r5, #1 -; NEON7-NEXT: mov r4, r1 +; NEON7-NEXT: mov r6, r0 +; NEON7-NEXT: and r0, r4, #1 +; NEON7-NEXT: mov r7, r1 ; NEON7-NEXT: rsb r1, r0, #0 -; NEON7-NEXT: mov r0, r6 +; NEON7-NEXT: mov r0, r5 ; NEON7-NEXT: mov r2, #9 ; NEON7-NEXT: mov r3, #0 ; NEON7-NEXT: bl __moddi3 ; NEON7-NEXT: vmov.32 d8[0], r0 ; NEON7-NEXT: ldr r0, [sp, #44] ; NEON7-NEXT: ldr r2, [sp, #40] -; NEON7-NEXT: mov r5, r1 +; NEON7-NEXT: mov r4, r1 ; NEON7-NEXT: and r0, r0, #1 ; NEON7-NEXT: mvn r3, #0 ; NEON7-NEXT: rsb r1, r0, #0 -; NEON7-NEXT: vmov.32 d9[0], r7 +; NEON7-NEXT: vmov.32 d9[0], r6 ; NEON7-NEXT: mov r0, r2 ; NEON7-NEXT: mvn r2, #8 ; NEON7-NEXT: bl __moddi3 ; NEON7-NEXT: vmov.32 d16[0], r0 ; NEON7-NEXT: adr r0, .LCPI3_0 -; NEON7-NEXT: vmov.32 d9[1], r4 +; NEON7-NEXT: vmov.32 d9[1], r7 ; NEON7-NEXT: vld1.64 {d18, d19}, [r0:128] ; NEON7-NEXT: adr r0, .LCPI3_1 ; NEON7-NEXT: vmov.32 d16[1], r1 -; NEON7-NEXT: vmov.32 d8[1], r5 +; NEON7-NEXT: vmov.32 d8[1], r4 ; NEON7-NEXT: vand q8, q8, q9 ; NEON7-NEXT: vld1.64 {d20, d21}, [r0:128] ; NEON7-NEXT: adr r0, .LCPI3_2 @@ -590,40 +590,40 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; NEON8: @ %bb.0: ; NEON8-NEXT: push {r4, r5, r6, r7, r11, lr} ; NEON8-NEXT: vpush {d8, d9} -; NEON8-NEXT: mov r6, r0 +; NEON8-NEXT: mov r5, r0 ; NEON8-NEXT: and r0, r3, #1 -; NEON8-NEXT: mov r5, r1 +; NEON8-NEXT: mov r4, r1 ; NEON8-NEXT: rsb r1, r0, #0 ; NEON8-NEXT: mov r0, r2 ; NEON8-NEXT: mov r2, #9 ; NEON8-NEXT: mov r3, #0 ; NEON8-NEXT: bl __moddi3 -; NEON8-NEXT: mov r7, r0 -; NEON8-NEXT: and r0, r5, #1 -; NEON8-NEXT: mov r4, r1 +; NEON8-NEXT: mov r6, r0 +; NEON8-NEXT: and r0, r4, #1 +; NEON8-NEXT: mov r7, r1 ; NEON8-NEXT: rsb r1, r0, #0 -; NEON8-NEXT: mov r0, r6 +; NEON8-NEXT: mov r0, r5 ; NEON8-NEXT: mov r2, #9 ; NEON8-NEXT: mov r3, #0 ; NEON8-NEXT: bl __moddi3 ; NEON8-NEXT: vmov.32 d8[0], r0 ; NEON8-NEXT: ldr r0, [sp, #44] ; NEON8-NEXT: ldr r2, [sp, #40] -; NEON8-NEXT: mov r5, r1 +; NEON8-NEXT: mov r4, r1 ; NEON8-NEXT: and r0, r0, #1 ; NEON8-NEXT: mvn r3, #0 ; NEON8-NEXT: rsb r1, r0, #0 -; NEON8-NEXT: vmov.32 d9[0], r7 +; NEON8-NEXT: vmov.32 d9[0], r6 ; NEON8-NEXT: mov r0, r2 ; NEON8-NEXT: mvn r2, #8 ; NEON8-NEXT: bl __moddi3 ; NEON8-NEXT: vmov.32 d16[0], r0 ; NEON8-NEXT: adr r0, .LCPI3_0 -; NEON8-NEXT: vmov.32 d9[1], r4 +; NEON8-NEXT: vmov.32 d9[1], r7 ; NEON8-NEXT: vld1.64 {d18, d19}, [r0:128] ; NEON8-NEXT: adr r0, .LCPI3_1 ; NEON8-NEXT: vmov.32 d16[1], r1 -; NEON8-NEXT: vmov.32 d8[1], r5 +; NEON8-NEXT: vmov.32 d8[1], r4 ; NEON8-NEXT: vand q8, q8, q9 ; NEON8-NEXT: vld1.64 {d20, d21}, [r0:128] ; NEON8-NEXT: adr r0, .LCPI3_2 diff --git a/llvm/test/CodeGen/ARM/swifterror.ll b/llvm/test/CodeGen/ARM/swifterror.ll index 4f950ba6876080..10c790438d9014 100644 --- a/llvm/test/CodeGen/ARM/swifterror.ll +++ b/llvm/test/CodeGen/ARM/swifterror.ll @@ -1122,9 +1122,9 @@ define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i3 ; CHECK-APPLE-NEXT: add r7, sp, #20 ; 
CHECK-APPLE-NEXT: sub sp, sp, #20 ; CHECK-APPLE-NEXT: bfc sp, #0, #3 -; CHECK-APPLE-NEXT: mov r6, r8 -; CHECK-APPLE-NEXT: str r10, [sp, #12] @ 4-byte Spill -; CHECK-APPLE-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-APPLE-NEXT: str r8, [sp, #12] @ 4-byte Spill +; CHECK-APPLE-NEXT: mov r6, r3 +; CHECK-APPLE-NEXT: str r10, [sp, #4] @ 4-byte Spill ; CHECK-APPLE-NEXT: mov r4, r2 ; CHECK-APPLE-NEXT: mov r11, r1 ; CHECK-APPLE-NEXT: mov r5, r0 @@ -1135,17 +1135,17 @@ define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i3 ; CHECK-APPLE-NEXT: mov r10, #0 ; CHECK-APPLE-NEXT: mov r8, #0 ; CHECK-APPLE-NEXT: bl _params_in_reg2 -; CHECK-APPLE-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; CHECK-APPLE-NEXT: str r8, [sp, #8] @ 4-byte Spill ; CHECK-APPLE-NEXT: mov r0, r5 -; CHECK-APPLE-NEXT: ldr r10, [sp, #12] @ 4-byte Reload +; CHECK-APPLE-NEXT: ldr r10, [sp, #4] @ 4-byte Reload ; CHECK-APPLE-NEXT: mov r1, r11 -; CHECK-APPLE-NEXT: str r8, [sp, #4] @ 4-byte Spill +; CHECK-APPLE-NEXT: ldr r8, [sp, #12] @ 4-byte Reload ; CHECK-APPLE-NEXT: mov r2, r4 -; CHECK-APPLE-NEXT: mov r8, r6 +; CHECK-APPLE-NEXT: mov r3, r6 ; CHECK-APPLE-NEXT: bl _params_and_return_in_reg2 ; CHECK-APPLE-NEXT: str r8, [sp, #12] @ 4-byte Spill ; CHECK-APPLE-NEXT: mov r4, r0 -; CHECK-APPLE-NEXT: ldr r8, [sp, #4] @ 4-byte Reload +; CHECK-APPLE-NEXT: ldr r8, [sp, #8] @ 4-byte Reload ; CHECK-APPLE-NEXT: mov r5, r1 ; CHECK-APPLE-NEXT: mov r6, r2 ; CHECK-APPLE-NEXT: mov r11, r3 diff --git a/llvm/test/CodeGen/ARM/thumb2-size-opt.ll b/llvm/test/CodeGen/ARM/thumb2-size-opt.ll index 8cf7a702e8ed54..e15ff27f92fa3d 100644 --- a/llvm/test/CodeGen/ARM/thumb2-size-opt.ll +++ b/llvm/test/CodeGen/ARM/thumb2-size-opt.ll @@ -85,8 +85,8 @@ entry: define i32 @bundled_instruction(ptr %addr, ptr %addr2, i1 %tst) minsize { ; CHECK-LABEL: bundled_instruction: -; CHECK: iteee ne -; CHECK: ldmeq r2!, {{{r[0-9]+}}} +; CHECK: itee ne +; CHECK: ldmeq r0!, {{{r[0-9]+}}} br i1 %tst, label %true, label %false true: diff --git a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll index afd75940b45932..bcff1ce943c5df 100644 --- a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll @@ -7,38 +7,35 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; ARMV6: @ %bb.0: @ %start ; ARMV6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ARMV6-NEXT: sub sp, sp, #28 -; ARMV6-NEXT: ldr r7, [sp, #72] -; ARMV6-NEXT: mov r6, r0 +; ARMV6-NEXT: ldr lr, [sp, #72] +; ARMV6-NEXT: mov r7, r0 ; ARMV6-NEXT: str r0, [sp, #8] @ 4-byte Spill ; ARMV6-NEXT: ldr r4, [sp, #84] -; ARMV6-NEXT: umull r1, r0, r2, r7 -; ARMV6-NEXT: mov lr, r7 -; ARMV6-NEXT: umull r5, r10, r4, r2 -; ARMV6-NEXT: str r1, [r6] -; ARMV6-NEXT: ldr r6, [sp, #80] -; ARMV6-NEXT: umull r1, r7, r3, r6 -; ARMV6-NEXT: str r7, [sp, #12] @ 4-byte Spill +; ARMV6-NEXT: umull r1, r0, r2, lr +; ARMV6-NEXT: umull r5, r9, r4, r2 +; ARMV6-NEXT: str r1, [r7] +; ARMV6-NEXT: ldr r7, [sp, #80] +; ARMV6-NEXT: umull r1, r6, r3, r7 +; ARMV6-NEXT: str r6, [sp, #12] @ 4-byte Spill ; ARMV6-NEXT: add r1, r5, r1 -; ARMV6-NEXT: umull r7, r5, r6, r2 -; ARMV6-NEXT: mov r6, lr +; ARMV6-NEXT: umull r7, r5, r7, r2 +; ARMV6-NEXT: ldr r6, [sp, #64] ; ARMV6-NEXT: str r7, [sp, #16] @ 4-byte Spill ; ARMV6-NEXT: mov r7, #0 ; ARMV6-NEXT: adds r1, r5, r1 ; ARMV6-NEXT: str r1, [sp, #4] @ 4-byte Spill ; ARMV6-NEXT: adc r1, r7, #0 ; ARMV6-NEXT: str r1, [sp, #24] @ 4-byte Spill -; ARMV6-NEXT: ldr 
r1, [sp, #64] ; ARMV6-NEXT: ldr r7, [sp, #76] -; ARMV6-NEXT: ldr r5, [sp, #64] -; ARMV6-NEXT: umull r12, r9, r7, r1 +; ARMV6-NEXT: mov r5, lr ; ARMV6-NEXT: ldr r1, [sp, #68] -; ARMV6-NEXT: umull r11, r8, r1, lr +; ARMV6-NEXT: umull r12, r8, r7, r6 +; ARMV6-NEXT: umull r11, lr, r1, lr ; ARMV6-NEXT: add r12, r11, r12 -; ARMV6-NEXT: umull r11, lr, r5, lr -; ARMV6-NEXT: mov r5, r6 +; ARMV6-NEXT: umull r11, r10, r6, r5 ; ARMV6-NEXT: mov r6, #0 -; ARMV6-NEXT: adds r12, lr, r12 -; ARMV6-NEXT: umull r2, lr, r2, r7 +; ARMV6-NEXT: adds r12, r10, r12 +; ARMV6-NEXT: umull r2, r10, r2, r7 ; ARMV6-NEXT: adc r6, r6, #0 ; ARMV6-NEXT: str r6, [sp, #20] @ 4-byte Spill ; ARMV6-NEXT: ldr r6, [sp, #16] @ 4-byte Reload @@ -47,29 +44,29 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; ARMV6-NEXT: adc r6, r12, r6 ; ARMV6-NEXT: mov r12, #0 ; ARMV6-NEXT: umlal r0, r12, r3, r5 -; ARMV6-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; ARMV6-NEXT: str r6, [sp, #16] @ 4-byte Spill -; ARMV6-NEXT: ldr r6, [sp, #64] +; ARMV6-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; ARMV6-NEXT: ldr r5, [sp, #64] ; ARMV6-NEXT: adds r0, r2, r0 -; ARMV6-NEXT: str r0, [r5, #4] -; ARMV6-NEXT: adcs r0, r12, lr +; ARMV6-NEXT: str r0, [r6, #4] +; ARMV6-NEXT: adcs r0, r12, r10 ; ARMV6-NEXT: mov r2, #0 ; ARMV6-NEXT: adc r2, r2, #0 -; ARMV6-NEXT: orrs lr, r6, r1 -; ARMV6-NEXT: ldr r6, [sp, #80] -; ARMV6-NEXT: movne lr, #1 +; ARMV6-NEXT: orrs r10, r5, r1 +; ARMV6-NEXT: ldr r5, [sp, #80] +; ARMV6-NEXT: movne r10, #1 ; ARMV6-NEXT: umlal r0, r2, r3, r7 -; ARMV6-NEXT: orrs r12, r6, r4 +; ARMV6-NEXT: orrs r12, r5, r4 ; ARMV6-NEXT: movne r12, #1 -; ARMV6-NEXT: cmp r9, #0 -; ARMV6-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; ARMV6-NEXT: movne r9, #1 ; ARMV6-NEXT: cmp r8, #0 +; ARMV6-NEXT: ldr r5, [sp, #12] @ 4-byte Reload ; ARMV6-NEXT: movne r8, #1 -; ARMV6-NEXT: cmp r6, #0 -; ARMV6-NEXT: movne r6, #1 -; ARMV6-NEXT: cmp r10, #0 -; ARMV6-NEXT: movne r10, #1 +; ARMV6-NEXT: cmp lr, #0 +; ARMV6-NEXT: movne lr, #1 +; ARMV6-NEXT: cmp r5, #0 +; ARMV6-NEXT: movne r5, #1 +; ARMV6-NEXT: cmp r9, #0 +; ARMV6-NEXT: movne r9, #1 ; ARMV6-NEXT: cmp r1, #0 ; ARMV6-NEXT: movne r1, #1 ; ARMV6-NEXT: cmp r7, #0 @@ -79,28 +76,28 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; ARMV6-NEXT: cmp r3, #0 ; ARMV6-NEXT: movne r3, #1 ; ARMV6-NEXT: adds r0, r0, r11 -; ARMV6-NEXT: str r0, [r5, #8] +; ARMV6-NEXT: str r0, [r6, #8] ; ARMV6-NEXT: and r1, r1, r7 ; ARMV6-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; ARMV6-NEXT: orr r1, r1, lr ; ARMV6-NEXT: orr r1, r1, r8 -; ARMV6-NEXT: orr r1, r1, r9 ; ARMV6-NEXT: adcs r0, r2, r0 -; ARMV6-NEXT: str r0, [r5, #12] +; ARMV6-NEXT: str r0, [r6, #12] ; ARMV6-NEXT: and r0, r4, r3 ; ARMV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload -; ARMV6-NEXT: orr r0, r0, r10 -; ARMV6-NEXT: orr r0, r0, r6 +; ARMV6-NEXT: orr r0, r0, r9 +; ARMV6-NEXT: orr r0, r0, r5 ; ARMV6-NEXT: orr r0, r0, r2 ; ARMV6-NEXT: ldr r2, [sp, #20] @ 4-byte Reload ; ARMV6-NEXT: orr r1, r1, r2 -; ARMV6-NEXT: and r2, lr, r12 +; ARMV6-NEXT: and r2, r10, r12 ; ARMV6-NEXT: orr r1, r2, r1 ; ARMV6-NEXT: orr r0, r1, r0 ; ARMV6-NEXT: mov r1, #0 ; ARMV6-NEXT: adc r1, r1, #0 ; ARMV6-NEXT: orr r0, r0, r1 ; ARMV6-NEXT: and r0, r0, #1 -; ARMV6-NEXT: strb r0, [r5, #16] +; ARMV6-NEXT: strb r0, [r6, #16] ; ARMV6-NEXT: add sp, sp, #28 ; ARMV6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; @@ -108,101 +105,98 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; ARMV7: @ %bb.0: @ %start ; ARMV7-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; 
ARMV7-NEXT: sub sp, sp, #36 -; ARMV7-NEXT: ldr r5, [sp, #84] +; ARMV7-NEXT: ldr r4, [sp, #84] ; ARMV7-NEXT: mov r8, r0 ; ARMV7-NEXT: ldr r1, [sp, #72] -; ARMV7-NEXT: ldr r10, [sp, #80] -; ARMV7-NEXT: ldr r9, [sp, #76] -; ARMV7-NEXT: umull r4, lr, r5, r1 -; ARMV7-NEXT: umull r0, r7, r2, r10 -; ARMV7-NEXT: str r4, [sp, #24] @ 4-byte Spill -; ARMV7-NEXT: ldr r4, [sp, #88] -; ARMV7-NEXT: umull r1, r6, r1, r10 -; ARMV7-NEXT: str r0, [sp, #32] @ 4-byte Spill -; ARMV7-NEXT: umull r11, r0, r2, r5 -; ARMV7-NEXT: str r6, [sp, #20] @ 4-byte Spill +; ARMV7-NEXT: ldr r9, [sp, #80] +; ARMV7-NEXT: umull r7, r6, r4, r1 +; ARMV7-NEXT: umull r1, r5, r1, r9 +; ARMV7-NEXT: str r7, [sp, #24] @ 4-byte Spill +; ARMV7-NEXT: ldr r7, [sp, #88] +; ARMV7-NEXT: umull r0, r12, r2, r9 +; ARMV7-NEXT: str r5, [sp, #16] @ 4-byte Spill ; ARMV7-NEXT: str r1, [sp, #28] @ 4-byte Spill -; ARMV7-NEXT: umull r6, r12, r3, r4 +; ARMV7-NEXT: umull r5, lr, r3, r7 ; ARMV7-NEXT: ldr r1, [sp, #92] -; ARMV7-NEXT: str r0, [sp, #8] @ 4-byte Spill +; ARMV7-NEXT: str r0, [sp, #32] @ 4-byte Spill +; ARMV7-NEXT: umull r0, r11, r2, r4 +; ARMV7-NEXT: str r5, [sp, #12] @ 4-byte Spill +; ARMV7-NEXT: umull r5, r1, r1, r2 +; ARMV7-NEXT: str r0, [sp, #20] @ 4-byte Spill ; ARMV7-NEXT: mov r0, #0 -; ARMV7-NEXT: umlal r7, r0, r3, r10 -; ARMV7-NEXT: str r6, [sp, #16] @ 4-byte Spill -; ARMV7-NEXT: umull r6, r1, r1, r2 -; ARMV7-NEXT: umull r2, r4, r4, r2 -; ARMV7-NEXT: str r6, [sp, #4] @ 4-byte Spill -; ARMV7-NEXT: str r2, [sp, #12] @ 4-byte Spill -; ARMV7-NEXT: adds r2, r11, r7 -; ARMV7-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; ARMV7-NEXT: mov r11, #0 -; ARMV7-NEXT: str r4, [sp] @ 4-byte Spill -; ARMV7-NEXT: umull r6, r4, r9, r10 -; ARMV7-NEXT: adcs r9, r0, r7 -; ARMV7-NEXT: ldr r0, [sp, #32] @ 4-byte Reload -; ARMV7-NEXT: adc r10, r11, #0 -; ARMV7-NEXT: stm r8, {r0, r2} -; ARMV7-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; ARMV7-NEXT: umlal r9, r10, r3, r5 +; ARMV7-NEXT: umull r2, r10, r7, r2 +; ARMV7-NEXT: str r5, [sp, #4] @ 4-byte Spill +; ARMV7-NEXT: ldr r5, [sp, #76] +; ARMV7-NEXT: umlal r12, r0, r3, r9 +; ARMV7-NEXT: str r2, [sp, #8] @ 4-byte Spill ; ARMV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload -; ARMV7-NEXT: add r0, r6, r0 -; ARMV7-NEXT: adds r0, r2, r0 -; ARMV7-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; ARMV7-NEXT: adc r2, r11, #0 -; ARMV7-NEXT: str r2, [sp, #32] @ 4-byte Spill -; ARMV7-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; ARMV7-NEXT: ldr r7, [sp, #28] @ 4-byte Reload -; ARMV7-NEXT: add r2, r6, r2 -; ARMV7-NEXT: ldr r6, [sp] @ 4-byte Reload -; ARMV7-NEXT: adds r2, r6, r2 -; ARMV7-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; ARMV7-NEXT: adc r11, r11, #0 -; ARMV7-NEXT: adds r7, r7, r6 -; ARMV7-NEXT: ldr r6, [sp, #92] -; ARMV7-NEXT: adc r0, r0, r2 -; ARMV7-NEXT: str r0, [sp, #28] @ 4-byte Spill -; ARMV7-NEXT: ldr r0, [sp, #92] +; ARMV7-NEXT: umull r7, r5, r5, r9 +; ARMV7-NEXT: adds r2, r2, r12 +; ARMV7-NEXT: adcs r9, r0, r11 +; ARMV7-NEXT: ldr r11, [sp, #32] @ 4-byte Reload +; ARMV7-NEXT: str r11, [r8] +; ARMV7-NEXT: mov r0, #0 +; ARMV7-NEXT: str r2, [r8, #4] +; ARMV7-NEXT: adc r12, r0, #0 +; ARMV7-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; ARMV7-NEXT: umlal r9, r12, r3, r4 +; ARMV7-NEXT: add r2, r7, r2 +; ARMV7-NEXT: ldr r7, [sp, #16] @ 4-byte Reload +; ARMV7-NEXT: adds r11, r7, r2 +; ARMV7-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; ARMV7-NEXT: adc r7, r0, #0 +; ARMV7-NEXT: str r7, [sp, #32] @ 4-byte Spill +; ARMV7-NEXT: ldr r7, [sp, #12] @ 4-byte Reload +; ARMV7-NEXT: add r7, r2, r7 +; ARMV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; ARMV7-NEXT: 
adds r7, r10, r7 +; ARMV7-NEXT: adc r10, r0, #0 +; ARMV7-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; ARMV7-NEXT: adds r0, r0, r2 +; ARMV7-NEXT: adc r11, r11, r7 +; ARMV7-NEXT: ldr r7, [sp, #92] ; ARMV7-NEXT: cmp r3, #0 ; ARMV7-NEXT: movwne r3, #1 -; ARMV7-NEXT: ldr r2, [sp, #76] -; ARMV7-NEXT: cmp r0, #0 -; ARMV7-NEXT: movwne r0, #1 +; ARMV7-NEXT: cmp r7, #0 +; ARMV7-NEXT: mov r2, r7 +; ARMV7-NEXT: movwne r2, #1 ; ARMV7-NEXT: cmp r1, #0 +; ARMV7-NEXT: and r2, r2, r3 +; ARMV7-NEXT: ldr r3, [sp, #76] ; ARMV7-NEXT: movwne r1, #1 -; ARMV7-NEXT: cmp r12, #0 -; ARMV7-NEXT: and r0, r0, r3 -; ARMV7-NEXT: movwne r12, #1 -; ARMV7-NEXT: cmp r5, #0 -; ARMV7-NEXT: orr r0, r0, r1 -; ARMV7-NEXT: movwne r5, #1 -; ARMV7-NEXT: cmp r2, #0 -; ARMV7-NEXT: mov r1, r2 -; ARMV7-NEXT: mov r3, r2 -; ARMV7-NEXT: movwne r1, #1 -; ARMV7-NEXT: cmp r4, #0 -; ARMV7-NEXT: ldr r2, [sp, #72] -; ARMV7-NEXT: movwne r4, #1 ; ARMV7-NEXT: cmp lr, #0 -; ARMV7-NEXT: and r1, r1, r5 ; ARMV7-NEXT: movwne lr, #1 -; ARMV7-NEXT: orrs r2, r2, r3 -; ARMV7-NEXT: ldr r3, [sp, #88] +; ARMV7-NEXT: cmp r4, #0 +; ARMV7-NEXT: orr r1, r2, r1 +; ARMV7-NEXT: movwne r4, #1 +; ARMV7-NEXT: cmp r3, #0 +; ARMV7-NEXT: mov r2, r3 ; ARMV7-NEXT: movwne r2, #1 -; ARMV7-NEXT: orr r1, r1, r4 -; ARMV7-NEXT: orr r0, r0, r12 -; ARMV7-NEXT: orrs r3, r3, r6 +; ARMV7-NEXT: cmp r5, #0 +; ARMV7-NEXT: movwne r5, #1 +; ARMV7-NEXT: and r2, r2, r4 +; ARMV7-NEXT: cmp r6, #0 ; ARMV7-NEXT: orr r1, r1, lr +; ARMV7-NEXT: mov lr, r3 +; ARMV7-NEXT: orr r2, r2, r5 +; ARMV7-NEXT: movwne r6, #1 +; ARMV7-NEXT: ldr r3, [sp, #72] +; ARMV7-NEXT: orr r2, r2, r6 +; ARMV7-NEXT: ldr r6, [sp, #88] +; ARMV7-NEXT: orrs r3, r3, lr +; ARMV7-NEXT: orr r1, r1, r10 ; ARMV7-NEXT: movwne r3, #1 -; ARMV7-NEXT: adds r7, r9, r7 -; ARMV7-NEXT: str r7, [r8, #8] -; ARMV7-NEXT: and r2, r2, r3 -; ARMV7-NEXT: ldr r7, [sp, #28] @ 4-byte Reload -; ARMV7-NEXT: orr r0, r0, r11 -; ARMV7-NEXT: adcs r7, r10, r7 -; ARMV7-NEXT: str r7, [r8, #12] -; ARMV7-NEXT: ldr r7, [sp, #32] @ 4-byte Reload -; ARMV7-NEXT: orr r1, r1, r7 -; ARMV7-NEXT: orr r1, r2, r1 -; ARMV7-NEXT: orr r0, r1, r0 +; ARMV7-NEXT: orrs r7, r6, r7 +; ARMV7-NEXT: movwne r7, #1 +; ARMV7-NEXT: adds r0, r9, r0 +; ARMV7-NEXT: str r0, [r8, #8] +; ARMV7-NEXT: adcs r0, r12, r11 +; ARMV7-NEXT: str r0, [r8, #12] +; ARMV7-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; ARMV7-NEXT: orr r0, r2, r0 +; ARMV7-NEXT: and r2, r3, r7 +; ARMV7-NEXT: orr r0, r2, r0 +; ARMV7-NEXT: orr r0, r0, r1 ; ARMV7-NEXT: mov r1, #0 ; ARMV7-NEXT: adc r1, r1, #0 ; ARMV7-NEXT: orr r0, r0, r1 diff --git a/llvm/test/CodeGen/ARM/usub_sat.ll b/llvm/test/CodeGen/ARM/usub_sat.ll index 9c2fd3966ea984..f73b8523ac2390 100644 --- a/llvm/test/CodeGen/ARM/usub_sat.ll +++ b/llvm/test/CodeGen/ARM/usub_sat.ll @@ -41,26 +41,27 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: .save {r4, lr} ; CHECK-T1-NEXT: push {r4, lr} -; CHECK-T1-NEXT: mov r4, r1 -; CHECK-T1-NEXT: movs r1, #0 +; CHECK-T1-NEXT: movs r4, #0 ; CHECK-T1-NEXT: subs r2, r0, r2 -; CHECK-T1-NEXT: sbcs r4, r3 -; CHECK-T1-NEXT: mov r0, r1 -; CHECK-T1-NEXT: adcs r0, r1 +; CHECK-T1-NEXT: sbcs r1, r3 +; CHECK-T1-NEXT: mov r0, r4 +; CHECK-T1-NEXT: adcs r0, r4 ; CHECK-T1-NEXT: movs r3, #1 ; CHECK-T1-NEXT: eors r3, r0 -; CHECK-T1-NEXT: mov r0, r1 +; CHECK-T1-NEXT: mov r0, r4 ; CHECK-T1-NEXT: beq .LBB1_3 ; CHECK-T1-NEXT: @ %bb.1: ; CHECK-T1-NEXT: cmp r3, #0 ; CHECK-T1-NEXT: beq .LBB1_4 ; CHECK-T1-NEXT: .LBB1_2: +; CHECK-T1-NEXT: mov r1, r4 ; CHECK-T1-NEXT: pop {r4, pc} ; CHECK-T1-NEXT: .LBB1_3: ; CHECK-T1-NEXT: mov r0, r2 
; CHECK-T1-NEXT: cmp r3, #0 ; CHECK-T1-NEXT: bne .LBB1_2 ; CHECK-T1-NEXT: .LBB1_4: +; CHECK-T1-NEXT: mov r4, r1 ; CHECK-T1-NEXT: mov r1, r4 ; CHECK-T1-NEXT: pop {r4, pc} ; diff --git a/llvm/test/CodeGen/ARM/usub_sat_plus.ll b/llvm/test/CodeGen/ARM/usub_sat_plus.ll index 51ec83c707603b..20e1d2f83f0445 100644 --- a/llvm/test/CodeGen/ARM/usub_sat_plus.ll +++ b/llvm/test/CodeGen/ARM/usub_sat_plus.ll @@ -45,28 +45,29 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind { ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: .save {r4, lr} ; CHECK-T1-NEXT: push {r4, lr} -; CHECK-T1-NEXT: mov r2, r1 -; CHECK-T1-NEXT: movs r1, #0 +; CHECK-T1-NEXT: movs r2, #0 ; CHECK-T1-NEXT: ldr r4, [sp, #12] ; CHECK-T1-NEXT: ldr r3, [sp, #8] ; CHECK-T1-NEXT: subs r3, r0, r3 -; CHECK-T1-NEXT: sbcs r2, r4 -; CHECK-T1-NEXT: mov r0, r1 -; CHECK-T1-NEXT: adcs r0, r1 +; CHECK-T1-NEXT: sbcs r1, r4 +; CHECK-T1-NEXT: mov r0, r2 +; CHECK-T1-NEXT: adcs r0, r2 ; CHECK-T1-NEXT: movs r4, #1 ; CHECK-T1-NEXT: eors r4, r0 -; CHECK-T1-NEXT: mov r0, r1 +; CHECK-T1-NEXT: mov r0, r2 ; CHECK-T1-NEXT: beq .LBB1_3 ; CHECK-T1-NEXT: @ %bb.1: ; CHECK-T1-NEXT: cmp r4, #0 ; CHECK-T1-NEXT: beq .LBB1_4 ; CHECK-T1-NEXT: .LBB1_2: +; CHECK-T1-NEXT: mov r1, r2 ; CHECK-T1-NEXT: pop {r4, pc} ; CHECK-T1-NEXT: .LBB1_3: ; CHECK-T1-NEXT: mov r0, r3 ; CHECK-T1-NEXT: cmp r4, #0 ; CHECK-T1-NEXT: bne .LBB1_2 ; CHECK-T1-NEXT: .LBB1_4: +; CHECK-T1-NEXT: mov r2, r1 ; CHECK-T1-NEXT: mov r1, r2 ; CHECK-T1-NEXT: pop {r4, pc} ; diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll index 8cfcdbd3b4467d..426e82c7ef123b 100644 --- a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll @@ -58,23 +58,23 @@ define float @test_v4f32(<4 x float> %a) nounwind { ; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: bl __aeabi_fcmpgt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: movne r5, r7 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: movne r6, r7 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __aeabi_fcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: moveq r5, r6 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: moveq r6, r5 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __aeabi_fcmpgt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: moveq r5, r4 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: moveq r6, r4 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll index 70c569e4f4781a..6872a39212d877 100644 --- a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll @@ -58,23 +58,23 @@ define float @test_v4f32(<4 x float> %a) nounwind { ; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: bl __aeabi_fcmplt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r1, r6 -; 
CHECK-NEXT: movne r5, r7 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: movne r6, r7 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __aeabi_fcmplt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: moveq r5, r6 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: moveq r6, r5 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __aeabi_fcmplt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: moveq r5, r4 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: moveq r6, r4 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) diff --git a/llvm/test/CodeGen/ARM/vselect_imax.ll b/llvm/test/CodeGen/ARM/vselect_imax.ll index 37f511fcc68cca..369abeb450b5bc 100644 --- a/llvm/test/CodeGen/ARM/vselect_imax.ll +++ b/llvm/test/CodeGen/ARM/vselect_imax.ll @@ -67,31 +67,30 @@ define void @func_blend15(ptr %loadaddr, ptr %loadaddr2, %T0_18 = type <4 x i64> %T1_18 = type <4 x i1> define void @func_blend18(ptr %loadaddr, ptr %loadaddr2, - ptr %blend, ptr %storeaddr) { ; CHECK-LABEL: func_blend18: ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]! -; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128]! +; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]! ; CHECK-NEXT: vmov r4, r6, d16 -; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] -; CHECK-NEXT: vmov lr, r12, d18 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: vmov r2, r1, d20 -; CHECK-NEXT: subs r2, r2, lr -; CHECK-NEXT: vmov r7, lr, d17 -; CHECK-NEXT: vmov r2, r5, d22 -; CHECK-NEXT: sbcs r1, r1, r12 +; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128] ; CHECK-NEXT: mov r1, #0 -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvnne r1, #0 +; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128] +; CHECK-NEXT: vmov r0, r12, d20 +; CHECK-NEXT: vmov r2, lr, d22 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: vmov r2, r5, d18 +; CHECK-NEXT: sbcs r0, lr, r12 +; CHECK-NEXT: vmov r7, lr, d17 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: subs r2, r2, r4 ; CHECK-NEXT: sbcs r6, r5, r6 -; CHECK-NEXT: vmov r2, r12, d19 -; CHECK-NEXT: vmov r5, r4, d21 +; CHECK-NEXT: vmov r2, r12, d21 +; CHECK-NEXT: vmov r5, r4, d23 ; CHECK-NEXT: mov r6, #0 ; CHECK-NEXT: movlt r6, #1 ; CHECK-NEXT: cmp r6, #0 @@ -99,25 +98,27 @@ define void @func_blend18(ptr %loadaddr, ptr %loadaddr2, ; CHECK-NEXT: subs r2, r5, r2 ; CHECK-NEXT: sbcs r4, r4, r12 ; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: vmov r4, r5, d23 +; CHECK-NEXT: vmov r4, r5, d19 ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: subs r7, r4, r7 ; CHECK-NEXT: sbcs r7, r5, lr -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mvnne r0, #0 +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mvnne r1, #0 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vdup.32 d25, r0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d24, r6 +; CHECK-NEXT: vdup.32 d25, r1 ; CHECK-NEXT: vdup.32 d27, r2 -; CHECK-NEXT: vbit q8, q11, q12 -; CHECK-NEXT: vdup.32 d26, r1 -; CHECK-NEXT: vbit q9, q10, q13 +; CHECK-NEXT: vdup.32 d24, r6 +; CHECK-NEXT: vdup.32 d26, r0 +; CHECK-NEXT: vbit q8, q9, q12 +; CHECK-NEXT: vorr q9, q13, q13 +; CHECK-NEXT: vbsl q9, q11, q10 ; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128]! 
; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128] ; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov pc, lr + ptr %blend, ptr %storeaddr) { %v0 = load %T0_18, ptr %loadaddr %v1 = load %T0_18, ptr %loadaddr2 %c = icmp slt %T0_18 %v0, %v1 @@ -131,101 +132,101 @@ define void @func_blend18(ptr %loadaddr, ptr %loadaddr2, %T0_19 = type <8 x i64> %T1_19 = type <8 x i1> define void @func_blend19(ptr %loadaddr, ptr %loadaddr2, - ptr %blend, ptr %storeaddr) { ; CHECK-LABEL: func_blend19: ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vld1.64 {d28, d29}, [r1:128]! -; CHECK-NEXT: mov lr, #0 ; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128]! -; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]! +; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]! +; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128]! +; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128]! ; CHECK-NEXT: vld1.64 {d24, d25}, [r0:128]! -; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]! -; CHECK-NEXT: vld1.64 {d26, d27}, [r0:128]! -; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128] -; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128] -; CHECK-NEXT: vmov r0, r12, d16 -; CHECK-NEXT: vmov r1, r2, d18 +; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128] +; CHECK-NEXT: vld1.64 {d26, d27}, [r0:128] +; CHECK-NEXT: vmov r0, r12, d20 +; CHECK-NEXT: vmov r1, r2, d26 ; CHECK-NEXT: subs r0, r1, r0 -; CHECK-NEXT: vmov r1, r4, d25 +; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: sbcs r0, r2, r12 -; CHECK-NEXT: mov r12, #0 -; CHECK-NEXT: vmov r2, r0, d21 -; CHECK-NEXT: movlt r12, #1 -; CHECK-NEXT: cmp r12, #0 -; CHECK-NEXT: mvnne r12, #0 -; CHECK-NEXT: subs r1, r1, r2 -; CHECK-NEXT: sbcs r0, r4, r0 -; CHECK-NEXT: vmov r2, r4, d26 +; CHECK-NEXT: vmov lr, r12, d17 +; CHECK-NEXT: vmov r2, r4, d23 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d1, r0 -; CHECK-NEXT: vmov r0, r1, d22 -; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: subs r2, r2, lr +; CHECK-NEXT: sbcs r2, r4, r12 +; CHECK-NEXT: vmov r4, lr, d24 ; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: sbcs r0, r4, r1 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: mvnne r2, #0 +; CHECK-NEXT: vdup.32 d1, r2 +; CHECK-NEXT: vmov r2, r12, d18 +; CHECK-NEXT: subs r2, r4, r2 ; CHECK-NEXT: vmov r4, r5, d31 -; CHECK-NEXT: vmov r0, r1, d29 +; CHECK-NEXT: sbcs r2, lr, r12 +; CHECK-NEXT: vmov lr, r12, d29 +; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r1 -; CHECK-NEXT: vmov r4, r5, d30 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d3, r0 -; CHECK-NEXT: vmov r0, r1, d28 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r1 -; CHECK-NEXT: vmov r4, r5, d24 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d2, r0 -; CHECK-NEXT: vmov r0, r1, d20 +; CHECK-NEXT: subs r4, r4, lr +; CHECK-NEXT: sbcs r5, r5, r12 +; CHECK-NEXT: vmov r4, lr, d30 +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: mvnne r5, #0 +; CHECK-NEXT: vdup.32 d3, r5 +; CHECK-NEXT: vmov r5, r12, d28 +; CHECK-NEXT: subs r4, r4, r5 +; CHECK-NEXT: sbcs r5, lr, r12 +; CHECK-NEXT: vmov r4, lr, d22 +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: mvnne r5, #0 +; CHECK-NEXT: vdup.32 d2, r5 +; CHECK-NEXT: vmov r5, r12, d16 ; 
CHECK-NEXT: vbit q14, q15, q1 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r1 -; CHECK-NEXT: vmov r1, r4, d17 -; CHECK-NEXT: vmov r5, r6, d19 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d0, r0 -; CHECK-NEXT: vbit q10, q12, q0 -; CHECK-NEXT: subs r1, r5, r1 -; CHECK-NEXT: sbcs r1, r6, r4 -; CHECK-NEXT: vmov r4, r5, d27 -; CHECK-NEXT: vmov r0, r1, d23 -; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: movlt r6, #1 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r1 -; CHECK-NEXT: movlt lr, #1 -; CHECK-NEXT: cmp lr, #0 -; CHECK-NEXT: mvnne lr, #0 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: vdup.32 d31, lr -; CHECK-NEXT: mvnne r6, #0 +; CHECK-NEXT: subs r4, r4, r5 +; CHECK-NEXT: sbcs r5, lr, r12 +; CHECK-NEXT: vmov lr, r12, d21 +; CHECK-NEXT: vmov r4, r6, d27 +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: mvnne r5, #0 +; CHECK-NEXT: vdup.32 d0, r5 +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: vbit q8, q11, q0 +; CHECK-NEXT: subs r4, r4, lr +; CHECK-NEXT: sbcs r6, r6, r12 +; CHECK-NEXT: vmov r4, lr, d25 +; CHECK-NEXT: vmov r6, r12, d19 +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: subs r4, r4, r6 +; CHECK-NEXT: sbcs r6, lr, r12 +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mvnne r1, #0 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: vdup.32 d31, r1 +; CHECK-NEXT: mvnne r5, #0 ; CHECK-NEXT: vdup.32 d30, r2 -; CHECK-NEXT: vdup.32 d3, r6 -; CHECK-NEXT: vbit q11, q13, q15 -; CHECK-NEXT: vdup.32 d2, r12 +; CHECK-NEXT: vdup.32 d3, r5 +; CHECK-NEXT: vbit q9, q12, q15 +; CHECK-NEXT: vdup.32 d2, r0 ; CHECK-NEXT: vst1.64 {d28, d29}, [r3:128]! -; CHECK-NEXT: vbit q8, q9, q1 -; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]! -; CHECK-NEXT: vst1.64 {d22, d23}, [r3:128]! -; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128] +; CHECK-NEXT: vbit q10, q13, q1 +; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128]! +; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128]! +; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128] ; CHECK-NEXT: pop {r4, r5, r6, lr} ; CHECK-NEXT: mov pc, lr + ptr %blend, ptr %storeaddr) { %v0 = load %T0_19, ptr %loadaddr %v1 = load %T0_19, ptr %loadaddr2 %c = icmp slt %T0_19 %v0, %v1 @@ -239,7 +240,6 @@ define void @func_blend19(ptr %loadaddr, ptr %loadaddr2, %T0_20 = type <16 x i64> %T1_20 = type <16 x i1> define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, - ptr %blend, ptr %storeaddr) { ; CHECK-LABEL: func_blend20: ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} @@ -257,10 +257,10 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ; CHECK-NEXT: vld1.64 {d20, d21}, [lr:128]! ; CHECK-NEXT: vmov r6, r4, d19 ; CHECK-NEXT: vmov r5, r7, d21 -; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128]! +; CHECK-NEXT: vld1.64 {d2, d3}, [r9:128]! ; CHECK-NEXT: vld1.64 {d6, d7}, [r10:128]! ; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]! -; CHECK-NEXT: vld1.64 {d2, d3}, [r9:128]! +; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128]! 
; CHECK-NEXT: subs r6, r5, r6 ; CHECK-NEXT: sbcs r4, r7, r4 ; CHECK-NEXT: vmov r5, r6, d18 @@ -269,19 +269,19 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vdup.32 d31, r4 +; CHECK-NEXT: vdup.32 d25, r4 ; CHECK-NEXT: subs r5, r7, r5 ; CHECK-NEXT: sbcs r2, r2, r6 -; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: vmov r4, r5, d5 ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d30, r2 +; CHECK-NEXT: vdup.32 d24, r2 ; CHECK-NEXT: vmov r0, r2, d1 ; CHECK-NEXT: subs r0, r4, r0 ; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 @@ -290,7 +290,7 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: subs r0, r4, r0 ; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r4, r5, d5 +; CHECK-NEXT: vmov r4, r5, d3 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 @@ -299,7 +299,7 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ; CHECK-NEXT: vmov r0, r2, d7 ; CHECK-NEXT: subs r0, r4, r0 ; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmov r4, r5, d2 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 @@ -322,7 +322,7 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d25, r0 +; CHECK-NEXT: vdup.32 d27, r0 ; CHECK-NEXT: vmov r0, r2, d16 ; CHECK-NEXT: subs r0, r4, r0 ; CHECK-NEXT: sbcs r0, r5, r2 @@ -330,36 +330,36 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d24, r0 -; CHECK-NEXT: vorr q13, q12, q12 -; CHECK-NEXT: vbsl q13, q11, q8 -; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]! +; CHECK-NEXT: vdup.32 d26, r0 +; CHECK-NEXT: vorr q14, q13, q13 +; CHECK-NEXT: vbsl q14, q11, q8 +; CHECK-NEXT: vld1.64 {d26, d27}, [r9:128]! ; CHECK-NEXT: vorr q8, q5, q5 -; CHECK-NEXT: vld1.64 {d28, d29}, [r10:128]! -; CHECK-NEXT: vbsl q8, q2, q3 +; CHECK-NEXT: vld1.64 {d30, d31}, [r10:128]! +; CHECK-NEXT: vbsl q8, q1, q3 ; CHECK-NEXT: vld1.64 {d6, d7}, [r8:128]! ; CHECK-NEXT: vld1.64 {d22, d23}, [r8:128] -; CHECK-NEXT: vld1.64 {d4, d5}, [lr:128]! -; CHECK-NEXT: vbif q10, q9, q15 +; CHECK-NEXT: vld1.64 {d2, d3}, [lr:128]! 
+; CHECK-NEXT: vbif q10, q9, q12 ; CHECK-NEXT: vorr q9, q4, q4 ; CHECK-NEXT: vmov r0, r2, d22 -; CHECK-NEXT: vbsl q9, q1, q0 -; CHECK-NEXT: vld1.64 {d30, d31}, [lr:128] -; CHECK-NEXT: mov lr, #0 -; CHECK-NEXT: vmov r7, r5, d30 -; CHECK-NEXT: vld1.64 {d0, d1}, [r9:128] -; CHECK-NEXT: vld1.64 {d2, d3}, [r10:128] +; CHECK-NEXT: vbsl q9, q2, q0 +; CHECK-NEXT: vld1.64 {d24, d25}, [lr:128] +; CHECK-NEXT: vmov r7, r5, d24 +; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128] +; CHECK-NEXT: vld1.64 {d8, d9}, [r10:128] ; CHECK-NEXT: subs r0, r7, r0 ; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r5, r4, d24 -; CHECK-NEXT: vmov r0, r7, d28 -; CHECK-NEXT: movlt lr, #1 -; CHECK-NEXT: cmp lr, #0 -; CHECK-NEXT: mvnne lr, #0 +; CHECK-NEXT: vmov r5, r4, d26 +; CHECK-NEXT: vmov r0, r7, d30 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: mvnne r2, #0 ; CHECK-NEXT: subs r0, r5, r0 ; CHECK-NEXT: sbcs r0, r4, r7 -; CHECK-NEXT: vmov r7, r5, d29 -; CHECK-NEXT: vmov r4, r6, d25 +; CHECK-NEXT: vmov r7, r5, d31 +; CHECK-NEXT: vmov r4, r6, d27 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 @@ -367,7 +367,7 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ; CHECK-NEXT: subs r7, r4, r7 ; CHECK-NEXT: mov r4, #0 ; CHECK-NEXT: sbcs r7, r6, r5 -; CHECK-NEXT: vmov r5, r1, d31 +; CHECK-NEXT: vmov r5, r1, d25 ; CHECK-NEXT: vmov r7, r6, d23 ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 @@ -375,66 +375,67 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ; CHECK-NEXT: subs r7, r5, r7 ; CHECK-NEXT: mov r5, #0 ; CHECK-NEXT: sbcs r1, r1, r6 -; CHECK-NEXT: vmov r6, r2, d5 -; CHECK-NEXT: vmov r1, r7, d7 +; CHECK-NEXT: vmov r6, r7, d3 +; CHECK-NEXT: vmov r1, lr, d7 ; CHECK-NEXT: movlt r5, #1 ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: mvnne r5, #0 ; CHECK-NEXT: subs r1, r6, r1 -; CHECK-NEXT: sbcs r1, r2, r7 -; CHECK-NEXT: vmov r6, r7, d4 +; CHECK-NEXT: sbcs r1, r7, lr +; CHECK-NEXT: vmov r6, r7, d2 ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: vdup.32 d9, r1 -; CHECK-NEXT: vmov r1, r2, d6 +; CHECK-NEXT: vdup.32 d1, r1 +; CHECK-NEXT: vmov r1, lr, d6 ; CHECK-NEXT: subs r1, r6, r1 -; CHECK-NEXT: sbcs r1, r7, r2 -; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: sbcs r1, r7, lr +; CHECK-NEXT: vmov r6, r7, d4 ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: vdup.32 d8, r1 -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vbif q2, q3, q4 -; CHECK-NEXT: vdup.32 d7, r5 -; CHECK-NEXT: vdup.32 d9, r4 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vdup.32 d8, r0 +; CHECK-NEXT: vdup.32 d0, r1 +; CHECK-NEXT: vmov r1, lr, d8 +; CHECK-NEXT: vbsl q0, q1, q3 +; CHECK-NEXT: vdup.32 d3, r5 +; CHECK-NEXT: vdup.32 d7, r4 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: vdup.32 d6, r0 ; CHECK-NEXT: mov r0, r3 -; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]! -; CHECK-NEXT: vbif q12, q14, q4 -; CHECK-NEXT: vdup.32 d6, lr -; CHECK-NEXT: vbit q11, q15, q3 +; CHECK-NEXT: vst1.64 {d28, d29}, [r0:128]! +; CHECK-NEXT: vbif q13, q15, q3 +; CHECK-NEXT: vdup.32 d2, r2 +; CHECK-NEXT: vbit q11, q12, q1 ; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]! 
; CHECK-NEXT: subs r1, r6, r1 -; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: sbcs r1, r7, r2 -; CHECK-NEXT: vmov r1, r2, d3 -; CHECK-NEXT: movlt r6, #1 -; CHECK-NEXT: subs r1, r4, r1 -; CHECK-NEXT: sbcs r1, r5, r2 +; CHECK-NEXT: vmov r6, r5, d5 +; CHECK-NEXT: sbcs r1, r7, lr +; CHECK-NEXT: vmov r1, r7, d9 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: subs r1, r6, r1 +; CHECK-NEXT: sbcs r1, r5, r7 ; CHECK-NEXT: movlt r12, #1 ; CHECK-NEXT: cmp r12, #0 ; CHECK-NEXT: mvnne r12, #0 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: vdup.32 d27, r12 -; CHECK-NEXT: mvnne r6, #0 -; CHECK-NEXT: vdup.32 d26, r6 -; CHECK-NEXT: vorr q10, q13, q13 -; CHECK-NEXT: vbsl q10, q0, q1 -; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]! +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: vdup.32 d29, r12 +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: vdup.32 d28, r4 +; CHECK-NEXT: vorr q10, q14, q14 +; CHECK-NEXT: vbsl q10, q2, q4 +; CHECK-NEXT: vst1.64 {d0, d1}, [r0:128]! ; CHECK-NEXT: vst1.64 {d22, d23}, [r0:128] ; CHECK-NEXT: add r0, r3, #64 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]! -; CHECK-NEXT: vst1.64 {d24, d25}, [r0:128]! +; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: mov pc, lr + ptr %blend, ptr %storeaddr) { %v0 = load %T0_20, ptr %loadaddr %v1 = load %T0_20, ptr %loadaddr2 %c = icmp slt %T0_20 %v0, %v1 diff --git a/llvm/test/CodeGen/AVR/hardware-mul.ll b/llvm/test/CodeGen/AVR/hardware-mul.ll index edfdc7e64e8f59..8b77f56d04f795 100644 --- a/llvm/test/CodeGen/AVR/hardware-mul.ll +++ b/llvm/test/CodeGen/AVR/hardware-mul.ll @@ -21,19 +21,19 @@ define i16 @mult16(i16 %a, i16 %b) { ; CHECK-NEXT: mov r25, r0 ; CHECK-NEXT: clr r1 ; CHECK-NEXT: mul r22, r24 -; CHECK-NEXT: mov r20, r0 -; CHECK-NEXT: mov r18, r1 +; CHECK-NEXT: mov r18, r0 +; CHECK-NEXT: mov r20, r1 ; CHECK-NEXT: clr r1 -; CHECK-NEXT: add r18, r25 +; CHECK-NEXT: add r20, r25 ; CHECK-NEXT: muls r23, r24 ; CHECK-NEXT: clr r1 -; CHECK-NEXT: add r18, r0 -; CHECK-NEXT: mov r19, r18 -; CHECK-NEXT: clr r18 -; CHECK-NEXT: mov r24, r20 +; CHECK-NEXT: add r20, r0 +; CHECK-NEXT: mov r21, r20 +; CHECK-NEXT: clr r20 +; CHECK-NEXT: mov r24, r18 ; CHECK-NEXT: clr r25 -; CHECK-NEXT: or r24, r18 -; CHECK-NEXT: or r25, r19 +; CHECK-NEXT: or r24, r20 +; CHECK-NEXT: or r25, r21 ; CHECK-NEXT: ret %mul = mul nsw i16 %b, %a ret i16 %mul diff --git a/llvm/test/CodeGen/CSKY/atomic-rmw.ll b/llvm/test/CodeGen/CSKY/atomic-rmw.ll index c9fd90bb8c3478..6aa54456a6cf96 100644 --- a/llvm/test/CodeGen/CSKY/atomic-rmw.ll +++ b/llvm/test/CodeGen/CSKY/atomic-rmw.ll @@ -14,7 +14,7 @@ define i8 @atomicrmw_xchg_i8_monotonic(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI0_0: ; CSKY-NEXT: .long __atomic_exchange_1 %1 = atomicrmw xchg i8* %a, i8 %b monotonic @@ -33,7 +33,7 @@ define i8 @atomicrmw_xchg_i8_acquire(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI1_0: ; CSKY-NEXT: .long __atomic_exchange_1 %1 = atomicrmw xchg i8* %a, i8 %b acquire @@ -52,7 +52,7 @@ define i8 @atomicrmw_xchg_i8_release(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI2_0: ; CSKY-NEXT: .long __atomic_exchange_1 %1 = 
atomicrmw xchg i8* %a, i8 %b release @@ -71,7 +71,7 @@ define i8 @atomicrmw_xchg_i8_acq_rel(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI3_0: ; CSKY-NEXT: .long __atomic_exchange_1 %1 = atomicrmw xchg i8* %a, i8 %b acq_rel @@ -90,7 +90,7 @@ define i8 @atomicrmw_xchg_i8_seq_cst(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI4_0: ; CSKY-NEXT: .long __atomic_exchange_1 %1 = atomicrmw xchg i8* %a, i8 %b seq_cst @@ -109,7 +109,7 @@ define i8 @atomicrmw_add_i8_monotonic(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI5_0: ; CSKY-NEXT: .long __atomic_fetch_add_1 %1 = atomicrmw add i8* %a, i8 %b monotonic @@ -128,7 +128,7 @@ define i8 @atomicrmw_add_i8_acquire(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI6_0: ; CSKY-NEXT: .long __atomic_fetch_add_1 %1 = atomicrmw add i8* %a, i8 %b acquire @@ -147,7 +147,7 @@ define i8 @atomicrmw_add_i8_release(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI7_0: ; CSKY-NEXT: .long __atomic_fetch_add_1 %1 = atomicrmw add i8* %a, i8 %b release @@ -166,7 +166,7 @@ define i8 @atomicrmw_add_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI8_0: ; CSKY-NEXT: .long __atomic_fetch_add_1 %1 = atomicrmw add i8* %a, i8 %b acq_rel @@ -185,7 +185,7 @@ define i8 @atomicrmw_add_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI9_0: ; CSKY-NEXT: .long __atomic_fetch_add_1 %1 = atomicrmw add i8* %a, i8 %b seq_cst @@ -204,7 +204,7 @@ define i8 @atomicrmw_sub_i8_monotonic(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI10_0: ; CSKY-NEXT: .long __atomic_fetch_sub_1 %1 = atomicrmw sub i8* %a, i8 %b monotonic @@ -223,7 +223,7 @@ define i8 @atomicrmw_sub_i8_acquire(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI11_0: ; CSKY-NEXT: .long __atomic_fetch_sub_1 %1 = atomicrmw sub i8* %a, i8 %b acquire @@ -242,7 +242,7 @@ define i8 @atomicrmw_sub_i8_release(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI12_0: ; CSKY-NEXT: .long __atomic_fetch_sub_1 %1 = atomicrmw sub i8* %a, i8 %b release @@ -261,7 +261,7 @@ define i8 @atomicrmw_sub_i8_acq_rel(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI13_0: ; CSKY-NEXT: .long __atomic_fetch_sub_1 %1 = atomicrmw sub i8* %a, i8 %b acq_rel @@ -280,7 +280,7 @@ define i8 @atomicrmw_sub_i8_seq_cst(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # 
%bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI14_0: ; CSKY-NEXT: .long __atomic_fetch_sub_1 %1 = atomicrmw sub i8* %a, i8 %b seq_cst @@ -299,7 +299,7 @@ define i8 @atomicrmw_and_i8_monotonic(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI15_0: ; CSKY-NEXT: .long __atomic_fetch_and_1 %1 = atomicrmw and i8* %a, i8 %b monotonic @@ -318,7 +318,7 @@ define i8 @atomicrmw_and_i8_acquire(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI16_0: ; CSKY-NEXT: .long __atomic_fetch_and_1 %1 = atomicrmw and i8* %a, i8 %b acquire @@ -337,7 +337,7 @@ define i8 @atomicrmw_and_i8_release(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI17_0: ; CSKY-NEXT: .long __atomic_fetch_and_1 %1 = atomicrmw and i8* %a, i8 %b release @@ -356,7 +356,7 @@ define i8 @atomicrmw_and_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI18_0: ; CSKY-NEXT: .long __atomic_fetch_and_1 %1 = atomicrmw and i8* %a, i8 %b acq_rel @@ -375,7 +375,7 @@ define i8 @atomicrmw_and_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI19_0: ; CSKY-NEXT: .long __atomic_fetch_and_1 %1 = atomicrmw and i8* %a, i8 %b seq_cst @@ -394,7 +394,7 @@ define i8 @atomicrmw_nand_i8_monotonic(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI20_0: ; CSKY-NEXT: .long __atomic_fetch_nand_1 %1 = atomicrmw nand i8* %a, i8 %b monotonic @@ -413,7 +413,7 @@ define i8 @atomicrmw_nand_i8_acquire(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI21_0: ; CSKY-NEXT: .long __atomic_fetch_nand_1 %1 = atomicrmw nand i8* %a, i8 %b acquire @@ -432,7 +432,7 @@ define i8 @atomicrmw_nand_i8_release(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI22_0: ; CSKY-NEXT: .long __atomic_fetch_nand_1 %1 = atomicrmw nand i8* %a, i8 %b release @@ -451,7 +451,7 @@ define i8 @atomicrmw_nand_i8_acq_rel(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI23_0: ; CSKY-NEXT: .long __atomic_fetch_nand_1 %1 = atomicrmw nand i8* %a, i8 %b acq_rel @@ -470,7 +470,7 @@ define i8 @atomicrmw_nand_i8_seq_cst(i8* %a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI24_0: ; CSKY-NEXT: .long __atomic_fetch_nand_1 %1 = atomicrmw nand i8* %a, i8 %b seq_cst @@ -489,7 +489,7 @@ define i8 @atomicrmw_or_i8_monotonic(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI25_0: ; CSKY-NEXT: .long __atomic_fetch_or_1 %1 = atomicrmw or i8* %a, i8 
%b monotonic @@ -508,7 +508,7 @@ define i8 @atomicrmw_or_i8_acquire(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI26_0: ; CSKY-NEXT: .long __atomic_fetch_or_1 %1 = atomicrmw or i8* %a, i8 %b acquire @@ -527,7 +527,7 @@ define i8 @atomicrmw_or_i8_release(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI27_0: ; CSKY-NEXT: .long __atomic_fetch_or_1 %1 = atomicrmw or i8* %a, i8 %b release @@ -546,7 +546,7 @@ define i8 @atomicrmw_or_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI28_0: ; CSKY-NEXT: .long __atomic_fetch_or_1 %1 = atomicrmw or i8* %a, i8 %b acq_rel @@ -565,7 +565,7 @@ define i8 @atomicrmw_or_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI29_0: ; CSKY-NEXT: .long __atomic_fetch_or_1 %1 = atomicrmw or i8* %a, i8 %b seq_cst @@ -584,7 +584,7 @@ define i8 @atomicrmw_xor_i8_monotonic(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI30_0: ; CSKY-NEXT: .long __atomic_fetch_xor_1 %1 = atomicrmw xor i8* %a, i8 %b monotonic @@ -603,7 +603,7 @@ define i8 @atomicrmw_xor_i8_acquire(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI31_0: ; CSKY-NEXT: .long __atomic_fetch_xor_1 %1 = atomicrmw xor i8* %a, i8 %b acquire @@ -622,7 +622,7 @@ define i8 @atomicrmw_xor_i8_release(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI32_0: ; CSKY-NEXT: .long __atomic_fetch_xor_1 %1 = atomicrmw xor i8* %a, i8 %b release @@ -641,7 +641,7 @@ define i8 @atomicrmw_xor_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI33_0: ; CSKY-NEXT: .long __atomic_fetch_xor_1 %1 = atomicrmw xor i8* %a, i8 %b acq_rel @@ -660,7 +660,7 @@ define i8 @atomicrmw_xor_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI34_0: ; CSKY-NEXT: .long __atomic_fetch_xor_1 %1 = atomicrmw xor i8* %a, i8 %b seq_cst @@ -708,7 +708,7 @@ define i8 @atomicrmw_max_i8_monotonic(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI35_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw max i8* %a, i8 %b monotonic @@ -756,7 +756,7 @@ define i8 @atomicrmw_max_i8_acquire(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI36_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw max i8* %a, i8 %b acquire @@ -804,7 +804,7 @@ define i8 @atomicrmw_max_i8_release(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; 
CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI37_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw max i8* %a, i8 %b release @@ -852,7 +852,7 @@ define i8 @atomicrmw_max_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI38_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw max i8* %a, i8 %b acq_rel @@ -900,7 +900,7 @@ define i8 @atomicrmw_max_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI39_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw max i8* %a, i8 %b seq_cst @@ -948,7 +948,7 @@ define i8 @atomicrmw_min_i8_monotonic(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI40_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw min i8* %a, i8 %b monotonic @@ -996,7 +996,7 @@ define i8 @atomicrmw_min_i8_acquire(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI41_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw min i8* %a, i8 %b acquire @@ -1044,7 +1044,7 @@ define i8 @atomicrmw_min_i8_release(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI42_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw min i8* %a, i8 %b release @@ -1092,7 +1092,7 @@ define i8 @atomicrmw_min_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI43_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw min i8* %a, i8 %b acq_rel @@ -1140,7 +1140,7 @@ define i8 @atomicrmw_min_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI44_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw min i8* %a, i8 %b seq_cst @@ -1188,7 +1188,7 @@ define i8 @atomicrmw_umax_i8_monotonic(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI45_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umax i8* %a, i8 %b monotonic @@ -1236,7 +1236,7 @@ define i8 @atomicrmw_umax_i8_acquire(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI46_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umax i8* %a, i8 %b acquire @@ -1284,7 +1284,7 @@ define i8 @atomicrmw_umax_i8_release(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI47_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umax i8* %a, i8 %b release @@ -1332,7 +1332,7 @@ define i8 @atomicrmw_umax_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: 
.LCPI48_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umax i8* %a, i8 %b acq_rel @@ -1380,7 +1380,7 @@ define i8 @atomicrmw_umax_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI49_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umax i8* %a, i8 %b seq_cst @@ -1428,7 +1428,7 @@ define i8 @atomicrmw_umin_i8_monotonic(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI50_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umin i8* %a, i8 %b monotonic @@ -1476,7 +1476,7 @@ define i8 @atomicrmw_umin_i8_acquire(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI51_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umin i8* %a, i8 %b acquire @@ -1524,7 +1524,7 @@ define i8 @atomicrmw_umin_i8_release(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI52_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umin i8* %a, i8 %b release @@ -1572,7 +1572,7 @@ define i8 @atomicrmw_umin_i8_acq_rel(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI53_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umin i8* %a, i8 %b acq_rel @@ -1620,7 +1620,7 @@ define i8 @atomicrmw_umin_i8_seq_cst(i8 *%a, i8 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI54_0: ; CSKY-NEXT: .long __atomic_compare_exchange_1 %1 = atomicrmw umin i8* %a, i8 %b seq_cst @@ -1639,7 +1639,7 @@ define i16 @atomicrmw_xchg_i16_monotonic(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI55_0: ; CSKY-NEXT: .long __atomic_exchange_2 %1 = atomicrmw xchg i16* %a, i16 %b monotonic @@ -1658,7 +1658,7 @@ define i16 @atomicrmw_xchg_i16_acquire(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI56_0: ; CSKY-NEXT: .long __atomic_exchange_2 %1 = atomicrmw xchg i16* %a, i16 %b acquire @@ -1677,7 +1677,7 @@ define i16 @atomicrmw_xchg_i16_release(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI57_0: ; CSKY-NEXT: .long __atomic_exchange_2 %1 = atomicrmw xchg i16* %a, i16 %b release @@ -1696,7 +1696,7 @@ define i16 @atomicrmw_xchg_i16_acq_rel(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI58_0: ; CSKY-NEXT: .long __atomic_exchange_2 %1 = atomicrmw xchg i16* %a, i16 %b acq_rel @@ -1715,7 +1715,7 @@ define i16 @atomicrmw_xchg_i16_seq_cst(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI59_0: ; CSKY-NEXT: .long __atomic_exchange_2 %1 
= atomicrmw xchg i16* %a, i16 %b seq_cst @@ -1734,7 +1734,7 @@ define i16 @atomicrmw_add_i16_monotonic(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI60_0: ; CSKY-NEXT: .long __atomic_fetch_add_2 %1 = atomicrmw add i16* %a, i16 %b monotonic @@ -1753,7 +1753,7 @@ define i16 @atomicrmw_add_i16_acquire(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI61_0: ; CSKY-NEXT: .long __atomic_fetch_add_2 %1 = atomicrmw add i16* %a, i16 %b acquire @@ -1772,7 +1772,7 @@ define i16 @atomicrmw_add_i16_release(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI62_0: ; CSKY-NEXT: .long __atomic_fetch_add_2 %1 = atomicrmw add i16* %a, i16 %b release @@ -1791,7 +1791,7 @@ define i16 @atomicrmw_add_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI63_0: ; CSKY-NEXT: .long __atomic_fetch_add_2 %1 = atomicrmw add i16* %a, i16 %b acq_rel @@ -1810,7 +1810,7 @@ define i16 @atomicrmw_add_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI64_0: ; CSKY-NEXT: .long __atomic_fetch_add_2 %1 = atomicrmw add i16* %a, i16 %b seq_cst @@ -1829,7 +1829,7 @@ define i16 @atomicrmw_sub_i16_monotonic(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI65_0: ; CSKY-NEXT: .long __atomic_fetch_sub_2 %1 = atomicrmw sub i16* %a, i16 %b monotonic @@ -1848,7 +1848,7 @@ define i16 @atomicrmw_sub_i16_acquire(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI66_0: ; CSKY-NEXT: .long __atomic_fetch_sub_2 %1 = atomicrmw sub i16* %a, i16 %b acquire @@ -1867,7 +1867,7 @@ define i16 @atomicrmw_sub_i16_release(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI67_0: ; CSKY-NEXT: .long __atomic_fetch_sub_2 %1 = atomicrmw sub i16* %a, i16 %b release @@ -1886,7 +1886,7 @@ define i16 @atomicrmw_sub_i16_acq_rel(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI68_0: ; CSKY-NEXT: .long __atomic_fetch_sub_2 %1 = atomicrmw sub i16* %a, i16 %b acq_rel @@ -1905,7 +1905,7 @@ define i16 @atomicrmw_sub_i16_seq_cst(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI69_0: ; CSKY-NEXT: .long __atomic_fetch_sub_2 %1 = atomicrmw sub i16* %a, i16 %b seq_cst @@ -1924,7 +1924,7 @@ define i16 @atomicrmw_and_i16_monotonic(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI70_0: ; CSKY-NEXT: .long __atomic_fetch_and_2 %1 = atomicrmw and i16* %a, i16 %b monotonic @@ -1943,7 +1943,7 @@ define i16 
@atomicrmw_and_i16_acquire(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI71_0: ; CSKY-NEXT: .long __atomic_fetch_and_2 %1 = atomicrmw and i16* %a, i16 %b acquire @@ -1962,7 +1962,7 @@ define i16 @atomicrmw_and_i16_release(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI72_0: ; CSKY-NEXT: .long __atomic_fetch_and_2 %1 = atomicrmw and i16* %a, i16 %b release @@ -1981,7 +1981,7 @@ define i16 @atomicrmw_and_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI73_0: ; CSKY-NEXT: .long __atomic_fetch_and_2 %1 = atomicrmw and i16* %a, i16 %b acq_rel @@ -2000,7 +2000,7 @@ define i16 @atomicrmw_and_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI74_0: ; CSKY-NEXT: .long __atomic_fetch_and_2 %1 = atomicrmw and i16* %a, i16 %b seq_cst @@ -2019,7 +2019,7 @@ define i16 @atomicrmw_nand_i16_monotonic(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI75_0: ; CSKY-NEXT: .long __atomic_fetch_nand_2 %1 = atomicrmw nand i16* %a, i16 %b monotonic @@ -2038,7 +2038,7 @@ define i16 @atomicrmw_nand_i16_acquire(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI76_0: ; CSKY-NEXT: .long __atomic_fetch_nand_2 %1 = atomicrmw nand i16* %a, i16 %b acquire @@ -2057,7 +2057,7 @@ define i16 @atomicrmw_nand_i16_release(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI77_0: ; CSKY-NEXT: .long __atomic_fetch_nand_2 %1 = atomicrmw nand i16* %a, i16 %b release @@ -2076,7 +2076,7 @@ define i16 @atomicrmw_nand_i16_acq_rel(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI78_0: ; CSKY-NEXT: .long __atomic_fetch_nand_2 %1 = atomicrmw nand i16* %a, i16 %b acq_rel @@ -2095,7 +2095,7 @@ define i16 @atomicrmw_nand_i16_seq_cst(i16* %a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI79_0: ; CSKY-NEXT: .long __atomic_fetch_nand_2 %1 = atomicrmw nand i16* %a, i16 %b seq_cst @@ -2114,7 +2114,7 @@ define i16 @atomicrmw_or_i16_monotonic(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI80_0: ; CSKY-NEXT: .long __atomic_fetch_or_2 %1 = atomicrmw or i16* %a, i16 %b monotonic @@ -2133,7 +2133,7 @@ define i16 @atomicrmw_or_i16_acquire(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI81_0: ; CSKY-NEXT: .long __atomic_fetch_or_2 %1 = atomicrmw or i16* %a, i16 %b acquire @@ -2152,7 +2152,7 @@ define i16 @atomicrmw_or_i16_release(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 
; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI82_0: ; CSKY-NEXT: .long __atomic_fetch_or_2 %1 = atomicrmw or i16* %a, i16 %b release @@ -2171,7 +2171,7 @@ define i16 @atomicrmw_or_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI83_0: ; CSKY-NEXT: .long __atomic_fetch_or_2 %1 = atomicrmw or i16* %a, i16 %b acq_rel @@ -2190,7 +2190,7 @@ define i16 @atomicrmw_or_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI84_0: ; CSKY-NEXT: .long __atomic_fetch_or_2 %1 = atomicrmw or i16* %a, i16 %b seq_cst @@ -2209,7 +2209,7 @@ define i16 @atomicrmw_xor_i16_monotonic(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI85_0: ; CSKY-NEXT: .long __atomic_fetch_xor_2 %1 = atomicrmw xor i16* %a, i16 %b monotonic @@ -2228,7 +2228,7 @@ define i16 @atomicrmw_xor_i16_acquire(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI86_0: ; CSKY-NEXT: .long __atomic_fetch_xor_2 %1 = atomicrmw xor i16* %a, i16 %b acquire @@ -2247,7 +2247,7 @@ define i16 @atomicrmw_xor_i16_release(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI87_0: ; CSKY-NEXT: .long __atomic_fetch_xor_2 %1 = atomicrmw xor i16* %a, i16 %b release @@ -2266,7 +2266,7 @@ define i16 @atomicrmw_xor_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI88_0: ; CSKY-NEXT: .long __atomic_fetch_xor_2 %1 = atomicrmw xor i16* %a, i16 %b acq_rel @@ -2285,7 +2285,7 @@ define i16 @atomicrmw_xor_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI89_0: ; CSKY-NEXT: .long __atomic_fetch_xor_2 %1 = atomicrmw xor i16* %a, i16 %b seq_cst @@ -2333,7 +2333,7 @@ define i16 @atomicrmw_max_i16_monotonic(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI90_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw max i16* %a, i16 %b monotonic @@ -2381,7 +2381,7 @@ define i16 @atomicrmw_max_i16_acquire(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI91_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw max i16* %a, i16 %b acquire @@ -2429,7 +2429,7 @@ define i16 @atomicrmw_max_i16_release(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI92_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw max i16* %a, i16 %b release @@ -2477,7 +2477,7 @@ define i16 @atomicrmw_max_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 
2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI93_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw max i16* %a, i16 %b acq_rel @@ -2525,7 +2525,7 @@ define i16 @atomicrmw_max_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI94_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw max i16* %a, i16 %b seq_cst @@ -2573,7 +2573,7 @@ define i16 @atomicrmw_min_i16_monotonic(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI95_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw min i16* %a, i16 %b monotonic @@ -2621,7 +2621,7 @@ define i16 @atomicrmw_min_i16_acquire(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI96_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw min i16* %a, i16 %b acquire @@ -2669,7 +2669,7 @@ define i16 @atomicrmw_min_i16_release(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI97_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw min i16* %a, i16 %b release @@ -2717,7 +2717,7 @@ define i16 @atomicrmw_min_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI98_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw min i16* %a, i16 %b acq_rel @@ -2765,7 +2765,7 @@ define i16 @atomicrmw_min_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI99_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw min i16* %a, i16 %b seq_cst @@ -2813,7 +2813,7 @@ define i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI100_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umax i16* %a, i16 %b monotonic @@ -2861,7 +2861,7 @@ define i16 @atomicrmw_umax_i16_acquire(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI101_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umax i16* %a, i16 %b acquire @@ -2909,7 +2909,7 @@ define i16 @atomicrmw_umax_i16_release(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI102_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umax i16* %a, i16 %b release @@ -2957,7 +2957,7 @@ define i16 @atomicrmw_umax_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI103_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umax i16* %a, i16 %b acq_rel @@ -3005,7 +3005,7 @@ define i16 @atomicrmw_umax_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: 
.p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI104_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umax i16* %a, i16 %b seq_cst @@ -3053,7 +3053,7 @@ define i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI105_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umin i16* %a, i16 %b monotonic @@ -3101,7 +3101,7 @@ define i16 @atomicrmw_umin_i16_acquire(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI106_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umin i16* %a, i16 %b acquire @@ -3149,7 +3149,7 @@ define i16 @atomicrmw_umin_i16_release(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI107_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umin i16* %a, i16 %b release @@ -3197,7 +3197,7 @@ define i16 @atomicrmw_umin_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI108_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umin i16* %a, i16 %b acq_rel @@ -3245,7 +3245,7 @@ define i16 @atomicrmw_umin_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI109_0: ; CSKY-NEXT: .long __atomic_compare_exchange_2 %1 = atomicrmw umin i16* %a, i16 %b seq_cst @@ -3264,7 +3264,7 @@ define i32 @atomicrmw_xchg_i32_monotonic(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI110_0: ; CSKY-NEXT: .long __atomic_exchange_4 %1 = atomicrmw xchg i32* %a, i32 %b monotonic @@ -3283,7 +3283,7 @@ define i32 @atomicrmw_xchg_i32_acquire(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI111_0: ; CSKY-NEXT: .long __atomic_exchange_4 %1 = atomicrmw xchg i32* %a, i32 %b acquire @@ -3302,7 +3302,7 @@ define i32 @atomicrmw_xchg_i32_release(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI112_0: ; CSKY-NEXT: .long __atomic_exchange_4 %1 = atomicrmw xchg i32* %a, i32 %b release @@ -3321,7 +3321,7 @@ define i32 @atomicrmw_xchg_i32_acq_rel(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI113_0: ; CSKY-NEXT: .long __atomic_exchange_4 %1 = atomicrmw xchg i32* %a, i32 %b acq_rel @@ -3340,7 +3340,7 @@ define i32 @atomicrmw_xchg_i32_seq_cst(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI114_0: ; CSKY-NEXT: .long __atomic_exchange_4 %1 = atomicrmw xchg i32* %a, i32 %b seq_cst @@ -3359,7 +3359,7 @@ define i32 @atomicrmw_add_i32_monotonic(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 
+; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI115_0: ; CSKY-NEXT: .long __atomic_fetch_add_4 %1 = atomicrmw add i32* %a, i32 %b monotonic @@ -3378,7 +3378,7 @@ define i32 @atomicrmw_add_i32_acquire(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI116_0: ; CSKY-NEXT: .long __atomic_fetch_add_4 %1 = atomicrmw add i32* %a, i32 %b acquire @@ -3397,7 +3397,7 @@ define i32 @atomicrmw_add_i32_release(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI117_0: ; CSKY-NEXT: .long __atomic_fetch_add_4 %1 = atomicrmw add i32* %a, i32 %b release @@ -3416,7 +3416,7 @@ define i32 @atomicrmw_add_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI118_0: ; CSKY-NEXT: .long __atomic_fetch_add_4 %1 = atomicrmw add i32* %a, i32 %b acq_rel @@ -3435,7 +3435,7 @@ define i32 @atomicrmw_add_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI119_0: ; CSKY-NEXT: .long __atomic_fetch_add_4 %1 = atomicrmw add i32* %a, i32 %b seq_cst @@ -3454,7 +3454,7 @@ define i32 @atomicrmw_sub_i32_monotonic(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI120_0: ; CSKY-NEXT: .long __atomic_fetch_sub_4 %1 = atomicrmw sub i32* %a, i32 %b monotonic @@ -3473,7 +3473,7 @@ define i32 @atomicrmw_sub_i32_acquire(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI121_0: ; CSKY-NEXT: .long __atomic_fetch_sub_4 %1 = atomicrmw sub i32* %a, i32 %b acquire @@ -3492,7 +3492,7 @@ define i32 @atomicrmw_sub_i32_release(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI122_0: ; CSKY-NEXT: .long __atomic_fetch_sub_4 %1 = atomicrmw sub i32* %a, i32 %b release @@ -3511,7 +3511,7 @@ define i32 @atomicrmw_sub_i32_acq_rel(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI123_0: ; CSKY-NEXT: .long __atomic_fetch_sub_4 %1 = atomicrmw sub i32* %a, i32 %b acq_rel @@ -3530,7 +3530,7 @@ define i32 @atomicrmw_sub_i32_seq_cst(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI124_0: ; CSKY-NEXT: .long __atomic_fetch_sub_4 %1 = atomicrmw sub i32* %a, i32 %b seq_cst @@ -3549,7 +3549,7 @@ define i32 @atomicrmw_and_i32_monotonic(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI125_0: ; CSKY-NEXT: .long __atomic_fetch_and_4 %1 = atomicrmw and i32* %a, i32 %b monotonic @@ -3568,7 +3568,7 @@ define i32 @atomicrmw_and_i32_acquire(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI126_0: ; CSKY-NEXT: 
.long __atomic_fetch_and_4 %1 = atomicrmw and i32* %a, i32 %b acquire @@ -3587,7 +3587,7 @@ define i32 @atomicrmw_and_i32_release(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI127_0: ; CSKY-NEXT: .long __atomic_fetch_and_4 %1 = atomicrmw and i32* %a, i32 %b release @@ -3606,7 +3606,7 @@ define i32 @atomicrmw_and_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI128_0: ; CSKY-NEXT: .long __atomic_fetch_and_4 %1 = atomicrmw and i32* %a, i32 %b acq_rel @@ -3625,7 +3625,7 @@ define i32 @atomicrmw_and_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI129_0: ; CSKY-NEXT: .long __atomic_fetch_and_4 %1 = atomicrmw and i32* %a, i32 %b seq_cst @@ -3644,7 +3644,7 @@ define i32 @atomicrmw_nand_i32_monotonic(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI130_0: ; CSKY-NEXT: .long __atomic_fetch_nand_4 %1 = atomicrmw nand i32* %a, i32 %b monotonic @@ -3663,7 +3663,7 @@ define i32 @atomicrmw_nand_i32_acquire(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI131_0: ; CSKY-NEXT: .long __atomic_fetch_nand_4 %1 = atomicrmw nand i32* %a, i32 %b acquire @@ -3682,7 +3682,7 @@ define i32 @atomicrmw_nand_i32_release(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI132_0: ; CSKY-NEXT: .long __atomic_fetch_nand_4 %1 = atomicrmw nand i32* %a, i32 %b release @@ -3701,7 +3701,7 @@ define i32 @atomicrmw_nand_i32_acq_rel(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI133_0: ; CSKY-NEXT: .long __atomic_fetch_nand_4 %1 = atomicrmw nand i32* %a, i32 %b acq_rel @@ -3720,7 +3720,7 @@ define i32 @atomicrmw_nand_i32_seq_cst(i32* %a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI134_0: ; CSKY-NEXT: .long __atomic_fetch_nand_4 %1 = atomicrmw nand i32* %a, i32 %b seq_cst @@ -3739,7 +3739,7 @@ define i32 @atomicrmw_or_i32_monotonic(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI135_0: ; CSKY-NEXT: .long __atomic_fetch_or_4 %1 = atomicrmw or i32* %a, i32 %b monotonic @@ -3758,7 +3758,7 @@ define i32 @atomicrmw_or_i32_acquire(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI136_0: ; CSKY-NEXT: .long __atomic_fetch_or_4 %1 = atomicrmw or i32* %a, i32 %b acquire @@ -3777,7 +3777,7 @@ define i32 @atomicrmw_or_i32_release(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI137_0: ; CSKY-NEXT: .long __atomic_fetch_or_4 %1 = atomicrmw or i32* %a, i32 %b 
release @@ -3796,7 +3796,7 @@ define i32 @atomicrmw_or_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI138_0: ; CSKY-NEXT: .long __atomic_fetch_or_4 %1 = atomicrmw or i32* %a, i32 %b acq_rel @@ -3815,7 +3815,7 @@ define i32 @atomicrmw_or_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI139_0: ; CSKY-NEXT: .long __atomic_fetch_or_4 %1 = atomicrmw or i32* %a, i32 %b seq_cst @@ -3834,7 +3834,7 @@ define i32 @atomicrmw_xor_i32_monotonic(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI140_0: ; CSKY-NEXT: .long __atomic_fetch_xor_4 %1 = atomicrmw xor i32* %a, i32 %b monotonic @@ -3853,7 +3853,7 @@ define i32 @atomicrmw_xor_i32_acquire(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI141_0: ; CSKY-NEXT: .long __atomic_fetch_xor_4 %1 = atomicrmw xor i32* %a, i32 %b acquire @@ -3872,7 +3872,7 @@ define i32 @atomicrmw_xor_i32_release(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI142_0: ; CSKY-NEXT: .long __atomic_fetch_xor_4 %1 = atomicrmw xor i32* %a, i32 %b release @@ -3891,7 +3891,7 @@ define i32 @atomicrmw_xor_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI143_0: ; CSKY-NEXT: .long __atomic_fetch_xor_4 %1 = atomicrmw xor i32* %a, i32 %b acq_rel @@ -3910,7 +3910,7 @@ define i32 @atomicrmw_xor_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI144_0: ; CSKY-NEXT: .long __atomic_fetch_xor_4 %1 = atomicrmw xor i32* %a, i32 %b seq_cst @@ -3954,7 +3954,7 @@ define i32 @atomicrmw_max_i32_monotonic(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI145_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw max i32* %a, i32 %b monotonic @@ -3998,7 +3998,7 @@ define i32 @atomicrmw_max_i32_acquire(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI146_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw max i32* %a, i32 %b acquire @@ -4042,7 +4042,7 @@ define i32 @atomicrmw_max_i32_release(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI147_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw max i32* %a, i32 %b release @@ -4086,7 +4086,7 @@ define i32 @atomicrmw_max_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI148_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw max i32* %a, i32 %b acq_rel @@ -4130,7 +4130,7 @@ define i32 
@atomicrmw_max_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI149_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw max i32* %a, i32 %b seq_cst @@ -4174,7 +4174,7 @@ define i32 @atomicrmw_min_i32_monotonic(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI150_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw min i32* %a, i32 %b monotonic @@ -4218,7 +4218,7 @@ define i32 @atomicrmw_min_i32_acquire(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI151_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw min i32* %a, i32 %b acquire @@ -4262,7 +4262,7 @@ define i32 @atomicrmw_min_i32_release(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI152_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw min i32* %a, i32 %b release @@ -4306,7 +4306,7 @@ define i32 @atomicrmw_min_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI153_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw min i32* %a, i32 %b acq_rel @@ -4350,7 +4350,7 @@ define i32 @atomicrmw_min_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI154_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw min i32* %a, i32 %b seq_cst @@ -4394,7 +4394,7 @@ define i32 @atomicrmw_umax_i32_monotonic(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI155_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umax i32* %a, i32 %b monotonic @@ -4438,7 +4438,7 @@ define i32 @atomicrmw_umax_i32_acquire(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI156_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umax i32* %a, i32 %b acquire @@ -4482,7 +4482,7 @@ define i32 @atomicrmw_umax_i32_release(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI157_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umax i32* %a, i32 %b release @@ -4526,7 +4526,7 @@ define i32 @atomicrmw_umax_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI158_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umax i32* %a, i32 %b acq_rel @@ -4570,7 +4570,7 @@ define i32 @atomicrmw_umax_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI159_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umax i32* %a, i32 %b seq_cst @@ -4614,7 
+4614,7 @@ define i32 @atomicrmw_umin_i32_monotonic(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI160_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umin i32* %a, i32 %b monotonic @@ -4658,7 +4658,7 @@ define i32 @atomicrmw_umin_i32_acquire(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI161_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umin i32* %a, i32 %b acquire @@ -4702,7 +4702,7 @@ define i32 @atomicrmw_umin_i32_release(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI162_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umin i32* %a, i32 %b release @@ -4746,7 +4746,7 @@ define i32 @atomicrmw_umin_i32_acq_rel(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI163_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umin i32* %a, i32 %b acq_rel @@ -4790,7 +4790,7 @@ define i32 @atomicrmw_umin_i32_seq_cst(i32 *%a, i32 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI164_0: ; CSKY-NEXT: .long __atomic_compare_exchange_4 %1 = atomicrmw umin i32* %a, i32 %b seq_cst @@ -4809,7 +4809,7 @@ define i64 @atomicrmw_xchg_i64_monotonic(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI165_0: ; CSKY-NEXT: .long __atomic_exchange_8 %1 = atomicrmw xchg i64* %a, i64 %b monotonic @@ -4828,7 +4828,7 @@ define i64 @atomicrmw_xchg_i64_acquire(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI166_0: ; CSKY-NEXT: .long __atomic_exchange_8 %1 = atomicrmw xchg i64* %a, i64 %b acquire @@ -4847,7 +4847,7 @@ define i64 @atomicrmw_xchg_i64_release(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI167_0: ; CSKY-NEXT: .long __atomic_exchange_8 %1 = atomicrmw xchg i64* %a, i64 %b release @@ -4866,7 +4866,7 @@ define i64 @atomicrmw_xchg_i64_acq_rel(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI168_0: ; CSKY-NEXT: .long __atomic_exchange_8 %1 = atomicrmw xchg i64* %a, i64 %b acq_rel @@ -4885,7 +4885,7 @@ define i64 @atomicrmw_xchg_i64_seq_cst(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI169_0: ; CSKY-NEXT: .long __atomic_exchange_8 %1 = atomicrmw xchg i64* %a, i64 %b seq_cst @@ -4904,7 +4904,7 @@ define i64 @atomicrmw_add_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI170_0: ; CSKY-NEXT: .long __atomic_fetch_add_8 %1 = atomicrmw add i64* %a, i64 %b monotonic @@ -4923,7 +4923,7 @@ 
define i64 @atomicrmw_add_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI171_0: ; CSKY-NEXT: .long __atomic_fetch_add_8 %1 = atomicrmw add i64* %a, i64 %b acquire @@ -4942,7 +4942,7 @@ define i64 @atomicrmw_add_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI172_0: ; CSKY-NEXT: .long __atomic_fetch_add_8 %1 = atomicrmw add i64* %a, i64 %b release @@ -4961,7 +4961,7 @@ define i64 @atomicrmw_add_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI173_0: ; CSKY-NEXT: .long __atomic_fetch_add_8 %1 = atomicrmw add i64* %a, i64 %b acq_rel @@ -4980,7 +4980,7 @@ define i64 @atomicrmw_add_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI174_0: ; CSKY-NEXT: .long __atomic_fetch_add_8 %1 = atomicrmw add i64* %a, i64 %b seq_cst @@ -4999,7 +4999,7 @@ define i64 @atomicrmw_sub_i64_monotonic(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI175_0: ; CSKY-NEXT: .long __atomic_fetch_sub_8 %1 = atomicrmw sub i64* %a, i64 %b monotonic @@ -5018,7 +5018,7 @@ define i64 @atomicrmw_sub_i64_acquire(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI176_0: ; CSKY-NEXT: .long __atomic_fetch_sub_8 %1 = atomicrmw sub i64* %a, i64 %b acquire @@ -5037,7 +5037,7 @@ define i64 @atomicrmw_sub_i64_release(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI177_0: ; CSKY-NEXT: .long __atomic_fetch_sub_8 %1 = atomicrmw sub i64* %a, i64 %b release @@ -5056,7 +5056,7 @@ define i64 @atomicrmw_sub_i64_acq_rel(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI178_0: ; CSKY-NEXT: .long __atomic_fetch_sub_8 %1 = atomicrmw sub i64* %a, i64 %b acq_rel @@ -5075,7 +5075,7 @@ define i64 @atomicrmw_sub_i64_seq_cst(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI179_0: ; CSKY-NEXT: .long __atomic_fetch_sub_8 %1 = atomicrmw sub i64* %a, i64 %b seq_cst @@ -5094,7 +5094,7 @@ define i64 @atomicrmw_and_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI180_0: ; CSKY-NEXT: .long __atomic_fetch_and_8 %1 = atomicrmw and i64* %a, i64 %b monotonic @@ -5113,7 +5113,7 @@ define i64 @atomicrmw_and_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI181_0: ; CSKY-NEXT: .long __atomic_fetch_and_8 %1 = atomicrmw and i64* %a, i64 %b acquire @@ -5132,7 +5132,7 @@ define i64 @atomicrmw_and_i64_release(i64 *%a, i64 %b) nounwind { ; 
CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI182_0: ; CSKY-NEXT: .long __atomic_fetch_and_8 %1 = atomicrmw and i64* %a, i64 %b release @@ -5151,7 +5151,7 @@ define i64 @atomicrmw_and_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI183_0: ; CSKY-NEXT: .long __atomic_fetch_and_8 %1 = atomicrmw and i64* %a, i64 %b acq_rel @@ -5170,7 +5170,7 @@ define i64 @atomicrmw_and_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI184_0: ; CSKY-NEXT: .long __atomic_fetch_and_8 %1 = atomicrmw and i64* %a, i64 %b seq_cst @@ -5189,7 +5189,7 @@ define i64 @atomicrmw_nand_i64_monotonic(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI185_0: ; CSKY-NEXT: .long __atomic_fetch_nand_8 %1 = atomicrmw nand i64* %a, i64 %b monotonic @@ -5208,7 +5208,7 @@ define i64 @atomicrmw_nand_i64_acquire(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI186_0: ; CSKY-NEXT: .long __atomic_fetch_nand_8 %1 = atomicrmw nand i64* %a, i64 %b acquire @@ -5227,7 +5227,7 @@ define i64 @atomicrmw_nand_i64_release(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI187_0: ; CSKY-NEXT: .long __atomic_fetch_nand_8 %1 = atomicrmw nand i64* %a, i64 %b release @@ -5246,7 +5246,7 @@ define i64 @atomicrmw_nand_i64_acq_rel(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI188_0: ; CSKY-NEXT: .long __atomic_fetch_nand_8 %1 = atomicrmw nand i64* %a, i64 %b acq_rel @@ -5265,7 +5265,7 @@ define i64 @atomicrmw_nand_i64_seq_cst(i64* %a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI189_0: ; CSKY-NEXT: .long __atomic_fetch_nand_8 %1 = atomicrmw nand i64* %a, i64 %b seq_cst @@ -5284,7 +5284,7 @@ define i64 @atomicrmw_or_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI190_0: ; CSKY-NEXT: .long __atomic_fetch_or_8 %1 = atomicrmw or i64* %a, i64 %b monotonic @@ -5303,7 +5303,7 @@ define i64 @atomicrmw_or_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI191_0: ; CSKY-NEXT: .long __atomic_fetch_or_8 %1 = atomicrmw or i64* %a, i64 %b acquire @@ -5322,7 +5322,7 @@ define i64 @atomicrmw_or_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI192_0: ; CSKY-NEXT: .long __atomic_fetch_or_8 %1 = atomicrmw or i64* %a, i64 %b release @@ -5341,7 +5341,7 @@ define i64 @atomicrmw_or_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; 
CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI193_0: ; CSKY-NEXT: .long __atomic_fetch_or_8 %1 = atomicrmw or i64* %a, i64 %b acq_rel @@ -5360,7 +5360,7 @@ define i64 @atomicrmw_or_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI194_0: ; CSKY-NEXT: .long __atomic_fetch_or_8 %1 = atomicrmw or i64* %a, i64 %b seq_cst @@ -5379,7 +5379,7 @@ define i64 @atomicrmw_xor_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI195_0: ; CSKY-NEXT: .long __atomic_fetch_xor_8 %1 = atomicrmw xor i64* %a, i64 %b monotonic @@ -5398,7 +5398,7 @@ define i64 @atomicrmw_xor_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI196_0: ; CSKY-NEXT: .long __atomic_fetch_xor_8 %1 = atomicrmw xor i64* %a, i64 %b acquire @@ -5417,7 +5417,7 @@ define i64 @atomicrmw_xor_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI197_0: ; CSKY-NEXT: .long __atomic_fetch_xor_8 %1 = atomicrmw xor i64* %a, i64 %b release @@ -5436,7 +5436,7 @@ define i64 @atomicrmw_xor_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI198_0: ; CSKY-NEXT: .long __atomic_fetch_xor_8 %1 = atomicrmw xor i64* %a, i64 %b acq_rel @@ -5455,7 +5455,7 @@ define i64 @atomicrmw_xor_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.1: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI199_0: ; CSKY-NEXT: .long __atomic_fetch_xor_8 %1 = atomicrmw xor i64* %a, i64 %b seq_cst @@ -5482,16 +5482,16 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -5521,7 +5521,7 @@ define i64 @atomicrmw_max_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI200_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw max i64* %a, i64 %b monotonic @@ -5548,16 +5548,16 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: 
ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -5587,7 +5587,7 @@ define i64 @atomicrmw_max_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI201_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw max i64* %a, i64 %b acquire @@ -5616,16 +5616,16 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -5656,7 +5656,7 @@ define i64 @atomicrmw_max_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI202_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw max i64* %a, i64 %b release @@ -5685,16 +5685,16 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -5725,7 +5725,7 @@ define i64 @atomicrmw_max_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI203_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw max i64* %a, i64 %b acq_rel @@ -5752,16 +5752,16 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -5791,7 +5791,7 @@ define i64 @atomicrmw_max_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI204_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = 
atomicrmw max i64* %a, i64 %b seq_cst @@ -5818,16 +5818,16 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a2, a3 ; CSKY-NEXT: btsti16 a2, 0 @@ -5857,7 +5857,7 @@ define i64 @atomicrmw_min_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI205_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw min i64* %a, i64 %b monotonic @@ -5884,16 +5884,16 @@ define i64 @atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a2, a3 ; CSKY-NEXT: btsti16 a2, 0 @@ -5923,7 +5923,7 @@ define i64 @atomicrmw_min_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI206_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw min i64* %a, i64 %b acquire @@ -5952,16 +5952,16 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a2, a3 ; CSKY-NEXT: btsti16 a2, 0 @@ -5992,7 +5992,7 @@ define i64 @atomicrmw_min_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI207_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw min i64* %a, i64 %b release @@ -6021,16 +6021,16 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; 
CSKY-NEXT: ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a2, a3 ; CSKY-NEXT: btsti16 a2, 0 @@ -6061,7 +6061,7 @@ define i64 @atomicrmw_min_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI208_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw min i64* %a, i64 %b acq_rel @@ -6088,16 +6088,16 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: # =>This Inner Loop Header: Depth=1 ; CSKY-NEXT: cmphs16 l1, a0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 16) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: cmplt16 l0, a1 ; CSKY-NEXT: mvcv16 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 16) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 16) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a2, a3 ; CSKY-NEXT: btsti16 a2, 0 @@ -6127,7 +6127,7 @@ define i64 @atomicrmw_min_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI209_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw min i64* %a, i64 %b seq_cst @@ -6189,7 +6189,7 @@ define i64 @atomicrmw_umax_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI210_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umax i64* %a, i64 %b monotonic @@ -6251,7 +6251,7 @@ define i64 @atomicrmw_umax_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI211_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umax i64* %a, i64 %b acquire @@ -6316,7 +6316,7 @@ define i64 @atomicrmw_umax_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI212_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umax i64* %a, i64 %b release @@ -6381,7 +6381,7 @@ define i64 @atomicrmw_umax_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI213_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umax i64* %a, i64 %b acq_rel @@ -6443,7 +6443,7 @@ define i64 @atomicrmw_umax_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI214_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umax i64* %a, i64 %b seq_cst @@ -6473,17 +6473,17 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmphs16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 20) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 20) +; CSKY-NEXT: st16.w a2, (sp, 
12) ; CSKY-NEXT: ld16.w a2, (sp, 16) ; CSKY-NEXT: btsti16 a2, 0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 20) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 20) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -6513,7 +6513,7 @@ define i64 @atomicrmw_umin_i64_monotonic(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI215_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umin i64* %a, i64 %b monotonic @@ -6543,17 +6543,17 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmphs16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 20) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 20) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: ld16.w a2, (sp, 16) ; CSKY-NEXT: btsti16 a2, 0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 20) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 20) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -6583,7 +6583,7 @@ define i64 @atomicrmw_umin_i64_acquire(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI216_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umin i64* %a, i64 %b acquire @@ -6615,17 +6615,17 @@ define i64 @atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmphs16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 20) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 20) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: ld16.w a2, (sp, 16) ; CSKY-NEXT: btsti16 a2, 0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 20) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 20) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -6656,7 +6656,7 @@ define i64 @atomicrmw_umin_i64_release(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI217_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umin i64* %a, i64 %b release @@ -6688,17 +6688,17 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmphs16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 20) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 20) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: ld16.w a2, (sp, 16) ; CSKY-NEXT: btsti16 a2, 0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 20) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 20) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -6729,7 +6729,7 @@ define i64 @atomicrmw_umin_i64_acq_rel(i64 *%a, i64 %b) nounwind { ; 
CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI218_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umin i64* %a, i64 %b acq_rel @@ -6759,17 +6759,17 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: st16.w a2, (sp, 16) ; CSKY-NEXT: cmphs16 l0, a1 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 12) +; CSKY-NEXT: st16.w a2, (sp, 20) ; CSKY-NEXT: cmpne16 a1, l0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: st16.w a2, (sp, 20) +; CSKY-NEXT: st16.w a2, (sp, 12) ; CSKY-NEXT: ld16.w a2, (sp, 16) ; CSKY-NEXT: btsti16 a2, 0 ; CSKY-NEXT: mvc32 a2 -; CSKY-NEXT: ld16.w a3, (sp, 12) +; CSKY-NEXT: ld16.w a3, (sp, 20) ; CSKY-NEXT: btsti16 a3, 0 ; CSKY-NEXT: mvc32 a3 -; CSKY-NEXT: ld32.w t0, (sp, 20) +; CSKY-NEXT: ld32.w t0, (sp, 12) ; CSKY-NEXT: btsti32 t0, 0 ; CSKY-NEXT: movf32 a3, a2 ; CSKY-NEXT: btsti16 a3, 0 @@ -6799,7 +6799,7 @@ define i64 @atomicrmw_umin_i64_seq_cst(i64 *%a, i64 %b) nounwind { ; CSKY-NEXT: rts16 ; CSKY-NEXT: .p2align 1 ; CSKY-NEXT: # %bb.3: -; CSKY-NEXT: .p2align 2 +; CSKY-NEXT: .p2align 2, 0x0 ; CSKY-NEXT: .LCPI219_0: ; CSKY-NEXT: .long __atomic_compare_exchange_8 %1 = atomicrmw umin i64* %a, i64 %b seq_cst diff --git a/llvm/test/CodeGen/CSKY/br.ll b/llvm/test/CodeGen/CSKY/br.ll index d3ca7ef22a7b35..23ff9e170b2fa7 100644 --- a/llvm/test/CodeGen/CSKY/br.ll +++ b/llvm/test/CodeGen/CSKY/br.ll @@ -2023,16 +2023,16 @@ define i64 @brRR_i64_sgt(i64 %x, i64 %y) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmphs16 a0, a2 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 4) +; CHECK-NEXT: st16.w a0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 8) +; CHECK-NEXT: st16.w a0, (sp, 4) ; CHECK-NEXT: cmplt16 a1, a3 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: btsti16 a0, 0 @@ -2095,16 +2095,16 @@ define i64 @brRI_i64_sgt(i64 %x) { ; CHECK-NEXT: movi16 a2, 0 ; CHECK-NEXT: cmplt16 a1, a2 ; CHECK-NEXT: mvc32 a2 -; CHECK-NEXT: st16.w a2, (sp, 4) +; CHECK-NEXT: st16.w a2, (sp, 8) ; CHECK-NEXT: cmpnei16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphsi16 a0, 11 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a1, a0 ; CHECK-NEXT: btsti16 a1, 0 @@ -2168,16 +2168,16 @@ define i64 @brR0_i64_sgt(i64 %x) { ; CHECK-NEXT: movi16 a2, 0 ; CHECK-NEXT: cmplt16 a1, a2 ; CHECK-NEXT: mvc32 a2 -; CHECK-NEXT: st16.w a2, (sp, 4) +; CHECK-NEXT: st16.w a2, (sp, 8) ; CHECK-NEXT: cmpnei16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmpnei16 a0, 0 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a1, a0 ; CHECK-NEXT: btsti16 a1, 0 @@ -2241,16 +2241,16 @@ define i64 @brRR_i64_sge(i64 %x, i64 %y) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmplt16 a3, 
a1 ; CHECK-NEXT: mvc32 t0 -; CHECK-NEXT: st32.w t0, (sp, 4) +; CHECK-NEXT: st32.w t0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphs16 a2, a0 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a1, a0 ; CHECK-NEXT: btsti16 a1, 0 @@ -2318,16 +2318,16 @@ define i64 @brRI_i64_sge(i64 %x) { ; CHECK-NEXT: movi16 a2, 0 ; CHECK-NEXT: cmplt16 a1, a2 ; CHECK-NEXT: mvc32 a2 -; CHECK-NEXT: st16.w a2, (sp, 4) +; CHECK-NEXT: st16.w a2, (sp, 8) ; CHECK-NEXT: cmpnei16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphsi16 a0, 10 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a1, a0 ; CHECK-NEXT: btsti16 a1, 0 @@ -2429,16 +2429,16 @@ define i64 @brRR_i64_slt(i64 %x, i64 %y) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmphs16 a2, a0 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 4) +; CHECK-NEXT: st16.w a0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 8) +; CHECK-NEXT: st16.w a0, (sp, 4) ; CHECK-NEXT: cmplt16 a3, a1 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: btsti16 a0, 0 @@ -2623,16 +2623,16 @@ define i64 @brRR_i64_sle(i64 %x, i64 %y) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmplt16 a1, a3 ; CHECK-NEXT: mvc32 t0 -; CHECK-NEXT: st32.w t0, (sp, 4) +; CHECK-NEXT: st32.w t0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphs16 a0, a2 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a1, a0 ; CHECK-NEXT: btsti16 a1, 0 diff --git a/llvm/test/CodeGen/CSKY/cmp-i.ll b/llvm/test/CodeGen/CSKY/cmp-i.ll index 38642d4f9de98e..aa5de0510ce773 100644 --- a/llvm/test/CodeGen/CSKY/cmp-i.ll +++ b/llvm/test/CodeGen/CSKY/cmp-i.ll @@ -2033,16 +2033,16 @@ define i1 @ICMP_LONG_sgt(i64 %x, i64 %y) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmplt16 a1, a3 ; CHECK-NEXT: mvc32 t0 -; CHECK-NEXT: st32.w t0, (sp, 4) +; CHECK-NEXT: st32.w t0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphs16 a0, a2 ; CHECK-NEXT: mvcv16 a1 -; CHECK-NEXT: ld16.w a0, (sp, 4) +; CHECK-NEXT: ld16.w a0, (sp, 8) ; CHECK-NEXT: btsti16 a0, 0 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: addi16 sp, sp, 12 @@ -2366,16 +2366,16 @@ define i1 @ICMP_LONG_sge(i64 %x, i64 %y) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmphs16 a2, a0 ; CHECK-NEXT: mvc32 a0 -; 
CHECK-NEXT: st16.w a0, (sp, 4) +; CHECK-NEXT: st16.w a0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 8) +; CHECK-NEXT: st16.w a0, (sp, 4) ; CHECK-NEXT: cmplt16 a3, a1 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: addi16 sp, sp, 12 @@ -2697,16 +2697,16 @@ define i1 @ICMP_LONG_slt(i64 %x, i64 %y) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmplt16 a3, a1 ; CHECK-NEXT: mvc32 t0 -; CHECK-NEXT: st32.w t0, (sp, 4) +; CHECK-NEXT: st32.w t0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphs16 a2, a0 ; CHECK-NEXT: mvcv16 a1 -; CHECK-NEXT: ld16.w a0, (sp, 4) +; CHECK-NEXT: ld16.w a0, (sp, 8) ; CHECK-NEXT: btsti16 a0, 0 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: addi16 sp, sp, 12 @@ -2750,16 +2750,16 @@ define i1 @ICMP_LONG_I_slt(i64 %x) { ; CHECK-NEXT: movi16 a2, 0 ; CHECK-NEXT: cmplt16 a1, a2 ; CHECK-NEXT: mvc32 a2 -; CHECK-NEXT: st16.w a2, (sp, 4) +; CHECK-NEXT: st16.w a2, (sp, 8) ; CHECK-NEXT: cmpnei16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmpnei16 a0, 0 ; CHECK-NEXT: mvcv16 a1 -; CHECK-NEXT: ld16.w a0, (sp, 4) +; CHECK-NEXT: ld16.w a0, (sp, 8) ; CHECK-NEXT: btsti16 a0, 0 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: addi16 sp, sp, 12 @@ -3024,16 +3024,16 @@ define i1 @ICMP_LONG_sle(i64 %x, i64 %y) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmphs16 a0, a2 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 4) +; CHECK-NEXT: st16.w a0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 8) +; CHECK-NEXT: st16.w a0, (sp, 4) ; CHECK-NEXT: cmplt16 a1, a3 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: addi16 sp, sp, 12 @@ -3072,16 +3072,16 @@ define i1 @ICMP_LONG_I_sle(i64 %x) { ; CHECK-NEXT: movi16 a2, 0 ; CHECK-NEXT: cmplt16 a1, a2 ; CHECK-NEXT: mvc32 a2 -; CHECK-NEXT: st16.w a2, (sp, 4) +; CHECK-NEXT: st16.w a2, (sp, 8) ; CHECK-NEXT: cmpnei16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphsi16 a0, 2 ; CHECK-NEXT: mvcv16 a1 -; CHECK-NEXT: ld16.w a0, (sp, 4) +; CHECK-NEXT: ld16.w a0, (sp, 8) ; CHECK-NEXT: btsti16 a0, 0 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: addi16 sp, sp, 12 diff --git a/llvm/test/CodeGen/CSKY/select.ll b/llvm/test/CodeGen/CSKY/select.ll index 65488850d4bab9..e146df7644f90e 100644 --- a/llvm/test/CodeGen/CSKY/select.ll +++ b/llvm/test/CodeGen/CSKY/select.ll @@ -4995,16 +4995,16 @@ define i64 @selectRR_sgt_i64(i64 %x, i64 %y, i64 %n, i64 %m) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmplt16 a1, a3 ; CHECK-NEXT: mvc32 t0 -; 
CHECK-NEXT: st32.w t0, (sp, 4) +; CHECK-NEXT: st32.w t0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphs16 a0, a2 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a1, a0 ; CHECK-NEXT: addi16 a2, sp, 12 @@ -5836,16 +5836,16 @@ define i64 @selectRR_sge_i64(i64 %x, i64 %y, i64 %n, i64 %m) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmphs16 a2, a0 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 4) +; CHECK-NEXT: st16.w a0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 8) +; CHECK-NEXT: st16.w a0, (sp, 4) ; CHECK-NEXT: cmplt16 a3, a1 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: addi16 a1, sp, 12 @@ -6663,16 +6663,16 @@ define i64 @selectRR_slt_i64(i64 %x, i64 %y, i64 %n, i64 %m) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmplt16 a3, a1 ; CHECK-NEXT: mvc32 t0 -; CHECK-NEXT: st32.w t0, (sp, 4) +; CHECK-NEXT: st32.w t0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphs16 a2, a0 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a1, a0 ; CHECK-NEXT: addi16 a2, sp, 12 @@ -6740,16 +6740,16 @@ define i64 @selectRI_slt_i64(i64 %x, i64 %n, i64 %m) { ; CHECK-NEXT: movi16 l0, 0 ; CHECK-NEXT: cmplt16 a1, l0 ; CHECK-NEXT: mvc32 l0 -; CHECK-NEXT: st16.w l0, (sp, 4) +; CHECK-NEXT: st16.w l0, (sp, 8) ; CHECK-NEXT: cmpnei16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphsi16 a0, 10 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w l0, (sp, 8) +; CHECK-NEXT: ld16.w l0, (sp, 4) ; CHECK-NEXT: btsti16 l0, 0 ; CHECK-NEXT: movf32 a1, a0 ; CHECK-NEXT: btsti16 a1, 0 @@ -7489,16 +7489,16 @@ define i64 @selectRR_sle_i64(i64 %x, i64 %y, i64 %n, i64 %m) { ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: cmphs16 a0, a2 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 4) +; CHECK-NEXT: st16.w a0, (sp, 8) ; CHECK-NEXT: cmpne16 a3, a1 ; CHECK-NEXT: mvc32 a0 -; CHECK-NEXT: st16.w a0, (sp, 8) +; CHECK-NEXT: st16.w a0, (sp, 4) ; CHECK-NEXT: cmplt16 a1, a3 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w a2, (sp, 8) +; CHECK-NEXT: ld16.w a2, (sp, 4) ; CHECK-NEXT: btsti16 a2, 0 ; CHECK-NEXT: movf32 a0, a1 ; CHECK-NEXT: addi16 a1, sp, 12 @@ -7561,16 +7561,16 @@ define i64 @selectRI_sle_i64(i64 %x, i64 %n, i64 %m) { ; CHECK-NEXT: movi16 l0, 0 ; CHECK-NEXT: cmplt16 a1, l0 ; CHECK-NEXT: mvc32 l0 -; CHECK-NEXT: st16.w l0, (sp, 4) +; CHECK-NEXT: st16.w l0, (sp, 8) ; CHECK-NEXT: cmpnei16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: 
st16.w a1, (sp, 8) +; CHECK-NEXT: st16.w a1, (sp, 4) ; CHECK-NEXT: cmphsi16 a0, 11 ; CHECK-NEXT: mvcv16 a0 -; CHECK-NEXT: ld16.w a1, (sp, 4) +; CHECK-NEXT: ld16.w a1, (sp, 8) ; CHECK-NEXT: btsti16 a1, 0 ; CHECK-NEXT: mvc32 a1 -; CHECK-NEXT: ld16.w l0, (sp, 8) +; CHECK-NEXT: ld16.w l0, (sp, 4) ; CHECK-NEXT: btsti16 l0, 0 ; CHECK-NEXT: movf32 a1, a0 ; CHECK-NEXT: btsti16 a1, 0 diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll index 9d7570b9a929ed..6bf1a5ed214545 100644 --- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll @@ -350,16 +350,16 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: r13:12 = add(r5:4,r7:6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: p1 = cmp.gtu(r5:4,r3:2) -; CHECK-NEXT: p0 = cmp.eq(r5:4,r9:8) +; CHECK-NEXT: p0 = cmp.gtu(r5:4,r3:2) +; CHECK-NEXT: p1 = cmp.eq(r5:4,r9:8) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r1 = mux(p1,r2,r12) -; CHECK-NEXT: r14 = mux(p1,r3,r13) +; CHECK-NEXT: r1 = mux(p0,r2,r12) +; CHECK-NEXT: r14 = mux(p0,r3,r13) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r10 = mux(p0,r2,r1) -; CHECK-NEXT: r11 = mux(p0,r3,r14) +; CHECK-NEXT: r10 = mux(p1,r2,r1) +; CHECK-NEXT: r11 = mux(p1,r3,r14) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: memd_locked(r0,p0) = r11:10 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll index 66db73f5c69f63..539d717a85e393 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll @@ -491,136 +491,136 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r4 = ##-2147483648 -; CHECK-NEXT: r3:2 = combine(#1,#8) -; CHECK-NEXT: v5 = vmem(r0+#0) +; CHECK-NEXT: r3:2 = combine(##-2147483648,#8) +; CHECK-NEXT: r4 = #1 +; CHECK-NEXT: v4 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vsplat(r4) +; CHECK-NEXT: v3 = vsplat(r3) ; CHECK-NEXT: r7 = #30 ; CHECK-NEXT: r6 = #24 -; CHECK-NEXT: v2 = vmem(r0+#2) +; CHECK-NEXT: v1 = vmem(r0+#2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r7) ; CHECK-NEXT: r5 = #32 -; CHECK-NEXT: v8.w = vasl(v4.w,r3) -; CHECK-NEXT: v4.cur = vmem(r0+#1) +; CHECK-NEXT: v7.w = vasl(v4.w,r4) +; CHECK-NEXT: v6 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v5.w,r3) +; CHECK-NEXT: v8.w = vasl(v6.w,r4) ; CHECK-NEXT: v12 = vxor(v12,v12) -; CHECK-NEXT: v8.w = vsub(v8.w,v1.w) +; CHECK-NEXT: v7.w = vsub(v7.w,v3.w) ; CHECK-NEXT: v0 = vmem(r0+#3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) -; CHECK-NEXT: v11.w = vasl(v0.w,r3) -; CHECK-NEXT: v7.w = vsub(v7.w,v1.w) -; CHECK-NEXT: q0 = vcmp.gt(v12.w,v5.w) +; CHECK-NEXT: v11.w = vasl(v0.w,r4) +; CHECK-NEXT: v8.w = vsub(v8.w,v3.w) +; CHECK-NEXT: q0 = vcmp.gt(v12.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v2.w,r3) -; CHECK-NEXT: q1 = vcmp.gt(v12.w,v4.w) -; CHECK-NEXT: v11.w = vsub(v11.w,v1.w) +; CHECK-NEXT: v9.w = vasl(v1.w,r4) +; CHECK-NEXT: q1 = vcmp.gt(v12.w,v6.w) +; CHECK-NEXT: v11.w = vsub(v11.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r3 = ##2147483647 +; CHECK-NEXT: r4 = ##2147483647 ; CHECK-NEXT: r7 = #64 -; CHECK-NEXT: v8.w = vasr(v8.w,r6) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v22 = vsplat(r3) ; CHECK-NEXT: v7.w = vasr(v7.w,r6) -; CHECK-NEXT: v19.w = vsub(v9.w,v1.w) -; CHECK-NEXT: v8.w = 
vsub(v10.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.w = vasl(v4.w,r2) -; CHECK-NEXT: v27 = vmux(q1,v1,v22) -; CHECK-NEXT: v25 = vmux(q0,v1,v22) +; CHECK-NEXT: v20 = vsplat(r4) +; CHECK-NEXT: v8.w = vasr(v8.w,r6) +; CHECK-NEXT: v17.w = vsub(v9.w,v3.w) ; CHECK-NEXT: v7.w = vsub(v10.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vasl(v5.w,r2) +; CHECK-NEXT: v5.w = vasl(v4.w,r2) +; CHECK-NEXT: v23 = vmux(q0,v3,v20) +; CHECK-NEXT: v25 = vmux(q1,v3,v20) +; CHECK-NEXT: v8.w = vsub(v10.w,v8.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v18.w = vasl(v6.w,r2) +; CHECK-NEXT: v19.w = vmin(v7.w,v13.w) +; CHECK-NEXT: v5 = vor(v5,v3) ; CHECK-NEXT: v8.w = vmin(v8.w,v13.w) -; CHECK-NEXT: v9 = vor(v20,v1) -; CHECK-NEXT: v21.w = vmin(v7.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasr(v19.w,r6) +; CHECK-NEXT: v4.w = vasr(v17.w,r6) +; CHECK-NEXT: q2 = vcmp.gt(v19.w,v12.w) +; CHECK-NEXT: v9 = vor(v18,v3) ; CHECK-NEXT: q3 = vcmp.gt(v8.w,v12.w) -; CHECK-NEXT: v6 = vor(v6,v1) -; CHECK-NEXT: q2 = vcmp.gt(v21.w,v12.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.w = vasr(v11.w,r6) -; CHECK-NEXT: v5.w = vsub(v10.w,v5.w) +; CHECK-NEXT: v4.w = vsub(v10.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vasl(v2.w,r2) +; CHECK-NEXT: v2.w = vasl(v1.w,r2) ; CHECK-NEXT: v10.w = vsub(v10.w,v11.w) -; CHECK-NEXT: v5.w = vmin(v5.w,v13.w) +; CHECK-NEXT: v4.w = vmin(v4.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.w = vasl(v0.w,r2) -; CHECK-NEXT: v3 = vor(v3,v1) +; CHECK-NEXT: v21.w = vasl(v0.w,r2) +; CHECK-NEXT: v2 = vor(v2,v3) ; CHECK-NEXT: v10.w = vmin(v10.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vlsr(v9.w,v8.w) -; CHECK-NEXT: v4 = vor(v23,v1) +; CHECK-NEXT: v5.w = vlsr(v5.w,v19.w) +; CHECK-NEXT: v6 = vor(v21,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vlsr(v6.w,v21.w) -; CHECK-NEXT: v26.w = vsub(v12.w,v8.w) +; CHECK-NEXT: v8.w = vlsr(v9.w,v8.w) +; CHECK-NEXT: v22.w = vsub(v12.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vlsr(v3.w,v5.w) -; CHECK-NEXT: v24.w = vsub(v12.w,v6.w) -; CHECK-NEXT: v8 = vmux(q1,v26,v8) +; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w) +; CHECK-NEXT: v24.w = vsub(v12.w,v8.w) +; CHECK-NEXT: v5 = vmux(q0,v22,v5) +; CHECK-NEXT: q0 = vcmp.gt(v12.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vlsr(v4.w,v10.w) -; CHECK-NEXT: v6 = vmux(q0,v24,v6) -; CHECK-NEXT: q0 = vcmp.gt(v12.w,v2.w) -; CHECK-NEXT: v28.w = vsub(v12.w,v3.w) +; CHECK-NEXT: v6.w = vlsr(v6.w,v10.w) +; CHECK-NEXT: v8 = vmux(q1,v24,v8) +; CHECK-NEXT: v26.w = vsub(v12.w,v2.w) +; CHECK-NEXT: v5 = vmux(q2,v5,v23) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vmux(q3,v8,v27) -; CHECK-NEXT: v29.w = vsub(v12.w,v4.w) +; CHECK-NEXT: v1 = vmux(q3,v8,v25) +; CHECK-NEXT: v27.w = vsub(v12.w,v6.w) ; CHECK-NEXT: q3 = vcmp.gt(v12.w,v0.w) -; CHECK-NEXT: v6 = vmux(q2,v6,v25) +; CHECK-NEXT: v28 = vmux(q0,v3,v20) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q0,v1,v22) -; CHECK-NEXT: v3 = vmux(q0,v28,v3) -; CHECK-NEXT: q2 = vcmp.gt(v5.w,v12.w) -; CHECK-NEXT: v4 = vmux(q3,v29,v4) +; CHECK-NEXT: v2 = vmux(q0,v26,v2) +; CHECK-NEXT: q2 = vcmp.gt(v4.w,v12.w) +; CHECK-NEXT: v29 = vmux(q3,v27,v6) +; CHECK-NEXT: v3 = vmux(q3,v3,v20) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vpack(v2.w,v6.w):sat -; CHECK-NEXT: v1 = vmux(q3,v1,v22) +; CHECK-NEXT: v1.h = vpack(v1.w,v5.w):sat ; CHECK-NEXT: q3 = vcmp.gt(v10.w,v12.w) -; CHECK-NEXT: v0 = vmux(q2,v3,v30) +; CHECK-NEXT: v0 = vmux(q2,v2,v28) ; CHECK-NEXT: } ; 
CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q3,v4,v1) +; CHECK-NEXT: v30 = vmux(q3,v29,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.h = vpack(v1.w,v0.w):sat +; CHECK-NEXT: v31.h = vpack(v1.w,v0.w):sat ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.h = vpack(v1.w,v0.w):sat +; CHECK-NEXT: v0.h = vpack(v30.w,v0.w):sat ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v31.b = vpack(v3.h,v2.h):sat +; CHECK-NEXT: v1.b = vpack(v31.h,v1.h):sat ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.b = vpack(v3.h,v0.h):sat +; CHECK-NEXT: v0.b = vpack(v31.h,v0.h):sat ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1:0 = vshuff(v0,v31,r7) +; CHECK-NEXT: v1:0 = vshuff(v0,v1,r7) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } @@ -1547,110 +1547,108 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r3:2 = combine(#8,#1) -; CHECK-NEXT: r4 = ##-2147483648 -; CHECK-NEXT: v5 = vmem(r0+#0) +; CHECK-NEXT: r3:2 = combine(##-2147483648,#8) +; CHECK-NEXT: r4 = #1 +; CHECK-NEXT: v4 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vsplat(r4) +; CHECK-NEXT: v3 = vsplat(r3) ; CHECK-NEXT: r5 = #30 ; CHECK-NEXT: r6 = #24 -; CHECK-NEXT: v2 = vmem(r0+#1) +; CHECK-NEXT: v6 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14 = vsplat(r5) -; CHECK-NEXT: r4 = #32 -; CHECK-NEXT: v8.w = vasl(v5.w,r2) +; CHECK-NEXT: v8.w = vasl(v4.w,r4) +; CHECK-NEXT: v13 = vxor(v13,v13) ; CHECK-NEXT: v0 = vmem(r0+#3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v2.w,r2) -; CHECK-NEXT: v13 = vxor(v13,v13) +; CHECK-NEXT: r7 = #64 +; CHECK-NEXT: v9.w = vasl(v6.w,r4) ; CHECK-NEXT: v8.w = vsub(v8.w,v3.w) ; CHECK-NEXT: v1 = vmem(r0+#2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20 = vsplat(r4) -; CHECK-NEXT: v12.w = vasl(v0.w,r2) +; CHECK-NEXT: v12.w = vasl(v0.w,r4) +; CHECK-NEXT: q0 = vcmp.gt(v13.w,v4.w) ; CHECK-NEXT: v9.w = vsub(v9.w,v3.w) -; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w) +; CHECK-NEXT: q3 = vcmp.gt(v13.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasl(v1.w,r2) -; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v11.w = vasl(v1.w,r4) ; CHECK-NEXT: v12.w = vsub(v12.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r2 = ##2147483647 -; CHECK-NEXT: r7 = #64 -; CHECK-NEXT: v11.w = vsub(v11.w,v3.w) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v22 = vsplat(r2) +; CHECK-NEXT: v21 = vsplat(r4) ; CHECK-NEXT: v8.w = vasr(v8.w,r6) +; CHECK-NEXT: v11.w = vsub(v11.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.w = vasr(v9.w,r6) ; CHECK-NEXT: v8.w = vsub(v14.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vasl(v5.w,r3) +; CHECK-NEXT: v5.w = vasl(v4.w,r2) ; CHECK-NEXT: v9.w = vsub(v14.w,v9.w) -; CHECK-NEXT: v8.w = vmin(v8.w,v20.w) +; CHECK-NEXT: v8.w = vmin(v8.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v2.w,r3) -; CHECK-NEXT: v6 = vor(v6,v3) -; CHECK-NEXT: v9.w = vmin(v9.w,v20.w) +; CHECK-NEXT: v7.w = vasl(v6.w,r2) +; CHECK-NEXT: v5 = vor(v5,v3) +; CHECK-NEXT: v9.w = vmin(v9.w,v21.w) ; CHECK-NEXT: q1 = vcmp.gt(v13.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v19.w = vasr(v11.w,r6) +; CHECK-NEXT: v20.w = vasr(v11.w,r6) ; CHECK-NEXT: v7 = vor(v7,v3) ; CHECK-NEXT: q2 = vcmp.gt(v13.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.w = vasr(v12.w,r6) -; CHECK-NEXT: v5.w = vsub(v14.w,v19.w) +; CHECK-NEXT: v4.w = vsub(v14.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = 
vasl(v1.w,r3) -; CHECK-NEXT: v21.w = vsub(v14.w,v12.w) -; CHECK-NEXT: v5.w = vmin(v5.w,v20.w) +; CHECK-NEXT: v2.w = vasl(v1.w,r2) +; CHECK-NEXT: v22.w = vsub(v14.w,v12.w) +; CHECK-NEXT: v4.w = vmin(v4.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v10.w = vasl(v0.w,r3) -; CHECK-NEXT: v4 = vor(v4,v3) +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: v10.w = vasl(v0.w,r2) +; CHECK-NEXT: v2 = vor(v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vlsr(v6.w,v8.w) +; CHECK-NEXT: v23 = vsplat(r2) +; CHECK-NEXT: v5.w = vlsr(v5.w,v8.w) ; CHECK-NEXT: v3 = vor(v10,v3) -; CHECK-NEXT: v10.w = vmin(v21.w,v20.w) +; CHECK-NEXT: v10.w = vmin(v22.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w) -; CHECK-NEXT: v24 = vmux(q1,v22,v6) -; CHECK-NEXT: q1 = vcmp.gt(v13.w,v5.w) +; CHECK-NEXT: v5 = vmux(q1,v23,v5) +; CHECK-NEXT: q1 = vcmp.gt(v13.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.w = vlsr(v4.w,v5.w) -; CHECK-NEXT: v25 = vmux(q2,v22,v7) +; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w) +; CHECK-NEXT: v24 = vmux(q2,v23,v7) ; CHECK-NEXT: q2 = vcmp.gt(v13.w,v10.w) -; CHECK-NEXT: v4 = vmux(q0,v13,v24) +; CHECK-NEXT: v25 = vmux(q0,v13,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.w = vlsr(v3.w,v10.w) -; CHECK-NEXT: v26 = vmux(q3,v13,v25) -; CHECK-NEXT: v2 = vmux(q1,v22,v23) +; CHECK-NEXT: v26 = vmux(q3,v13,v24) +; CHECK-NEXT: v2 = vmux(q1,v23,v2) ; CHECK-NEXT: q1 = vcmp.gt(v13.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27 = vmux(q2,v22,v3) +; CHECK-NEXT: v27 = vmux(q2,v23,v3) ; CHECK-NEXT: q3 = vcmp.gt(v13.w,v0.w) ; CHECK-NEXT: v28 = vmux(q1,v13,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uh = vpack(v26.w,v4.w):sat +; CHECK-NEXT: v29.uh = vpack(v26.w,v25.w):sat ; CHECK-NEXT: v1 = vmux(q3,v13,v27) ; CHECK-NEXT: } ; CHECK-NEXT: { diff --git a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll index 5cfa09b0822bb1..6814eaeeaacc21 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll @@ -19,9 +19,9 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v3.h = vsplat(r7) ; CHECK-NEXT: r3:2 = combine(#31,#5) -; CHECK-NEXT: v3.h = vabs(v0.h) +; CHECK-NEXT: v2.h = vabs(v0.h) ; CHECK-NEXT: v4.h = vabs(v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -31,60 +31,60 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r5 = ##32768 -; CHECK-NEXT: v5.uh = vcl0(v3.uh) +; CHECK-NEXT: v5.uh = vcl0(v2.uh) ; CHECK-NEXT: q0 = vcmp.gt(v9.h,v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10.h = vsplat(r5) ; CHECK-NEXT: r4 = #10 ; CHECK-NEXT: v6.uh = vcl0(v4.uh) -; CHECK-NEXT: v5.h = vadd(v5.h,v2.h) +; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vmux(q0,v10,v9) -; CHECK-NEXT: v6.h = vadd(v6.h,v2.h) +; CHECK-NEXT: v6.h = vadd(v6.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.h = vasl(v3.h,v5.h) +; CHECK-NEXT: v2.h = vasl(v2.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.h = vasl(v4.h,v6.h) -; CHECK-NEXT: v13 = vand(v3,v8) -; CHECK-NEXT: v11.h = vadd(v3.h,v7.h) +; CHECK-NEXT: v13 = vand(v2,v8) +; CHECK-NEXT: v11.h = vadd(v2.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.h = vadd(v4.h,v7.h) ; CHECK-NEXT: q2 = vcmp.eq(v13.h,v9.h) ; CHECK-NEXT: v8 = vand(v4,v8) -; CHECK-NEXT: q1 = vcmp.gt(v3.uh,v11.uh) +; CHECK-NEXT: q1 = 
vcmp.gt(v2.uh,v11.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r2) -; CHECK-NEXT: v13 = vmux(q2,v9,v2) +; CHECK-NEXT: v13 = vmux(q2,v9,v3) ; CHECK-NEXT: q2 = vcmp.eq(v8.h,v9.h) ; CHECK-NEXT: q3 = vcmp.gt(v4.uh,v14.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v20.uh = vlsr(v14.uh,r2) -; CHECK-NEXT: v22 = vmux(q2,v9,v2) -; CHECK-NEXT: v21 = vmux(q1,v2,v9) -; CHECK-NEXT: v2 = vmux(q3,v2,v9) +; CHECK-NEXT: v22 = vmux(q2,v9,v3) +; CHECK-NEXT: v21 = vmux(q1,v3,v9) +; CHECK-NEXT: v3 = vmux(q3,v3,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uh = vlsr(v4.uh,r2) ; CHECK-NEXT: v13.h = vadd(v11.h,v13.h) ; CHECK-NEXT: v24.h = vadd(v20.h,v22.h) -; CHECK-NEXT: v2.h = vadd(v2.h,v7.h) +; CHECK-NEXT: v3.h = vadd(v3.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uh = vlsr(v3.uh,r2) +; CHECK-NEXT: v12.uh = vlsr(v2.uh,r2) ; CHECK-NEXT: v23.h = vadd(v21.h,v7.h) -; CHECK-NEXT: v2.h = vsub(v2.h,v6.h) +; CHECK-NEXT: v3.h = vsub(v3.h,v6.h) ; CHECK-NEXT: q3 = vcmp.gt(v9.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r7) -; CHECK-NEXT: v3.h = vsub(v23.h,v5.h) +; CHECK-NEXT: v2.h = vsub(v23.h,v5.h) ; CHECK-NEXT: q2 = vcmp.eq(v12.h,v11.h) ; CHECK-NEXT: q1 = vcmp.eq(v19.h,v20.h) ; CHECK-NEXT: } @@ -103,21 +103,21 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v5 = vor(v27,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.h = vasl(v3.h,r4) +; CHECK-NEXT: v2.h = vasl(v2.h,r4) ; CHECK-NEXT: v4 = vmux(q1,v26,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vasl(v2.h,r4) +; CHECK-NEXT: v3.h = vasl(v3.h,r4) ; CHECK-NEXT: v4 = vor(v28,v4) -; CHECK-NEXT: v29 = vor(v5,v3) +; CHECK-NEXT: v29 = vor(v5,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vor(v4,v2) +; CHECK-NEXT: v3 = vor(v4,v3) ; CHECK-NEXT: v31 = vmux(q3,v9,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v9,v2) +; CHECK-NEXT: v30 = vmux(q2,v9,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -224,201 +224,201 @@ define void @s8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vsplat(r0) +; CHECK-NEXT: v5 = vsplat(r0) ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: v1 = valign(v0,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vsplat(r3) +; CHECK-NEXT: v10 = vsplat(r3) ; CHECK-NEXT: r7 = #512 -; CHECK-NEXT: v9:8.h = vunpack(v0.b) +; CHECK-NEXT: v3:2.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vsplat(r7) +; CHECK-NEXT: v11 = vsplat(r7) ; CHECK-NEXT: r6 = ##-2147483648 ; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 -; CHECK-NEXT: v7:6.h = vunpack(v1.b) +; CHECK-NEXT: v31:30.h = vunpack(v1.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8 = vsplat(r6) -; CHECK-NEXT: v1:0.w = vunpack(v8.h) +; CHECK-NEXT: v3:2.w = vunpack(v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7:6.w = vunpack(v6.h) -; CHECK-NEXT: v5.w = vabs(v0.w) -; CHECK-NEXT: v10.w = vabs(v1.w) +; CHECK-NEXT: v1:0.w = vunpack(v30.h) +; CHECK-NEXT: v4.w = vabs(v2.w) +; CHECK-NEXT: v8.w = vabs(v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.w = vabs(v6.w) -; CHECK-NEXT: v13.w = vabs(v7.w) +; CHECK-NEXT: v6.w = vabs(v0.w) +; CHECK-NEXT: v12.w = vabs(v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.uw = vcl0(v5.uw) +; CHECK-NEXT: v7.uw = vcl0(v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vcl0(v26.uw) -; CHECK-NEXT: v9.w = vadd(v9.w,v2.w) +; 
CHECK-NEXT: v13.uw = vcl0(v6.uw) +; CHECK-NEXT: v7.w = vadd(v7.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vcl0(v13.uw) -; CHECK-NEXT: v15.w = vadd(v12.w,v2.w) +; CHECK-NEXT: v14.uw = vcl0(v12.uw) +; CHECK-NEXT: v13.w = vadd(v13.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.uw = vcl0(v10.uw) -; CHECK-NEXT: v12.w = vadd(v14.w,v2.w) +; CHECK-NEXT: v9.uw = vcl0(v8.uw) +; CHECK-NEXT: v14.w = vadd(v14.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.w = vasl(v26.w,v15.w) -; CHECK-NEXT: v11.w = vadd(v11.w,v2.w) +; CHECK-NEXT: v6 = vsplat(r6) +; CHECK-NEXT: v16.w = vasl(v6.w,v13.w) +; CHECK-NEXT: v9.w = vadd(v9.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.w = vasl(v13.w,v12.w) -; CHECK-NEXT: v20 = vand(v27,v4) -; CHECK-NEXT: v19.w = vadd(v27.w,v3.w) +; CHECK-NEXT: v12.w = vasl(v12.w,v14.w) +; CHECK-NEXT: v20 = vand(v16,v11) +; CHECK-NEXT: v19.w = vadd(v16.w,v10.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v16.w = vasl(v5.w,v9.w) -; CHECK-NEXT: v5 = vxor(v5,v5) -; CHECK-NEXT: v23.w = vadd(v13.w,v3.w) -; CHECK-NEXT: v28 = vand(v13,v4) +; CHECK-NEXT: v15.w = vasl(v4.w,v7.w) +; CHECK-NEXT: v4 = vxor(v4,v4) +; CHECK-NEXT: v23.w = vadd(v12.w,v10.w) +; CHECK-NEXT: v28 = vand(v12,v11) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v17.w = vasl(v10.w,v11.w) -; CHECK-NEXT: q3 = vcmp.eq(v20.w,v5.w) -; CHECK-NEXT: q2 = vcmp.gt(v27.uw,v19.uw) -; CHECK-NEXT: q0 = vcmp.gt(v5.w,v6.w) +; CHECK-NEXT: v17.w = vasl(v8.w,v9.w) +; CHECK-NEXT: q3 = vcmp.eq(v20.w,v4.w) +; CHECK-NEXT: q2 = vcmp.gt(v16.uw,v19.uw) +; CHECK-NEXT: q0 = vcmp.gt(v4.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v27.uw,r2) -; CHECK-NEXT: v30 = vmux(q3,v5,v2) -; CHECK-NEXT: q3 = vcmp.eq(v28.w,v5.w) -; CHECK-NEXT: v22 = vand(v17,v4) +; CHECK-NEXT: v21.uw = vlsr(v16.uw,r2) +; CHECK-NEXT: v30 = vmux(q3,v4,v5) +; CHECK-NEXT: q3 = vcmp.eq(v28.w,v4.w) +; CHECK-NEXT: v22 = vand(v17,v11) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vlsr(v19.uw,r2) -; CHECK-NEXT: v27 = vmux(q3,v5,v2) -; CHECK-NEXT: q1 = vcmp.eq(v22.w,v5.w) -; CHECK-NEXT: v24 = vmux(q2,v2,v5) +; CHECK-NEXT: v16.uw = vlsr(v19.uw,r2) +; CHECK-NEXT: v27 = vmux(q3,v4,v5) +; CHECK-NEXT: q1 = vcmp.eq(v22.w,v4.w) +; CHECK-NEXT: v24 = vmux(q2,v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31.uw = vlsr(v23.uw,r2) -; CHECK-NEXT: v22.w = vadd(v14.w,v30.w) -; CHECK-NEXT: v30.w = vadd(v17.w,v3.w) -; CHECK-NEXT: q2 = vcmp.eq(v21.w,v14.w) +; CHECK-NEXT: v22.w = vadd(v16.w,v30.w) +; CHECK-NEXT: v30.w = vadd(v17.w,v10.w) +; CHECK-NEXT: q2 = vcmp.eq(v21.w,v16.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v13.uw,r2) +; CHECK-NEXT: v29.uw = vlsr(v12.uw,r2) ; CHECK-NEXT: v28.w = vadd(v31.w,v27.w) -; CHECK-NEXT: v3.w = vadd(v16.w,v3.w) -; CHECK-NEXT: v4 = vand(v16,v4) +; CHECK-NEXT: v10.w = vadd(v15.w,v10.w) +; CHECK-NEXT: v11 = vand(v15,v11) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vlsr(v14.uw,r0) +; CHECK-NEXT: v16.uw = vlsr(v16.uw,r0) ; CHECK-NEXT: q3 = vcmp.eq(v29.w,v31.w) -; CHECK-NEXT: v18 = vmux(q0,v8,v5) -; CHECK-NEXT: q0 = vcmp.gt(v5.w,v7.w) +; CHECK-NEXT: v18 = vmux(q0,v6,v4) +; CHECK-NEXT: q0 = vcmp.gt(v4.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uw = vlsr(v31.uw,r0) -; CHECK-NEXT: v26 = vmux(q1,v5,v2) -; CHECK-NEXT: v31 = vmux(q0,v8,v5) -; CHECK-NEXT: q0 = vcmp.gt(v16.uw,v3.uw) +; CHECK-NEXT: v26 = vmux(q1,v4,v5) +; CHECK-NEXT: v31 = vmux(q0,v6,v4) +; CHECK-NEXT: q0 = vcmp.gt(v15.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v10 = 
vsplat(r5) +; CHECK-NEXT: v8 = vsplat(r5) ; CHECK-NEXT: v29.uw = vlsr(v22.uw,r0) -; CHECK-NEXT: v15.w = vsub(v24.w,v15.w) +; CHECK-NEXT: v13.w = vsub(v24.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v20.uw = vlsr(v28.uw,r0) -; CHECK-NEXT: v14 = vmux(q2,v29,v14) -; CHECK-NEXT: q2 = vcmp.gt(v13.uw,v23.uw) -; CHECK-NEXT: v15.w = vadd(v15.w,v10.w) +; CHECK-NEXT: v16 = vmux(q2,v29,v16) +; CHECK-NEXT: q2 = vcmp.gt(v12.uw,v23.uw) +; CHECK-NEXT: v13.w = vadd(v13.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2) ; CHECK-NEXT: v19 = vmux(q3,v20,v19) -; CHECK-NEXT: q3 = vcmp.eq(v4.w,v5.w) -; CHECK-NEXT: v27 = vmux(q2,v2,v5) +; CHECK-NEXT: q3 = vcmp.eq(v11.w,v4.w) +; CHECK-NEXT: v27 = vmux(q2,v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2) +; CHECK-NEXT: v10.uw = vlsr(v10.uw,r2) ; CHECK-NEXT: q2 = vcmp.gt(v17.uw,v30.uw) ; CHECK-NEXT: v28.w = vadd(v25.w,v26.w) -; CHECK-NEXT: v29 = vmux(q3,v5,v2) +; CHECK-NEXT: v29 = vmux(q3,v4,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v17.uw = vlsr(v17.uw,r2) ; CHECK-NEXT: v19 = vor(v31,v19) -; CHECK-NEXT: v31 = vmux(q2,v2,v5) -; CHECK-NEXT: v2 = vmux(q0,v2,v5) +; CHECK-NEXT: v31 = vmux(q2,v5,v4) +; CHECK-NEXT: v5 = vmux(q0,v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.uw = vlsr(v16.uw,r2) -; CHECK-NEXT: v30.w = vadd(v3.w,v29.w) -; CHECK-NEXT: v2.w = vsub(v2.w,v9.w) -; CHECK-NEXT: v11.w = vsub(v31.w,v11.w) +; CHECK-NEXT: v24.uw = vlsr(v15.uw,r2) +; CHECK-NEXT: v30.w = vadd(v10.w,v29.w) +; CHECK-NEXT: v5.w = vsub(v5.w,v7.w) +; CHECK-NEXT: v9.w = vsub(v31.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v15.uw = vlsr(v28.uw,r0) ; CHECK-NEXT: q3 = vcmp.eq(v17.w,v25.w) -; CHECK-NEXT: v4.w = vsub(v27.w,v12.w) -; CHECK-NEXT: v2.w = vadd(v2.w,v10.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v8.w) +; CHECK-NEXT: q0 = vcmp.eq(v24.w,v10.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.uw = vlsr(v25.uw,r0) -; CHECK-NEXT: q0 = vcmp.eq(v24.w,v3.w) -; CHECK-NEXT: v21.w = vadd(v11.w,v10.w) -; CHECK-NEXT: q2 = vcmp.gt(v5.w,v1.w) +; CHECK-NEXT: v12.uw = vlsr(v25.uw,r0) +; CHECK-NEXT: v21.w = vadd(v9.w,v8.w) +; CHECK-NEXT: v11.w = vsub(v27.w,v14.w) +; CHECK-NEXT: q2 = vcmp.gt(v4.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v22.uw = vlsr(v30.uw,r0) -; CHECK-NEXT: v23 = vmux(q3,v16,v13) -; CHECK-NEXT: q3 = vcmp.gt(v5.w,v0.w) -; CHECK-NEXT: v24 = vmux(q2,v8,v5) +; CHECK-NEXT: v12 = vmux(q3,v15,v12) +; CHECK-NEXT: q3 = vcmp.gt(v4.w,v2.w) +; CHECK-NEXT: v23 = vmux(q2,v6,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.uw = vlsr(v3.uw,r0) -; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) -; CHECK-NEXT: v8 = vmux(q3,v8,v5) -; CHECK-NEXT: v10 = vor(v24,v23) +; CHECK-NEXT: v10.uw = vlsr(v10.uw,r0) +; CHECK-NEXT: v8.w = vadd(v11.w,v8.w) +; CHECK-NEXT: v6 = vmux(q3,v6,v4) +; CHECK-NEXT: v24 = vor(v23,v12) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v21.w,r4) -; CHECK-NEXT: v3 = vmux(q0,v22,v3) -; CHECK-NEXT: v14 = vor(v18,v14) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w) +; CHECK-NEXT: v7.w = vasl(v21.w,r4) +; CHECK-NEXT: v9 = vmux(q0,v22,v10) +; CHECK-NEXT: v16 = vor(v18,v16) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,r4) -; CHECK-NEXT: v3 = vor(v8,v3) -; CHECK-NEXT: v25 = vor(v10,v9) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v5.w) +; CHECK-NEXT: v5.w = vasl(v5.w,r4) +; CHECK-NEXT: v6 = vor(v6,v9) +; CHECK-NEXT: v7 = vor(v24,v7) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; 
CHECK-NEXT: v15.w = vasl(v15.w,r4) -; CHECK-NEXT: v2 = vor(v3,v2) -; CHECK-NEXT: v27 = vmux(q2,v5,v25) -; CHECK-NEXT: vmem(r1+#1) = v27.new +; CHECK-NEXT: v13.w = vasl(v13.w,r4) +; CHECK-NEXT: v5 = vor(v6,v5) +; CHECK-NEXT: v26 = vmux(q2,v4,v7) +; CHECK-NEXT: vmem(r1+#1) = v26.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.w = vasl(v4.w,r4) -; CHECK-NEXT: v29 = vmux(q3,v5,v2) -; CHECK-NEXT: q2 = vcmp.eq(v7.w,v5.w) -; CHECK-NEXT: vmem(r1+#0) = v29.new +; CHECK-NEXT: v25.w = vasl(v8.w,r4) +; CHECK-NEXT: v28 = vmux(q3,v4,v5) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v4.w) +; CHECK-NEXT: vmem(r1+#0) = v28.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28 = vor(v19,v26) -; CHECK-NEXT: v30 = vor(v14,v15) -; CHECK-NEXT: q3 = vcmp.eq(v6.w,v5.w) +; CHECK-NEXT: v27 = vor(v19,v25) +; CHECK-NEXT: v29 = vor(v16,v13) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q2,v5,v28) -; CHECK-NEXT: v31 = vmux(q3,v5,v30) -; CHECK-NEXT: vmem(r1+#3) = v0.new +; CHECK-NEXT: v30 = vmux(q2,v4,v27) +; CHECK-NEXT: v31 = vmux(q3,v4,v29) +; CHECK-NEXT: vmem(r1+#3) = v30.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 @@ -436,47 +436,47 @@ define void @s8f32_1(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r0 = #1 -; CHECK-NEXT: v3:2.h = vunpack(v0.b) -; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vsplat(r0) +; CHECK-NEXT: r0 = #1 ; CHECK-NEXT: r3:2 = combine(##255,#8) -; CHECK-NEXT: r6 = #512 +; CHECK-NEXT: v1:0.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r0) ; CHECK-NEXT: v7 = vsplat(r3) -; CHECK-NEXT: v3:2.w = vunpack(v2.h) -; CHECK-NEXT: v22 = vxor(v22,v22) +; CHECK-NEXT: r6 = #512 +; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r6) ; CHECK-NEXT: r7 = ##-2147483648 -; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: v1:0.w = vunpack(v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vsplat(r7) -; CHECK-NEXT: v4.w = vabs(v2.w) -; CHECK-NEXT: v5.w = vabs(v3.w) -; CHECK-NEXT: q0 = vcmp.gt(v22.w,v2.w) +; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: v4.w = vabs(v0.w) +; CHECK-NEXT: v5.w = vabs(v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12 = vsplat(r5) -; CHECK-NEXT: r4 = #23 -; CHECK-NEXT: v11 = vmux(q0,v9,v22) -; CHECK-NEXT: q0 = vcmp.gt(v22.w,v3.w) +; CHECK-NEXT: q0 = vcmp.gt(v3.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v6.uw = vcl0(v4.uw) -; CHECK-NEXT: v30 = vmux(q0,v9,v22) +; CHECK-NEXT: v11 = vmux(q0,v9,v3) +; CHECK-NEXT: q0 = vcmp.gt(v3.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uw = vcl0(v5.uw) -; CHECK-NEXT: v6.w = vadd(v6.w,v1.w) +; CHECK-NEXT: v27 = vmux(q0,v9,v3) +; CHECK-NEXT: v6.w = vadd(v6.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vadd(v8.w,v1.w) +; CHECK-NEXT: v8.w = vadd(v8.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,v6.w) @@ -490,68 +490,68 @@ define void @s8f32_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v10 = vand(v5,v10) ; CHECK-NEXT: v7.w = vadd(v5.w,v7.w) ; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v14.uw) -; CHECK-NEXT: q1 = vcmp.eq(v13.w,v22.w) +; CHECK-NEXT: q1 = vcmp.eq(v13.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uw = vlsr(v14.uw,r2) -; CHECK-NEXT: q3 = vcmp.eq(v10.w,v22.w) -; CHECK-NEXT: v25 = vmux(q2,v1,v22) +; CHECK-NEXT: q3 = vcmp.eq(v10.w,v3.w) +; CHECK-NEXT: v22 = vmux(q2,v2,v3) ; CHECK-NEXT: q2 = vcmp.gt(v5.uw,v7.uw) ; CHECK-NEXT: } ; 
CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) -; CHECK-NEXT: v26 = vmux(q1,v22,v1) -; CHECK-NEXT: v27 = vmux(q3,v22,v1) -; CHECK-NEXT: v1 = vmux(q2,v1,v22) +; CHECK-NEXT: v23 = vmux(q1,v3,v2) +; CHECK-NEXT: v24 = vmux(q3,v3,v2) +; CHECK-NEXT: v2 = vmux(q2,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) -; CHECK-NEXT: v5.w = vadd(v14.w,v26.w) -; CHECK-NEXT: v29.w = vadd(v7.w,v27.w) -; CHECK-NEXT: v6.w = vsub(v25.w,v6.w) +; CHECK-NEXT: v21.uw = vlsr(v5.uw,r2) +; CHECK-NEXT: v5.w = vadd(v14.w,v23.w) +; CHECK-NEXT: v26.w = vadd(v7.w,v24.w) +; CHECK-NEXT: v6.w = vsub(v22.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v4.uw,r2) -; CHECK-NEXT: v1.w = vsub(v1.w,v8.w) +; CHECK-NEXT: v20.uw = vlsr(v4.uw,r2) +; CHECK-NEXT: v2.w = vsub(v2.w,v8.w) ; CHECK-NEXT: v6.w = vadd(v6.w,v12.w) -; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w) +; CHECK-NEXT: q3 = vcmp.eq(v21.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.uw = vlsr(v14.uw,r0) -; CHECK-NEXT: v1.w = vadd(v1.w,v12.w) -; CHECK-NEXT: q1 = vcmp.eq(v23.w,v14.w) -; CHECK-NEXT: q2 = vcmp.eq(v3.w,v22.w) +; CHECK-NEXT: v25.uw = vlsr(v14.uw,r0) +; CHECK-NEXT: v2.w = vadd(v2.w,v12.w) +; CHECK-NEXT: q1 = vcmp.eq(v20.w,v14.w) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r0) -; CHECK-NEXT: v5 = vmux(q1,v5,v28) +; CHECK-NEXT: v5 = vmux(q1,v5,v25) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.uw = vlsr(v29.uw,r0) +; CHECK-NEXT: v4.uw = vlsr(v26.uw,r0) ; CHECK-NEXT: v5 = vor(v11,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,r4) ; CHECK-NEXT: v4 = vmux(q3,v4,v7) -; CHECK-NEXT: q3 = vcmp.eq(v2.w,v22.w) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.w = vasl(v1.w,r4) -; CHECK-NEXT: v4 = vor(v30,v4) -; CHECK-NEXT: v31 = vor(v5,v6) +; CHECK-NEXT: v2.w = vasl(v2.w,r4) +; CHECK-NEXT: v4 = vor(v27,v4) +; CHECK-NEXT: v29 = vor(v5,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vor(v4,v1) -; CHECK-NEXT: v0 = vmux(q3,v22,v31) -; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: v28 = vor(v4,v2) +; CHECK-NEXT: v31 = vmux(q3,v3,v29) +; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q2,v22,v1) +; CHECK-NEXT: v30 = vmux(q2,v3,v28) ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: vmem(r1+#1) = v1.new +; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } %v0 = load <64 x i8>, ptr %a0, align 128 %v1 = sitofp <64 x i8> %v0 to <64 x float> @@ -569,79 +569,79 @@ define void @s8f32_2(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #1 -; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: r2 = #255 ; CHECK-NEXT: v1:0.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vsplat(r0) -; CHECK-NEXT: v4 = vsplat(r3) -; CHECK-NEXT: r2 = #255 -; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: v1 = vsplat(r0) +; CHECK-NEXT: v28 = vsplat(r2) +; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r3) ; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) -; CHECK-NEXT: r4 = #159 -; CHECK-NEXT: v1:0.w = vunpack(v0.h) +; CHECK-NEXT: v5:4.w = vunpack(v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vsplat(r2) -; CHECK-NEXT: v8 = vsplat(r4) -; CHECK-NEXT: v5.w = vabs(v0.w) -; CHECK-NEXT: q2 = vcmp.gt(v3.w,v0.w) +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v5.w = vabs(v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v8 = vsplat(r4) 
; CHECK-NEXT: v7 = vsplat(r7) ; CHECK-NEXT: r2 = #23 +; CHECK-NEXT: q2 = vcmp.gt(v2.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vcl0(v5.uw) -; CHECK-NEXT: v30 = vmux(q2,v7,v3) +; CHECK-NEXT: v31 = vmux(q2,v7,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vadd(v6.w,v2.w) +; CHECK-NEXT: v6.w = vadd(v6.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vasl(v5.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.w = vadd(v5.w,v1.w) -; CHECK-NEXT: v4 = vand(v5,v4) +; CHECK-NEXT: v0.w = vadd(v5.w,v28.w) +; CHECK-NEXT: v3 = vand(v5,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) -; CHECK-NEXT: q0 = vcmp.eq(v4.w,v3.w) -; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v1.uw) +; CHECK-NEXT: q0 = vcmp.eq(v3.w,v2.w) +; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v0.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) -; CHECK-NEXT: v4 = vmux(q0,v3,v2) -; CHECK-NEXT: v2 = vmux(q1,v2,v3) +; CHECK-NEXT: v0.uw = vlsr(v0.uw,r6) +; CHECK-NEXT: v3 = vmux(q0,v2,v1) +; CHECK-NEXT: v1 = vmux(q1,v1,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vadd(v1.w,v4.w) -; CHECK-NEXT: v2.w = vsub(v2.w,v6.w) -; CHECK-NEXT: q3 = vcmp.eq(v5.w,v1.w) +; CHECK-NEXT: v3.w = vadd(v0.w,v3.w) +; CHECK-NEXT: v1.w = vsub(v1.w,v6.w) +; CHECK-NEXT: q3 = vcmp.eq(v5.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.uw = vlsr(v1.uw,r0) -; CHECK-NEXT: v2.w = vadd(v2.w,v8.w) +; CHECK-NEXT: v29.uw = vlsr(v0.uw,r0) +; CHECK-NEXT: v1.w = vadd(v1.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v4.uw,r0) +; CHECK-NEXT: v30.uw = vlsr(v3.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,r2) -; CHECK-NEXT: v1 = vmux(q3,v29,v28) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: v1.w = vasl(v1.w,r2) +; CHECK-NEXT: v0 = vmux(q3,v30,v29) +; CHECK-NEXT: q3 = vcmp.eq(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vor(v30,v1) +; CHECK-NEXT: v0 = vor(v31,v0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v31 = vor(v1,v2) +; CHECK-NEXT: v0 = vor(v0,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q3,v3,v31) +; CHECK-NEXT: v0 = vmux(q3,v2,v0) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } @@ -738,25 +738,25 @@ define void @s16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r3:2 = combine(#31,#1) -; CHECK-NEXT: r7 = #64 +; CHECK-NEXT: r7 = #1 +; CHECK-NEXT: r3:2 = combine(#31,#64) ; CHECK-NEXT: v1.h = vabs(v0.h) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vsplat(r2) +; CHECK-NEXT: v2.h = vsplat(r7) ; CHECK-NEXT: v5.h = vsplat(r3) ; CHECK-NEXT: r6 = #5 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.h = vsplat(r7) +; CHECK-NEXT: v6.h = vsplat(r2) ; CHECK-NEXT: r4 = ##32768 ; CHECK-NEXT: v4.uh = vcl0(v1.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.h = vsplat(r4) -; CHECK-NEXT: r4 = #10 +; CHECK-NEXT: r3 = #10 ; CHECK-NEXT: q2 = vcmp.gt(v3.h,v0.h) ; CHECK-NEXT: v4.h = vadd(v4.h,v2.h) ; CHECK-NEXT: } @@ -773,13 +773,13 @@ define void @s16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.uh = vlsr(v1.uh,r6) -; CHECK-NEXT: q1 = vcmp.eq(v6.h,v3.h) -; CHECK-NEXT: q0 = vcmp.gt(v1.uh,v7.uh) +; CHECK-NEXT: q0 = vcmp.eq(v6.h,v3.h) +; CHECK-NEXT: q1 = vcmp.gt(v1.uh,v7.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uh = vlsr(v7.uh,r6) -; CHECK-NEXT: v26 = vmux(q1,v3,v2) -; CHECK-NEXT: v2 = vmux(q0,v2,v3) +; 
CHECK-NEXT: v26 = vmux(q0,v3,v2) +; CHECK-NEXT: v2 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) @@ -787,15 +787,15 @@ define void @s16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: q3 = vcmp.eq(v1.h,v25.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uh = vlsr(v25.uh,r2) +; CHECK-NEXT: v27.uh = vlsr(v25.uh,r7) ; CHECK-NEXT: v28.h = vsub(v2.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2) +; CHECK-NEXT: v29.uh = vlsr(v7.uh,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.h = vasl(v28.h,r4) -; CHECK-NEXT: q3 = vsetq(r7) +; CHECK-NEXT: v1.h = vasl(v28.h,r3) +; CHECK-NEXT: q3 = vsetq(r2) ; CHECK-NEXT: v2 = vmux(q3,v29,v27) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -835,61 +835,61 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vsplat(r0) ; CHECK-NEXT: r7 = #512 -; CHECK-NEXT: v4.w = vabs(v0.w) +; CHECK-NEXT: v2.w = vabs(v0.w) ; CHECK-NEXT: v6.w = vabs(v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5 = vsplat(r3) ; CHECK-NEXT: v9 = vsplat(r7) ; CHECK-NEXT: r5 = #159 -; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: v4 = vxor(v4,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: r6 = ##-2147483648 -; CHECK-NEXT: v7.uw = vcl0(v4.uw) +; CHECK-NEXT: v7.uw = vcl0(v2.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r6) ; CHECK-NEXT: v8.uw = vcl0(v6.uw) -; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w) +; CHECK-NEXT: q0 = vcmp.gt(v4.w,v0.w) ; CHECK-NEXT: v7.w = vadd(v7.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v8.w = vadd(v8.w,v3.w) -; CHECK-NEXT: v27 = vmux(q0,v10,v2) +; CHECK-NEXT: v27 = vmux(q0,v10,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasl(v4.w,v7.w) +; CHECK-NEXT: v2.w = vasl(v2.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,v8.w) -; CHECK-NEXT: v11.w = vadd(v4.w,v5.w) -; CHECK-NEXT: v12 = vand(v4,v9) +; CHECK-NEXT: v11.w = vadd(v2.w,v5.w) +; CHECK-NEXT: v12 = vand(v2,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v6.w,v5.w) ; CHECK-NEXT: v9 = vand(v6,v9) -; CHECK-NEXT: q1 = vcmp.eq(v12.w,v2.w) -; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v11.uw) +; CHECK-NEXT: q1 = vcmp.eq(v12.w,v4.w) +; CHECK-NEXT: q2 = vcmp.gt(v2.uw,v11.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v22.uw = vlsr(v11.uw,r2) -; CHECK-NEXT: q3 = vcmp.eq(v9.w,v2.w) -; CHECK-NEXT: v23 = vmux(q1,v2,v3) -; CHECK-NEXT: v14 = vmux(q2,v3,v2) +; CHECK-NEXT: q3 = vcmp.eq(v9.w,v4.w) +; CHECK-NEXT: v23 = vmux(q1,v4,v3) +; CHECK-NEXT: v14 = vmux(q2,v3,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) ; CHECK-NEXT: v11.w = vadd(v22.w,v23.w) ; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v5.uw) -; CHECK-NEXT: v25 = vmux(q3,v2,v3) +; CHECK-NEXT: v25 = vmux(q3,v4,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v4.uw,r2) +; CHECK-NEXT: v21.uw = vlsr(v2.uw,r2) ; CHECK-NEXT: v5.w = vadd(v24.w,v25.w) -; CHECK-NEXT: v3 = vmux(q2,v3,v2) +; CHECK-NEXT: v3 = vmux(q2,v3,v4) ; CHECK-NEXT: v7.w = vsub(v14.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -899,7 +899,7 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v7.w = vadd(v7.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.uw = vlsr(v22.uw,r0) +; CHECK-NEXT: v2.uw = vlsr(v22.uw,r0) ; CHECK-NEXT: v3.w = vadd(v3.w,v13.w) ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v24.w) ; CHECK-NEXT: } @@ -908,32 +908,32 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0) -; CHECK-NEXT: v4 = 
vmux(q3,v11,v4) -; CHECK-NEXT: q3 = vcmp.gt(v2.w,v1.w) +; CHECK-NEXT: v2 = vmux(q3,v11,v2) +; CHECK-NEXT: q3 = vcmp.gt(v4.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uw = vlsr(v24.uw,r0) -; CHECK-NEXT: v28 = vmux(q3,v10,v2) -; CHECK-NEXT: v4 = vor(v27,v4) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) +; CHECK-NEXT: v28 = vmux(q3,v10,v4) +; CHECK-NEXT: v2 = vor(v27,v2) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vasl(v7.w,r4) ; CHECK-NEXT: v5 = vmux(q2,v5,v26) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.w = vasl(v3.w,r4) ; CHECK-NEXT: v5 = vor(v28,v5) -; CHECK-NEXT: v29 = vor(v4,v7) +; CHECK-NEXT: v29 = vor(v2,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vor(v5,v3) -; CHECK-NEXT: v31 = vmux(q3,v2,v29) +; CHECK-NEXT: v31 = vmux(q3,v4,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v2,v3) +; CHECK-NEXT: v30 = vmux(q2,v4,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -1042,114 +1042,114 @@ define void @s32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 -; CHECK-NEXT: v6.w = vabs(v1.w) -; CHECK-NEXT: v1.cur = vmem(r0+#0) +; CHECK-NEXT: v3.w = vabs(v2.w) +; CHECK-NEXT: v2.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vsplat(r2) +; CHECK-NEXT: v4 = vsplat(r2) ; CHECK-NEXT: r4 = #512 -; CHECK-NEXT: v5.w = vabs(v0.w) +; CHECK-NEXT: v1.w = vabs(v0.w) ; CHECK-NEXT: v0.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vsplat(r4) ; CHECK-NEXT: v8 = vsplat(r6) -; CHECK-NEXT: v3.uw = vcl0(v6.uw) -; CHECK-NEXT: v20 = vxor(v20,v20) +; CHECK-NEXT: v6.uw = vcl0(v3.uw) +; CHECK-NEXT: v7 = vxor(v7,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 -; CHECK-NEXT: v4.uw = vcl0(v5.uw) -; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) +; CHECK-NEXT: v5.uw = vcl0(v1.uw) +; CHECK-NEXT: v6.w = vadd(v6.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vsplat(r4) ; CHECK-NEXT: r5 = ##-2147483648 -; CHECK-NEXT: v7.w = vadd(v4.w,v2.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) -; CHECK-NEXT: v6.w = vasl(v6.w,v3.w) -; CHECK-NEXT: q0 = vcmp.gt(v20.w,v1.w) +; CHECK-NEXT: v3.w = vasl(v3.w,v6.w) +; CHECK-NEXT: q0 = vcmp.gt(v7.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasl(v5.w,v7.w) -; CHECK-NEXT: v26 = vmux(q0,v13,v20) -; CHECK-NEXT: v10.w = vadd(v6.w,v8.w) -; CHECK-NEXT: v11 = vand(v6,v9) +; CHECK-NEXT: v1.w = vasl(v1.w,v5.w) +; CHECK-NEXT: v26 = vmux(q0,v13,v7) +; CHECK-NEXT: v10.w = vadd(v3.w,v8.w) +; CHECK-NEXT: v11 = vand(v3,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9 = vand(v5,v9) -; CHECK-NEXT: q3 = vcmp.eq(v11.w,v20.w) -; CHECK-NEXT: v8.w = vadd(v5.w,v8.w) -; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v10.uw) +; CHECK-NEXT: v9 = vand(v1,v9) +; CHECK-NEXT: q1 = vcmp.eq(v11.w,v7.w) +; CHECK-NEXT: v8.w = vadd(v1.w,v8.w) +; CHECK-NEXT: q2 = vcmp.gt(v3.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v10.uw,r3) -; CHECK-NEXT: q2 = vcmp.eq(v9.w,v20.w) -; CHECK-NEXT: v22 = vmux(q3,v20,v2) -; CHECK-NEXT: q3 = vcmp.gt(v5.uw,v8.uw) +; CHECK-NEXT: v12.uw = vlsr(v3.uw,r3) +; CHECK-NEXT: q3 = vcmp.eq(v9.w,v7.w) +; CHECK-NEXT: v22 = vmux(q1,v7,v4) +; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v8.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3) -; CHECK-NEXT: v9.w = vadd(v21.w,v22.w) 
-; CHECK-NEXT: v24 = vmux(q2,v20,v2) -; CHECK-NEXT: v23 = vmux(q1,v2,v20) +; CHECK-NEXT: v3.uw = vlsr(v10.uw,r3) +; CHECK-NEXT: v24 = vmux(q3,v7,v4) +; CHECK-NEXT: v23 = vmux(q2,v4,v7) +; CHECK-NEXT: v4 = vmux(q1,v4,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vlsr(v6.uw,r3) -; CHECK-NEXT: v2 = vmux(q3,v2,v20) -; CHECK-NEXT: v25.w = vadd(v8.w,v24.w) -; CHECK-NEXT: v3.w = vsub(v23.w,v3.w) +; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3) +; CHECK-NEXT: v9.w = vadd(v3.w,v22.w) +; CHECK-NEXT: v6.w = vsub(v23.w,v6.w) +; CHECK-NEXT: v4.w = vsub(v4.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vlsr(v5.uw,r3) -; CHECK-NEXT: v2.w = vsub(v2.w,v7.w) -; CHECK-NEXT: q3 = vcmp.eq(v12.w,v21.w) -; CHECK-NEXT: v3.w = vadd(v3.w,v27.w) +; CHECK-NEXT: v1.uw = vlsr(v1.uw,r3) +; CHECK-NEXT: v25.w = vadd(v8.w,v24.w) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v3.w) +; CHECK-NEXT: v6.w = vadd(v6.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 -; CHECK-NEXT: v6.uw = vlsr(v21.uw,r2) -; CHECK-NEXT: q2 = vcmp.eq(v5.w,v8.w) -; CHECK-NEXT: v2.w = vadd(v2.w,v27.w) +; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v8.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v25.uw,r2) -; CHECK-NEXT: v6 = vmux(q3,v9,v6) -; CHECK-NEXT: q3 = vcmp.gt(v20.w,v0.w) +; CHECK-NEXT: v3 = vmux(q3,v9,v3) +; CHECK-NEXT: q3 = vcmp.gt(v7.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uw = vlsr(v8.uw,r2) -; CHECK-NEXT: v30 = vmux(q3,v13,v20) -; CHECK-NEXT: v6 = vor(v26,v6) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v20.w) +; CHECK-NEXT: v31 = vmux(q3,v13,v7) +; CHECK-NEXT: v3 = vor(v26,v3) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vasl(v3.w,r3) -; CHECK-NEXT: v5 = vmux(q2,v28,v29) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v20.w) +; CHECK-NEXT: v30.w = vasl(v6.w,r3) +; CHECK-NEXT: v1 = vmux(q2,v28,v29) +; CHECK-NEXT: q2 = vcmp.eq(v2.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,r3) -; CHECK-NEXT: v31 = vor(v30,v5) -; CHECK-NEXT: v3 = vor(v6,v3) +; CHECK-NEXT: v2.w = vasl(v4.w,r3) +; CHECK-NEXT: v1 = vor(v31,v1) +; CHECK-NEXT: v3 = vor(v3,v30) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vor(v31,v2) -; CHECK-NEXT: v3 = vmux(q2,v20,v3) +; CHECK-NEXT: v1 = vor(v1,v2) +; CHECK-NEXT: v3 = vmux(q2,v7,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q3,v20,v1) +; CHECK-NEXT: v0 = vmux(q3,v7,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v20.sf) +; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v7.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v20.sf) +; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v7.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.hf = v3:2.qf32 @@ -1452,60 +1452,60 @@ define void @u8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.h = vsplat(r6) -; CHECK-NEXT: v4.h = vsplat(r3) +; CHECK-NEXT: v2.h = vsplat(r6) +; CHECK-NEXT: v5.h = vsplat(r3) ; CHECK-NEXT: r5 = #64 -; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vsplat(r5) ; CHECK-NEXT: r4 = #10 -; CHECK-NEXT: v5.uh = vcl0(v0.uh) +; CHECK-NEXT: v4.uh = vcl0(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uh = vcl0(v1.uh) -; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) +; CHECK-NEXT: v4.h = vadd(v4.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.h = 
vadd(v7.h,v3.h) +; CHECK-NEXT: v7.h = vadd(v7.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.h = vasl(v0.h,v5.h) +; CHECK-NEXT: v8.h = vasl(v0.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.h = vasl(v1.h,v7.h) ; CHECK-NEXT: v10 = vand(v8,v6) -; CHECK-NEXT: v9.h = vadd(v8.h,v4.h) +; CHECK-NEXT: v9.h = vadd(v8.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22.h = vadd(v11.h,v4.h) +; CHECK-NEXT: v22.h = vadd(v11.h,v5.h) ; CHECK-NEXT: v6 = vand(v11,v6) ; CHECK-NEXT: q0 = vcmp.gt(v8.uh,v9.uh) -; CHECK-NEXT: q1 = vcmp.eq(v10.h,v2.h) +; CHECK-NEXT: q1 = vcmp.eq(v10.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uh = vlsr(v8.uh,r2) -; CHECK-NEXT: q2 = vcmp.eq(v6.h,v2.h) +; CHECK-NEXT: q2 = vcmp.eq(v6.h,v3.h) ; CHECK-NEXT: q3 = vcmp.gt(v11.uh,v22.uh) -; CHECK-NEXT: v12 = vmux(q1,v2,v3) +; CHECK-NEXT: v12 = vmux(q1,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uh = vlsr(v9.uh,r2) -; CHECK-NEXT: v13 = vmux(q2,v2,v3) -; CHECK-NEXT: v25 = vmux(q0,v3,v2) -; CHECK-NEXT: v3 = vmux(q3,v3,v2) +; CHECK-NEXT: v13 = vmux(q2,v3,v2) +; CHECK-NEXT: v25 = vmux(q0,v2,v3) +; CHECK-NEXT: v2 = vmux(q3,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uh = vlsr(v22.uh,r2) ; CHECK-NEXT: v24.h = vadd(v9.h,v12.h) -; CHECK-NEXT: v3.h = vadd(v3.h,v4.h) -; CHECK-NEXT: v12.h = vadd(v25.h,v4.h) +; CHECK-NEXT: v2.h = vadd(v2.h,v5.h) +; CHECK-NEXT: v12.h = vadd(v25.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v23.uh = vlsr(v11.uh,r2) ; CHECK-NEXT: v13.h = vadd(v8.h,v13.h) -; CHECK-NEXT: v5.h = vsub(v12.h,v5.h) -; CHECK-NEXT: v3.h = vsub(v3.h,v7.h) +; CHECK-NEXT: v4.h = vsub(v12.h,v4.h) +; CHECK-NEXT: v2.h = vsub(v2.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uh = vlsr(v9.uh,r6) @@ -1517,28 +1517,28 @@ define void @u8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uh = vlsr(v13.uh,r6) -; CHECK-NEXT: v4 = vmux(q2,v26,v14) -; CHECK-NEXT: q2 = vcmp.eq(v1.h,v2.h) +; CHECK-NEXT: v5 = vmux(q2,v26,v14) +; CHECK-NEXT: q2 = vcmp.eq(v1.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uh = vlsr(v8.uh,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.h = vasl(v5.h,r4) +; CHECK-NEXT: v4.h = vasl(v4.h,r4) ; CHECK-NEXT: v6 = vmux(q3,v27,v28) -; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) +; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.h = vasl(v3.h,r4) -; CHECK-NEXT: v29 = vor(v4,v5) +; CHECK-NEXT: v2.h = vasl(v2.h,r4) +; CHECK-NEXT: v29 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vor(v6,v3) -; CHECK-NEXT: v31 = vmux(q3,v2,v29) +; CHECK-NEXT: v2 = vor(v6,v2) +; CHECK-NEXT: v31 = vmux(q3,v3,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v2,v3) +; CHECK-NEXT: v30 = vmux(q2,v3,v2) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -1637,17 +1637,18 @@ define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vsplat(r0) +; CHECK-NEXT: v5 = vsplat(r0) ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: v1 = valign(v0,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v15 = vsplat(r6) +; CHECK-NEXT: v14 = vsplat(r6) ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v16 = vsplat(r5) ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v31:30.uh = vunpack(v1.ub) ; CHECK-NEXT: } @@ -1658,159 +1659,158 @@ define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; 
CHECK-NEXT: v1:0.uw = vunpack(v30.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vcl0(v2.uw) +; CHECK-NEXT: v4.uw = vcl0(v2.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vcl0(v0.uw) -; CHECK-NEXT: v5.w = vadd(v5.w,v4.w) +; CHECK-NEXT: v8.w = vadd(v4.w,v5.w) +; CHECK-NEXT: v4 = vxor(v4,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.uw = vcl0(v3.uw) -; CHECK-NEXT: v11.w = vadd(v7.w,v4.w) -; CHECK-NEXT: v7 = vxor(v7,v7) +; CHECK-NEXT: v9.uw = vcl0(v1.uw) +; CHECK-NEXT: v7.w = vadd(v7.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.uw = vcl0(v1.uw) -; CHECK-NEXT: v10.w = vadd(v8.w,v4.w) +; CHECK-NEXT: v29.uw = vcl0(v3.uw) +; CHECK-NEXT: v9.w = vadd(v9.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9 = vsplat(r5) -; CHECK-NEXT: v14.w = vasl(v0.w,v11.w) -; CHECK-NEXT: v8.w = vadd(v9.w,v4.w) +; CHECK-NEXT: v13.w = vasl(v0.w,v7.w) +; CHECK-NEXT: v10.w = vadd(v29.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.w = vasl(v2.w,v5.w) -; CHECK-NEXT: v24 = vand(v14,v15) -; CHECK-NEXT: v20.w = vadd(v14.w,v6.w) +; CHECK-NEXT: v11.w = vasl(v2.w,v8.w) +; CHECK-NEXT: v24 = vand(v13,v14) +; CHECK-NEXT: v20.w = vadd(v13.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.w = vasl(v3.w,v10.w) -; CHECK-NEXT: v19 = vand(v12,v15) -; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w) -; CHECK-NEXT: v18.w = vadd(v12.w,v6.w) +; CHECK-NEXT: v12.w = vasl(v3.w,v10.w) +; CHECK-NEXT: v19 = vand(v11,v14) +; CHECK-NEXT: q3 = vcmp.eq(v24.w,v4.w) +; CHECK-NEXT: v18.w = vadd(v11.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v16.w = vasl(v1.w,v8.w) -; CHECK-NEXT: v23 = vand(v13,v15) -; CHECK-NEXT: v22.w = vadd(v13.w,v6.w) -; CHECK-NEXT: q0 = vcmp.gt(v14.uw,v20.uw) +; CHECK-NEXT: v15.w = vasl(v1.w,v9.w) +; CHECK-NEXT: v23 = vand(v12,v14) +; CHECK-NEXT: v22.w = vadd(v12.w,v6.w) +; CHECK-NEXT: v30 = vmux(q3,v4,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vadd(v16.w,v6.w) -; CHECK-NEXT: v15 = vand(v16,v15) -; CHECK-NEXT: v30 = vmux(q3,v7,v4) -; CHECK-NEXT: q2 = vcmp.eq(v19.w,v7.w) +; CHECK-NEXT: v14 = vand(v15,v14) +; CHECK-NEXT: v6.w = vadd(v15.w,v6.w) +; CHECK-NEXT: q2 = vcmp.eq(v19.w,v4.w) +; CHECK-NEXT: q0 = vcmp.gt(v13.uw,v20.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v14.uw,r2) -; CHECK-NEXT: q3 = vcmp.eq(v15.w,v7.w) -; CHECK-NEXT: v28 = vmux(q0,v4,v7) -; CHECK-NEXT: q1 = vcmp.eq(v23.w,v7.w) +; CHECK-NEXT: v21.uw = vlsr(v13.uw,r2) +; CHECK-NEXT: q3 = vcmp.eq(v14.w,v4.w) +; CHECK-NEXT: q1 = vcmp.eq(v23.w,v4.w) +; CHECK-NEXT: v31 = vmux(q2,v4,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vlsr(v20.uw,r2) -; CHECK-NEXT: v26 = vmux(q3,v7,v4) -; CHECK-NEXT: v11.w = vsub(v28.w,v11.w) -; CHECK-NEXT: q3 = vcmp.gt(v13.uw,v22.uw) +; CHECK-NEXT: v14.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: v26 = vmux(q3,v4,v5) +; CHECK-NEXT: v28 = vmux(q0,v5,v4) +; CHECK-NEXT: q3 = vcmp.gt(v12.uw,v22.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v15.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v20.w = vadd(v14.w,v30.w) -; CHECK-NEXT: v30 = vmux(q1,v7,v4) -; CHECK-NEXT: v31 = vmux(q2,v7,v4) +; CHECK-NEXT: v13.uw = vlsr(v20.uw,r2) +; CHECK-NEXT: v29.w = vadd(v14.w,v26.w) +; CHECK-NEXT: v7.w = vsub(v28.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uw = vlsr(v18.uw,r2) -; CHECK-NEXT: v29.w = vadd(v15.w,v26.w) -; CHECK-NEXT: q1 = vcmp.gt(v12.uw,v18.uw) -; CHECK-NEXT: v11.w = vadd(v11.w,v9.w) +; CHECK-NEXT: v20.w = vadd(v13.w,v30.w) +; CHECK-NEXT: v30 = vmux(q1,v4,v5) +; CHECK-NEXT: q1 = vcmp.gt(v11.uw,v18.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; 
CHECK-NEXT: v28.uw = vlsr(v22.uw,r2) ; CHECK-NEXT: v23.w = vadd(v19.w,v31.w) -; CHECK-NEXT: v22 = vmux(q3,v4,v7) -; CHECK-NEXT: q3 = vcmp.gt(v16.uw,v6.uw) +; CHECK-NEXT: v22 = vmux(q3,v5,v4) +; CHECK-NEXT: q3 = vcmp.gt(v15.uw,v6.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v29.uw,r0) +; CHECK-NEXT: v29 = vmux(q1,v5,v4) ; CHECK-NEXT: v31.w = vadd(v28.w,v30.w) -; CHECK-NEXT: v30 = vmux(q1,v4,v7) -; CHECK-NEXT: v4 = vmux(q3,v4,v7) +; CHECK-NEXT: v5 = vmux(q3,v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v17.uw = vlsr(v12.uw,r2) -; CHECK-NEXT: v5.w = vsub(v30.w,v5.w) -; CHECK-NEXT: v29.w = vsub(v22.w,v10.w) -; CHECK-NEXT: v4.w = vsub(v4.w,v8.w) +; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0) +; CHECK-NEXT: v8.w = vsub(v29.w,v8.w) +; CHECK-NEXT: q0 = vcmp.eq(v21.w,v13.w) +; CHECK-NEXT: v23.w = vsub(v22.w,v10.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.uw = vlsr(v13.uw,r2) -; CHECK-NEXT: v6.w = vadd(v29.w,v9.w) -; CHECK-NEXT: v5.w = vadd(v5.w,v9.w) -; CHECK-NEXT: q0 = vcmp.eq(v21.w,v14.w) +; CHECK-NEXT: v17.uw = vlsr(v11.uw,r2) +; CHECK-NEXT: v6.w = vadd(v23.w,v16.w) +; CHECK-NEXT: v5.w = vsub(v5.w,v9.w) +; CHECK-NEXT: v8.w = vadd(v8.w,v16.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v25.uw = vlsr(v16.uw,r2) +; CHECK-NEXT: v12.uw = vlsr(v12.uw,r2) +; CHECK-NEXT: v7.w = vadd(v7.w,v16.w) ; CHECK-NEXT: q2 = vcmp.eq(v17.w,v19.w) -; CHECK-NEXT: q3 = vcmp.eq(v13.w,v28.w) -; CHECK-NEXT: v4.w = vadd(v4.w,v9.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v16.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0) -; CHECK-NEXT: q1 = vcmp.eq(v25.w,v15.w) +; CHECK-NEXT: v25.uw = vlsr(v15.uw,r2) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v28.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v19.uw,r0) +; CHECK-NEXT: v18.uw = vlsr(v19.uw,r0) +; CHECK-NEXT: q1 = vcmp.eq(v25.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v31.uw = vlsr(v31.uw,r0) -; CHECK-NEXT: v23 = vmux(q2,v21,v23) -; CHECK-NEXT: q2 = vcmp.eq(v3.w,v7.w) +; CHECK-NEXT: v30.uw = vlsr(v31.uw,r0) +; CHECK-NEXT: v23 = vmux(q2,v21,v18) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v31.uw = vlsr(v28.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.uw = vlsr(v15.uw,r0) -; CHECK-NEXT: v8 = vmux(q3,v31,v16) -; CHECK-NEXT: q3 = vcmp.eq(v2.w,v7.w) +; CHECK-NEXT: v26.uw = vlsr(v14.uw,r0) +; CHECK-NEXT: v9 = vmux(q3,v30,v31) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,r4) ; CHECK-NEXT: v22 = vmux(q1,v24,v26) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasl(v5.w,r4) -; CHECK-NEXT: v6 = vor(v8,v6) +; CHECK-NEXT: v8.w = vasl(v8.w,r4) +; CHECK-NEXT: v6 = vor(v9,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uw = vlsr(v14.uw,r0) -; CHECK-NEXT: v25 = vor(v23,v5) -; CHECK-NEXT: v26 = vmux(q2,v7,v6) +; CHECK-NEXT: v27.uw = vlsr(v13.uw,r0) +; CHECK-NEXT: v25 = vor(v23,v8) +; CHECK-NEXT: v26 = vmux(q2,v4,v6) ; CHECK-NEXT: vmem(r1+#1) = v26.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v20.uw = vlsr(v20.uw,r0) -; CHECK-NEXT: v28 = vmux(q3,v7,v25) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w) +; CHECK-NEXT: v28 = vmux(q3,v4,v25) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v4.w) ; CHECK-NEXT: vmem(r1+#0) = v28.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasl(v11.w,r4) +; CHECK-NEXT: v7.w = vasl(v7.w,r4) ; CHECK-NEXT: v20 = vmux(q0,v20,v27) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; 
CHECK-NEXT: v24.w = vasl(v4.w,r4) -; CHECK-NEXT: v29 = vor(v20,v11) +; CHECK-NEXT: v24.w = vasl(v5.w,r4) +; CHECK-NEXT: v29 = vor(v20,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vor(v22,v24) -; CHECK-NEXT: v31 = vmux(q3,v7,v29) +; CHECK-NEXT: v31 = vmux(q3,v4,v29) ; CHECK-NEXT: vmem(r1+#2) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v7,v27) +; CHECK-NEXT: v30 = vmux(q2,v4,v27) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#3) = v30.new ; CHECK-NEXT: } @@ -1826,110 +1826,112 @@ define void @u8f32_1(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r7 = #1 -; CHECK-NEXT: r6 = #512 -; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) -; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vsplat(r7) -; CHECK-NEXT: v8 = vsplat(r6) +; CHECK-NEXT: r7 = #1 ; CHECK-NEXT: r3:2 = combine(##255,#8) +; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r7) ; CHECK-NEXT: v6 = vsplat(r3) +; CHECK-NEXT: r6 = #512 +; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8 = vsplat(r6) ; CHECK-NEXT: r5 = #159 -; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) -; CHECK-NEXT: v21 = vxor(v21,v21) +; CHECK-NEXT: r4 = #23 +; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) -; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.uw = vcl0(v2.uw) +; CHECK-NEXT: v4.uw = vcl0(v0.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vcl0(v3.uw) -; CHECK-NEXT: v4.w = vadd(v4.w,v1.w) +; CHECK-NEXT: v5.uw = vcl0(v1.uw) +; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vadd(v5.w,v1.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v2.w,v4.w) +; CHECK-NEXT: v7.w = vasl(v0.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v3.w,v5.w) -; CHECK-NEXT: v11 = vand(v7,v8) +; CHECK-NEXT: v9.w = vasl(v1.w,v5.w) ; CHECK-NEXT: v10.w = vadd(v7.w,v6.w) +; CHECK-NEXT: v11 = vand(v7,v8) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) -; CHECK-NEXT: q1 = vcmp.eq(v11.w,v21.w) ; CHECK-NEXT: v8 = vand(v9,v8) -; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) +; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: v24 = vmux(q1,v21,v1) -; CHECK-NEXT: q3 = vcmp.eq(v8.w,v21.w) -; CHECK-NEXT: q1 = vcmp.gt(v9.uw,v6.uw) +; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2) +; CHECK-NEXT: q3 = vcmp.eq(v8.w,v3.w) +; CHECK-NEXT: q2 = vcmp.gt(v9.uw,v6.uw) +; CHECK-NEXT: v21 = vmux(q0,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v25 = vmux(q0,v1,v21) -; CHECK-NEXT: v27 = vmux(q3,v21,v1) -; CHECK-NEXT: v1 = vmux(q1,v1,v21) +; CHECK-NEXT: v20.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: v22 = vmux(q1,v2,v3) +; CHECK-NEXT: v24 = vmux(q3,v3,v2) +; CHECK-NEXT: v2 = vmux(q2,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) -; CHECK-NEXT: v1.w = vsub(v1.w,v5.w) -; CHECK-NEXT: v10.w = vadd(v22.w,v24.w) -; CHECK-NEXT: v28.w = vadd(v23.w,v27.w) +; CHECK-NEXT: v4.w = vsub(v22.w,v4.w) +; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) +; CHECK-NEXT: v10.w = vadd(v19.w,v21.w) +; CHECK-NEXT: v25.w = vadd(v20.w,v24.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: v4.w = vadd(v4.w,v13.w) -; 
CHECK-NEXT: v1.w = vadd(v1.w,v13.w) +; CHECK-NEXT: v2.w = vadd(v2.w,v13.w) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.uw = vlsr(v9.uw,r2) -; CHECK-NEXT: q2 = vcmp.eq(v12.w,v22.w) +; CHECK-NEXT: v23.uw = vlsr(v9.uw,r2) +; CHECK-NEXT: q1 = vcmp.eq(v12.w,v19.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.uw = vlsr(v22.uw,r7) -; CHECK-NEXT: q3 = vcmp.eq(v26.w,v23.w) +; CHECK-NEXT: v11.uw = vlsr(v19.uw,r7) +; CHECK-NEXT: q3 = vcmp.eq(v23.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30.uw = vlsr(v10.uw,r7) +; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v23.uw,r7) -; CHECK-NEXT: v5 = vmux(q2,v30,v11) -; CHECK-NEXT: q2 = vcmp.eq(v3.w,v21.w) +; CHECK-NEXT: v26.uw = vlsr(v20.uw,r7) +; CHECK-NEXT: v5 = vmux(q1,v27,v11) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.uw = vlsr(v28.uw,r7) +; CHECK-NEXT: v6.uw = vlsr(v25.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,r4) -; CHECK-NEXT: v6 = vmux(q3,v6,v29) -; CHECK-NEXT: q3 = vcmp.eq(v2.w,v21.w) +; CHECK-NEXT: v6 = vmux(q3,v6,v26) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.w = vasl(v1.w,r4) -; CHECK-NEXT: v31 = vor(v5,v4) +; CHECK-NEXT: v2.w = vasl(v2.w,r4) +; CHECK-NEXT: v29 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vor(v6,v1) -; CHECK-NEXT: v0 = vmux(q3,v21,v31) -; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: v28 = vor(v6,v2) +; CHECK-NEXT: v31 = vmux(q3,v3,v29) +; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q2,v21,v1) +; CHECK-NEXT: v30 = vmux(q2,v3,v28) ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: vmem(r1+#1) = v1.new +; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } %v0 = load <64 x i8>, ptr %a0, align 128 %v1 = uitofp <64 x i8> %v0 to <64 x float> @@ -1947,69 +1949,69 @@ define void @u8f32_2(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r6 = #1 -; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: r2 = #255 ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vsplat(r6) -; CHECK-NEXT: v4 = vsplat(r3) -; CHECK-NEXT: r2 = #255 -; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: v1 = vsplat(r6) +; CHECK-NEXT: v29 = vsplat(r2) +; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r3) ; CHECK-NEXT: r5:4 = combine(##159,#8) -; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) +; CHECK-NEXT: v5:4.uw = vunpack(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vsplat(r2) ; CHECK-NEXT: v7 = vsplat(r5) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: q3 = vcmp.eq(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vcl0(v0.uw) +; CHECK-NEXT: v5.uw = vcl0(v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vasl(v0.w,v5.w) +; CHECK-NEXT: v6.w = vasl(v4.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.w = vadd(v6.w,v1.w) -; CHECK-NEXT: v4 = vand(v6,v4) +; CHECK-NEXT: v0.w = vadd(v6.w,v29.w) +; CHECK-NEXT: v3 = vand(v6,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4) -; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v1.uw) -; CHECK-NEXT: q1 = vcmp.eq(v4.w,v3.w) +; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v0.uw) +; CHECK-NEXT: q1 = vcmp.eq(v3.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 -; CHECK-NEXT: v1.uw = vlsr(v1.uw,r4) -; CHECK-NEXT: v4 = 
vmux(q1,v3,v2) -; CHECK-NEXT: v2 = vmux(q0,v2,v3) +; CHECK-NEXT: v0.uw = vlsr(v0.uw,r4) +; CHECK-NEXT: v3 = vmux(q1,v2,v1) +; CHECK-NEXT: v1 = vmux(q0,v1,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) -; CHECK-NEXT: v4.w = vadd(v1.w,v4.w) -; CHECK-NEXT: q2 = vcmp.eq(v6.w,v1.w) +; CHECK-NEXT: v1.w = vsub(v1.w,v5.w) +; CHECK-NEXT: v3.w = vadd(v0.w,v3.w) +; CHECK-NEXT: q2 = vcmp.eq(v6.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v1.uw,r6) -; CHECK-NEXT: v2.w = vadd(v2.w,v7.w) +; CHECK-NEXT: v30.uw = vlsr(v0.uw,r6) +; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6) +; CHECK-NEXT: v31.uw = vlsr(v3.uw,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,r4) -; CHECK-NEXT: v1 = vmux(q2,v30,v29) +; CHECK-NEXT: v1.w = vasl(v1.w,r4) +; CHECK-NEXT: v0 = vmux(q2,v31,v30) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v31 = vor(v1,v2) +; CHECK-NEXT: v0 = vor(v0,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q3,v3,v31) +; CHECK-NEXT: v0 = vmux(q3,v2,v0) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } @@ -2126,13 +2128,13 @@ define void @u16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5) -; CHECK-NEXT: q1 = vcmp.eq(v5.h,v3.h) -; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh) +; CHECK-NEXT: q0 = vcmp.eq(v5.h,v3.h) +; CHECK-NEXT: q1 = vcmp.gt(v6.uh,v7.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uh = vlsr(v7.uh,r5) -; CHECK-NEXT: v5 = vmux(q1,v3,v2) -; CHECK-NEXT: v2 = vmux(q0,v2,v3) +; CHECK-NEXT: v5 = vmux(q0,v3,v2) +; CHECK-NEXT: v2 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vadd(v2.h,v4.h) @@ -2182,10 +2184,10 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vsplat(r7) +; CHECK-NEXT: v2 = vsplat(r7) ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r6 = #512 -; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8 = vsplat(r6) @@ -2196,10 +2198,10 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v14 = vsplat(r5) ; CHECK-NEXT: v5.uw = vcl0(v1.uw) -; CHECK-NEXT: v4.w = vadd(v4.w,v3.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vadd(v5.w,v3.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vasl(v0.w,v4.w) @@ -2212,31 +2214,31 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) ; CHECK-NEXT: v8 = vand(v9,v8) -; CHECK-NEXT: q1 = vcmp.eq(v11.w,v2.w) -; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) +; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: q2 = vcmp.eq(v8.w,v2.w) +; CHECK-NEXT: q2 = vcmp.eq(v8.w,v3.w) ; CHECK-NEXT: q3 = vcmp.gt(v9.uw,v6.uw) -; CHECK-NEXT: v20 = vmux(q1,v2,v3) +; CHECK-NEXT: v20 = vmux(q0,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v22 = vmux(q2,v2,v3) -; CHECK-NEXT: v25 = vmux(q0,v3,v2) -; CHECK-NEXT: v3 = vmux(q3,v3,v2) +; CHECK-NEXT: v22 = vmux(q2,v3,v2) +; CHECK-NEXT: v25 = vmux(q1,v2,v3) +; CHECK-NEXT: v2 = vmux(q3,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) -; CHECK-NEXT: v3.w = vsub(v3.w,v5.w) +; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) ; CHECK-NEXT: 
v23.w = vadd(v19.w,v20.w) ; CHECK-NEXT: v10.w = vadd(v21.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: v4.w = vadd(v4.w,v14.w) -; CHECK-NEXT: v3.w = vadd(v3.w,v14.w) +; CHECK-NEXT: v2.w = vadd(v2.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v9.uw,r2) @@ -2252,7 +2254,7 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7) ; CHECK-NEXT: v5 = vmux(q2,v26,v13) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v21.uw,r7) @@ -2260,19 +2262,19 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,r4) ; CHECK-NEXT: v6 = vmux(q3,v27,v28) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vasl(v3.w,r4) +; CHECK-NEXT: v2.w = vasl(v2.w,r4) ; CHECK-NEXT: v29 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vor(v6,v3) -; CHECK-NEXT: v31 = vmux(q3,v2,v29) +; CHECK-NEXT: v2 = vor(v6,v2) +; CHECK-NEXT: v31 = vmux(q3,v3,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v2,v3) +; CHECK-NEXT: v30 = vmux(q2,v3,v2) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -2369,20 +2371,20 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 -; CHECK-NEXT: v3.uw = vcl0(v0.uw) +; CHECK-NEXT: v1.uw = vcl0(v0.uw) ; CHECK-NEXT: v0.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vsplat(r2) +; CHECK-NEXT: v4 = vsplat(r2) ; CHECK-NEXT: r4 = #512 -; CHECK-NEXT: v4.uw = vcl0(v1.uw) -; CHECK-NEXT: v1.cur = vmem(r0+#0) +; CHECK-NEXT: v3.uw = vcl0(v2.uw) +; CHECK-NEXT: v2.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: v6 = vsplat(r6) -; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) -; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v4.w) +; CHECK-NEXT: v1.w = vadd(v1.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 @@ -2390,57 +2392,57 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r4) -; CHECK-NEXT: v5.w = vasl(v1.w,v4.w) +; CHECK-NEXT: v5.w = vasl(v2.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v0.w,v3.w) +; CHECK-NEXT: v8.w = vasl(v0.w,v1.w) ; CHECK-NEXT: v11.w = vadd(v5.w,v6.w) ; CHECK-NEXT: v13 = vand(v5,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v8.w,v6.w) ; CHECK-NEXT: v7 = vand(v8,v7) -; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v11.uw) -; CHECK-NEXT: q2 = vcmp.eq(v13.w,v9.w) +; CHECK-NEXT: q0 = vcmp.gt(v5.uw,v11.uw) +; CHECK-NEXT: q1 = vcmp.eq(v13.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3) +; CHECK-NEXT: v28.uw = vlsr(v11.uw,r3) ; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw) -; CHECK-NEXT: q0 = vcmp.eq(v7.w,v9.w) -; CHECK-NEXT: v28 = vmux(q2,v9,v2) +; CHECK-NEXT: q2 = vcmp.eq(v7.w,v9.w) +; CHECK-NEXT: v30 = vmux(q0,v4,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3) -; CHECK-NEXT: v29 = vmux(q1,v2,v9) -; CHECK-NEXT: v30 = vmux(q3,v2,v9) -; CHECK-NEXT: v2 = vmux(q0,v9,v2) +; CHECK-NEXT: v29 = vmux(q1,v9,v4) +; CHECK-NEXT: v31 = vmux(q3,v4,v9) +; CHECK-NEXT: v4 = vmux(q2,v9,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vsub(v29.w,v4.w) -; CHECK-NEXT: v7.w = vadd(v27.w,v28.w) ; CHECK-NEXT: 
v3.w = vsub(v30.w,v3.w) -; CHECK-NEXT: v2.w = vadd(v6.w,v2.w) +; CHECK-NEXT: v7.w = vadd(v28.w,v29.w) +; CHECK-NEXT: v1.w = vsub(v31.w,v1.w) +; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3) -; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v10.w) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v9.w) +; CHECK-NEXT: v1.w = vadd(v1.w,v10.w) +; CHECK-NEXT: q2 = vcmp.eq(v2.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3) -; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v28.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vlsr(v27.uw,r2) +; CHECK-NEXT: v5.uw = vlsr(v28.uw,r2) ; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2) +; CHECK-NEXT: v4.uw = vlsr(v4.uw,r2) ; CHECK-NEXT: v5 = vmux(q3,v7,v5) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v9.w) ; CHECK-NEXT: } @@ -2448,16 +2450,16 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasl(v4.w,r3) -; CHECK-NEXT: v31 = vmux(q1,v2,v6) +; CHECK-NEXT: v3.w = vasl(v3.w,r3) +; CHECK-NEXT: v2 = vmux(q1,v4,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v3.w,r3) -; CHECK-NEXT: v4 = vor(v5,v4) +; CHECK-NEXT: v1.w = vasl(v1.w,r3) +; CHECK-NEXT: v3 = vor(v5,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vor(v31,v2) -; CHECK-NEXT: v3 = vmux(q2,v9,v4) +; CHECK-NEXT: v1 = vor(v2,v1) +; CHECK-NEXT: v3 = vmux(q2,v9,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v9,v1) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll index 6fa0585843f46c..2384ca4f95ec43 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll @@ -14,15 +14,15 @@ define void @fred(<16 x i32> %a0, <16 x i32> %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r1:0 = combine(#-1,#32) ; CHECK-NEXT: v2 = vxor(v2,v2) -; CHECK-NEXT: q1 = vcmp.eq(v0.w,v1.w) +; CHECK-NEXT: q0 = vcmp.eq(v0.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7 = ##g0 -; CHECK-NEXT: q0 = vsetq(r0) -; CHECK-NEXT: v0 = vmux(q1,v0,v2) +; CHECK-NEXT: q1 = vsetq(r0) +; CHECK-NEXT: v0 = vmux(q0,v0,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vand(q0,r1) +; CHECK-NEXT: v30 = vand(q1,r1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.h = vpacke(v0.w,v0.w) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll index c18672ba0a833e..5b5de3c3c988fe 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll @@ -321,58 +321,58 @@ define <64 x i32> @f10(<32 x i32> %a0, <32 x i32> %a1) #0 { ; V60-NEXT: r0 = ##33686018 ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v3:2 = vcombine(v0,v1) +; V60-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v1:0.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: r2 = #16 ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: r2 = #16 +; V60-NEXT: v4 = vxor(v4,v4) ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: v5 = vsplat(r0) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v4 = vxor(v4,v4) +; V60-NEXT: q0 = vcmp.gt(v4.w,v0.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v6.uw = vlsr(v0.uw,r2) +; V60-NEXT: q1 = vcmp.gt(v4.w,v1.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: q1 = vcmp.gt(v4.w,v3.w) +; 
V60-NEXT: v6.uw = vlsr(v2.uw,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: q0 = vcmp.gt(v4.w,v2.w) +; V60-NEXT: v30 = vmux(q0,v1,v4) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v5 = vdelta(v2,v5) +; V60-NEXT: v5 = vdelta(v1,v5) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v2 = vmux(q1,v2,v4) +; V60-NEXT: if (q1) v30.w += v0.w ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: if (q0) v2.w += v3.w +; V60-NEXT: v9:8.uw = vmpy(v0.uh,v5.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v9:8.uw = vmpy(v3.uh,v5.uh) +; V60-NEXT: v9:8.w = vadd(v9.uh,v8.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v9:8.w = vadd(v9.uh,v8.uh) +; V60-NEXT: v29.w = vadd(v8.w,v6.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v31.w = vadd(v8.w,v6.w) +; V60-NEXT: v2.w += vasl(v8.w,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v0.w += vasl(v8.w,r2) +; V60-NEXT: v9.w += vasr(v29.w,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v9.w += vasr(v31.w,r2) +; V60-NEXT: v31.w = vadd(v3.w,v9.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v1.w = vadd(v1.w,v9.w) +; V60-NEXT: v3.w = vsub(v31.w,v30.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v1.w = vsub(v1.w,v2.w) +; V60-NEXT: v1:0 = vcombine(v3,v2) ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: jumpr r31 diff --git a/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll b/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll index 150d7e271818a4..339a7627f71256 100644 --- a/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll +++ b/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll @@ -1,4 +1,5 @@ ; RUN: llc -O3 -march=hexagon < %s | FileCheck %s +; XFAIL: * ; CHECK: v{{[0-9]+}}.cur = vmem(r{{[0-9]+}}+#0) target triple = "hexagon" diff --git a/llvm/test/CodeGen/Hexagon/signext-inreg.ll b/llvm/test/CodeGen/Hexagon/signext-inreg.ll index fe74fa0f9a0ee4..3ebf850ef91175 100644 --- a/llvm/test/CodeGen/Hexagon/signext-inreg.ll +++ b/llvm/test/CodeGen/Hexagon/signext-inreg.ll @@ -47,48 +47,48 @@ define <16 x i32> @test2(<16 x i32> %m) { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r5 = extract(r5,#8,#0) -; CHECK-NEXT: r13:12 = memd(r29+#48) +; CHECK-NEXT: r7:6 = memd(r29+#48) ; CHECK-NEXT: memd(r29+#0) = r17:16 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13 = extract(r13,#8,#0) -; CHECK-NEXT: r12 = sxtb(r12) -; CHECK-NEXT: r15:14 = memd(r29+#40) -; CHECK-NEXT: r9:8 = memd(r29+#32) +; CHECK-NEXT: r7 = extract(r7,#8,#0) +; CHECK-NEXT: r6 = sxtb(r6) +; CHECK-NEXT: r9:8 = memd(r29+#40) +; CHECK-NEXT: r13:12 = memd(r29+#32) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r9 = extract(r9,#8,#0) -; CHECK-NEXT: r8 = sxtb(r8) -; CHECK-NEXT: r11:10 = memd(r29+#24) -; CHECK-NEXT: r7:6 = memd(r29+#16) +; CHECK-NEXT: r13 = extract(r13,#8,#0) +; CHECK-NEXT: r12 = sxtb(r12) +; CHECK-NEXT: r15:14 = memd(r29+#24) +; CHECK-NEXT: r17:16 = memd(r29+#8) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r11 = extract(r11,#8,#0) -; CHECK-NEXT: r10 = sxtb(r10) +; CHECK-NEXT: r15 = extract(r15,#8,#0) ; CHECK-NEXT: r14 = sxtb(r14) -; CHECK-NEXT: r17:16 = memd(r29+#8) +; CHECK-NEXT: r11:10 = memd(r29+#16) +; CHECK-NEXT: memd(r0+#56) = r7:6 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r15 = extract(r15,#8,#0) +; CHECK-NEXT: r9 = extract(r9,#8,#0) ; CHECK-NEXT: r17 = extract(r17,#8,#0) +; CHECK-NEXT: r8 = sxtb(r8) ; CHECK-NEXT: r16 = sxtb(r16) -; CHECK-NEXT: r6 = sxtb(r6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r7 = extract(r7,#8,#0) -; CHECK-NEXT: memd(r0+#56) = r13:12 -; CHECK-NEXT: memd(r0+#48) = r15:14 +; CHECK-NEXT: r7 = extract(r11,#8,#0) +; CHECK-NEXT: r6 = sxtb(r10) +; CHECK-NEXT: memd(r0+#48) = r9:8 +; CHECK-NEXT: memd(r0+#40) = r13:12 ; CHECK-NEXT: } ; 
CHECK-NEXT: { -; CHECK-NEXT: memd(r0+#40) = r9:8 -; CHECK-NEXT: memd(r0+#32) = r11:10 +; CHECK-NEXT: memd(r0+#32) = r15:14 +; CHECK-NEXT: memd(r0+#24) = r7:6 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memd(r0+#24) = r7:6 ; CHECK-NEXT: memd(r0+#16) = r17:16 +; CHECK-NEXT: memd(r0+#8) = r5:4 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memd(r0+#8) = r5:4 ; CHECK-NEXT: memd(r0+#0) = r3:2 ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -137,98 +137,98 @@ define <64 x i16> @test3(<64 x i16> %m) { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = vaslh(r3:2,#8) ; CHECK-NEXT: r5:4 = vaslh(r5:4,#8) +; CHECK-NEXT: r7:6 = memd(r29+#104) ; CHECK-NEXT: r9:8 = memd(r29+#96) -; CHECK-NEXT: r11:10 = memd(r29+#88) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) -; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) -; CHECK-NEXT: r13:12 = memd(r29+#80) -; CHECK-NEXT: r7:6 = memd(r29+#104) +; CHECK-NEXT: r7:6 = vaslh(r7:6,#8) +; CHECK-NEXT: r13:12 = memd(r29+#88) +; CHECK-NEXT: r15:14 = memd(r29+#80) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r15:14 = vaslh(r7:6,#8) ; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) -; CHECK-NEXT: r7:6 = memd(r29+#72) +; CHECK-NEXT: r15:14 = vaslh(r15:14,#8) +; CHECK-NEXT: r11:10 = memd(r29+#72) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r15:14 = vasrh(r15:14,#8) +; CHECK-NEXT: r7:6 = vasrh(r7:6,#8) ; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) ; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) -; CHECK-NEXT: r15:14 = memd(r29+#64) -; CHECK-NEXT: memd(r0+#120) = r15:14 +; CHECK-NEXT: r15:14 = vasrh(r15:14,#8) +; CHECK-NEXT: r7:6 = memd(r29+#64) +; CHECK-NEXT: memd(r0+#120) = r7:6 ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) ; CHECK-NEXT: r7:6 = vaslh(r7:6,#8) -; CHECK-NEXT: r15:14 = vaslh(r15:14,#8) ; CHECK-NEXT: r9:8 = memd(r29+#56) ; CHECK-NEXT: memd(r0+#112) = r9:8 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) -; CHECK-NEXT: r7:6 = vasrh(r7:6,#8) -; CHECK-NEXT: r11:10 = memd(r29+#48) -; CHECK-NEXT: memd(r0+#104) = r11:10 +; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) +; CHECK-NEXT: r13:12 = memd(r29+#48) +; CHECK-NEXT: memd(r0+#104) = r13:12 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) -; CHECK-NEXT: r15:14 = vasrh(r15:14,#8) -; CHECK-NEXT: r13:12 = memd(r29+#40) -; CHECK-NEXT: memd(r0+#96) = r13:12 +; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r7:6 = vasrh(r7:6,#8) +; CHECK-NEXT: r15:14 = memd(r29+#40) +; CHECK-NEXT: memd(r0+#96) = r15:14 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r15:14 = vaslh(r15:14,#8) ; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) -; CHECK-NEXT: r7:6 = memd(r29+#32) -; CHECK-NEXT: memd(r0+#88) = r7:6 +; CHECK-NEXT: r11:10 = memd(r29+#32) +; CHECK-NEXT: memd(r0+#88) = r11:10 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) ; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) -; CHECK-NEXT: r15:14 = memd(r29+#0) -; CHECK-NEXT: memd(r0+#80) = r15:14 +; CHECK-NEXT: r15:14 = vasrh(r15:14,#8) +; CHECK-NEXT: r7:6 = memd(r29+#0) +; CHECK-NEXT: memd(r0+#80) = r7:6 ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) ; CHECK-NEXT: r7:6 = vaslh(r7:6,#8) -; CHECK-NEXT: r15:14 = vaslh(r15:14,#8) ; CHECK-NEXT: r9:8 = memd(r29+#16) ; CHECK-NEXT: memd(r0+#72) = r9:8 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) -; CHECK-NEXT: r7:6 = vasrh(r7:6,#8) -; CHECK-NEXT: r11:10 = memd(r29+#24) -; CHECK-NEXT: memd(r0+#64) = r11:10 +; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) 
+; CHECK-NEXT: r13:12 = memd(r29+#24) +; CHECK-NEXT: memd(r0+#64) = r13:12 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) +; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) ; CHECK-NEXT: r3:2 = vasrh(r3:2,#8) -; CHECK-NEXT: r13:12 = memd(r29+#8) -; CHECK-NEXT: memd(r0+#56) = r13:12 +; CHECK-NEXT: r15:14 = memd(r29+#8) +; CHECK-NEXT: memd(r0+#56) = r15:14 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r15:14 = vaslh(r15:14,#8) ; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) -; CHECK-NEXT: memd(r0+#48) = r7:6 +; CHECK-NEXT: memd(r0+#48) = r11:10 ; CHECK-NEXT: memd(r0+#0) = r3:2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) -; CHECK-NEXT: r7:6 = vasrh(r15:14,#8) +; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) +; CHECK-NEXT: r7:6 = vasrh(r7:6,#8) ; CHECK-NEXT: memd(r0+#32) = r9:8 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) +; CHECK-NEXT: r11:10 = vasrh(r15:14,#8) ; CHECK-NEXT: r5:4 = vasrh(r5:4,#8) -; CHECK-NEXT: memd(r0+#40) = r11:10 +; CHECK-NEXT: memd(r0+#40) = r13:12 ; CHECK-NEXT: memd(r0+#16) = r7:6 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: memd(r0+#24) = r13:12 +; CHECK-NEXT: memd(r0+#24) = r11:10 ; CHECK-NEXT: memd(r0+#8) = r5:4 ; CHECK-NEXT: } ; diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll index 1562f1872ceb7b..91b9ff36d29abc 100644 --- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll +++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; This version of the conv3x3 test has both loops. This test checks that the -; inner loop has 14 packets. +; inner loop has 13 packets. ; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: @@ -17,7 +17,6 @@ ; CHECK: } ; CHECK: } ; CHECK: } -; CHECK: } ; CHECK-NOT: } ; CHECK: }{{[ \t]*}}:endloop0 diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll index 8d7958e4747d02..f7852cbcfeb54d 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=hexagon -O2 -enable-pipeliner -disable-block-placement=0 < %s | FileCheck %s +; XFAIL: * ; For the Phis generated in the epilog, test that we generate the correct ; names for the values coming from the prolog stages. 
The test belows diff --git a/llvm/test/CodeGen/MSP430/selectcc.ll b/llvm/test/CodeGen/MSP430/selectcc.ll index 28b90f0131703e..4426a248e070ba 100644 --- a/llvm/test/CodeGen/MSP430/selectcc.ll +++ b/llvm/test/CodeGen/MSP430/selectcc.ll @@ -3,7 +3,8 @@ define i16 @select_to_shifts_i16(i16 %a, i16 %b) { ; CHECK-LABEL: select_to_shifts_i16: -; CHECK: ; %bb.0: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: mov r12, r14 ; CHECK-NEXT: clr r12 ; CHECK-NEXT: bit #2, r14 @@ -20,24 +21,26 @@ define i16 @select_to_shifts_i16(i16 %a, i16 %b) { define i32 @select_to_shifts_i32(i32 %a, i32 %b) { ; CHECK-LABEL: select_to_shifts_i32: -; CHECK: ; %bb.0: -; CHECK-NEXT: mov r12, r11 -; CHECK-NEXT: and #2, r11 +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: and #2, r12 ; CHECK-NEXT: clr r13 -; CHECK-NEXT: tst r11 -; CHECK-NEXT: clr r12 +; CHECK-NEXT: tst r12 +; CHECK-NEXT: clr r11 ; CHECK-NEXT: jne .LBB1_3 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: tst r11 +; CHECK-NEXT: tst r12 ; CHECK-NEXT: jne .LBB1_4 ; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: mov r11, r12 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_3: -; CHECK-NEXT: mov r14, r12 -; CHECK-NEXT: tst r11 +; CHECK-NEXT: mov r14, r11 +; CHECK-NEXT: tst r12 ; CHECK-NEXT: jeq .LBB1_2 ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: mov r15, r13 +; CHECK-NEXT: mov r11, r12 ; CHECK-NEXT: ret %and = and i32 %a, 2 %tobool = icmp eq i32 %and, 0 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll index 47d18b9b5c533e..1cd4121883b5d2 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll @@ -708,21 +708,21 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: andi16 $3, $3, 31 ; MMR3-NEXT: srlv $16, $5, $6 ; MMR3-NEXT: sllv $4, $4, $3 -; MMR3-NEXT: srlv $17, $7, $6 -; MMR3-NEXT: lwl $7, 0($2) -; MMR3-NEXT: lwr $7, 3($2) -; MMR3-NEXT: sll16 $3, $7, 1 +; MMR3-NEXT: srlv $7, $7, $6 +; MMR3-NEXT: lwl $17, 0($2) +; MMR3-NEXT: lwr $17, 3($2) +; MMR3-NEXT: sll16 $3, $17, 1 ; MMR3-NEXT: xori $1, $6, 31 ; MMR3-NEXT: sllv $3, $3, $1 -; MMR3-NEXT: or16 $3, $17 +; MMR3-NEXT: or16 $3, $7 ; MMR3-NEXT: or16 $4, $16 -; MMR3-NEXT: lwl $8, 12($2) -; MMR3-NEXT: lwr $8, 15($2) -; MMR3-NEXT: srlv $2, $8, $6 +; MMR3-NEXT: lwl $7, 12($2) +; MMR3-NEXT: lwr $7, 15($2) +; MMR3-NEXT: srlv $2, $7, $6 ; MMR3-NEXT: sll16 $5, $5, 1 ; MMR3-NEXT: sllv $5, $5, $1 ; MMR3-NEXT: or16 $5, $2 -; MMR3-NEXT: srav $2, $7, $6 +; MMR3-NEXT: srav $2, $17, $6 ; MMR3-NEXT: lwp $16, 32($sp) ; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll index c4e05117d28e15..4395647e07fee4 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll @@ -713,21 +713,21 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: andi16 $3, $3, 31 ; MMR3-NEXT: srlv $16, $5, $6 ; MMR3-NEXT: sllv $4, $4, $3 -; MMR3-NEXT: srlv $17, $7, $6 -; MMR3-NEXT: lwl $7, 0($2) -; MMR3-NEXT: lwr $7, 3($2) -; MMR3-NEXT: sll16 $3, $7, 1 +; MMR3-NEXT: srlv $7, $7, $6 +; MMR3-NEXT: lwl $17, 0($2) +; MMR3-NEXT: lwr $17, 3($2) +; MMR3-NEXT: sll16 $3, $17, 1 ; MMR3-NEXT: xori $1, $6, 31 ; MMR3-NEXT: sllv $3, $3, $1 -; MMR3-NEXT: or16 $3, $17 +; MMR3-NEXT: or16 $3, $7 ; MMR3-NEXT: or16 $4, $16 -; MMR3-NEXT: lwl $8, 12($2) -; MMR3-NEXT: lwr $8, 15($2) -; MMR3-NEXT: srlv $2, $8, $6 +; MMR3-NEXT: lwl $7, 12($2) +; MMR3-NEXT: lwr $7, 15($2) +; MMR3-NEXT: srlv $2, $7, $6 ; MMR3-NEXT: sll16 $5, $5, 1 ; 
MMR3-NEXT: sllv $5, $5, $1 ; MMR3-NEXT: or16 $5, $2 -; MMR3-NEXT: srlv $2, $7, $6 +; MMR3-NEXT: srlv $2, $17, $6 ; MMR3-NEXT: lwp $16, 32($sp) ; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra diff --git a/llvm/test/CodeGen/PowerPC/all-atomics.ll b/llvm/test/CodeGen/PowerPC/all-atomics.ll index 093253bf8f6915..dad8a489d38426 100644 --- a/llvm/test/CodeGen/PowerPC/all-atomics.ll +++ b/llvm/test/CodeGen/PowerPC/all-atomics.ll @@ -3212,7 +3212,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: stw 29, 164(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 30, 168(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 31, 172(1) # 4-byte Folded Spill -; AIX32-NEXT: li 17, -1 +; AIX32-NEXT: li 16, -1 ; AIX32-NEXT: sync ; AIX32-NEXT: rlwinm 22, 27, 0, 0, 29 ; AIX32-NEXT: slw 4, 3, 24 @@ -3256,7 +3256,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: lwsync ; AIX32-NEXT: clrlwi 4, 4, 24 ; AIX32-NEXT: ori 5, 5, 65535 -; AIX32-NEXT: rlwinm 16, 23, 0, 0, 29 +; AIX32-NEXT: rlwinm 17, 23, 0, 0, 29 ; AIX32-NEXT: add 3, 4, 3 ; AIX32-NEXT: rlwinm 4, 23, 3, 27, 27 ; AIX32-NEXT: xori 18, 4, 16 @@ -3267,12 +3267,12 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: slw 5, 5, 18 ; AIX32-NEXT: L..BB2_5: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 6, 0, 16 +; AIX32-NEXT: lwarx 6, 0, 17 ; AIX32-NEXT: add 7, 4, 6 ; AIX32-NEXT: andc 8, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: or 7, 7, 8 -; AIX32-NEXT: stwcx. 7, 0, 16 +; AIX32-NEXT: stwcx. 7, 0, 17 ; AIX32-NEXT: bne 0, L..BB2_5 ; AIX32-NEXT: # %bb.6: # %entry ; AIX32-NEXT: srw 4, 6, 18 @@ -3402,12 +3402,12 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: slw 4, 3, 18 ; AIX32-NEXT: L..BB2_17: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 6, 0, 16 +; AIX32-NEXT: lwarx 6, 0, 17 ; AIX32-NEXT: sub 7, 6, 4 ; AIX32-NEXT: andc 8, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: or 7, 7, 8 -; AIX32-NEXT: stwcx. 7, 0, 16 +; AIX32-NEXT: stwcx. 7, 0, 17 ; AIX32-NEXT: bne 0, L..BB2_17 ; AIX32-NEXT: # %bb.18: # %entry ; AIX32-NEXT: srw 4, 6, 18 @@ -3528,12 +3528,12 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: slw 4, 3, 18 ; AIX32-NEXT: L..BB2_29: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 6, 0, 16 +; AIX32-NEXT: lwarx 6, 0, 17 ; AIX32-NEXT: or 7, 4, 6 ; AIX32-NEXT: andc 8, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: or 7, 7, 8 -; AIX32-NEXT: stwcx. 7, 0, 16 +; AIX32-NEXT: stwcx. 7, 0, 17 ; AIX32-NEXT: bne 0, L..BB2_29 ; AIX32-NEXT: # %bb.30: # %entry ; AIX32-NEXT: srw 4, 6, 18 @@ -3652,12 +3652,12 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: slw 4, 3, 18 ; AIX32-NEXT: L..BB2_41: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 6, 0, 16 +; AIX32-NEXT: lwarx 6, 0, 17 ; AIX32-NEXT: xor 7, 4, 6 ; AIX32-NEXT: andc 8, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: or 7, 7, 8 -; AIX32-NEXT: stwcx. 7, 0, 16 +; AIX32-NEXT: stwcx. 7, 0, 17 ; AIX32-NEXT: bne 0, L..BB2_41 ; AIX32-NEXT: # %bb.42: # %entry ; AIX32-NEXT: srw 4, 6, 18 @@ -3776,12 +3776,12 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: slw 4, 3, 18 ; AIX32-NEXT: L..BB2_53: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 6, 0, 16 +; AIX32-NEXT: lwarx 6, 0, 17 ; AIX32-NEXT: nand 7, 4, 6 ; AIX32-NEXT: andc 8, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: or 7, 7, 8 -; AIX32-NEXT: stwcx. 7, 0, 16 +; AIX32-NEXT: stwcx. 
7, 0, 17 ; AIX32-NEXT: bne 0, L..BB2_53 ; AIX32-NEXT: # %bb.54: # %entry ; AIX32-NEXT: srw 4, 6, 18 @@ -3807,6 +3807,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: # %bb.56: # %entry ; AIX32-NEXT: srw 4, 6, 15 ; AIX32-NEXT: lwsync +; AIX32-NEXT: stw 20, 56(1) # 4-byte Folded Spill ; AIX32-NEXT: clrlwi 4, 4, 16 ; AIX32-NEXT: nand 3, 4, 3 ; AIX32-NEXT: sth 3, 0(20) @@ -3819,11 +3820,11 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: stwcx. 4, 0, 13 ; AIX32-NEXT: bne 0, L..BB2_57 ; AIX32-NEXT: # %bb.58: # %entry -; AIX32-NEXT: stw 23, 56(1) # 4-byte Folded Spill -; AIX32-NEXT: stw 27, 60(1) # 4-byte Folded Spill +; AIX32-NEXT: stw 23, 60(1) # 4-byte Folded Spill +; AIX32-NEXT: mr 20, 27 ; AIX32-NEXT: lwsync -; AIX32-NEXT: stw 4, 0(13) ; AIX32-NEXT: lbz 3, 0(26) +; AIX32-NEXT: stw 4, 0(13) ; AIX32-NEXT: sync ; AIX32-NEXT: L..BB2_59: # %entry ; AIX32-NEXT: # @@ -3848,7 +3849,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: li 6, 5 ; AIX32-NEXT: stw 3, 4(31) ; AIX32-NEXT: mr 3, 30 -; AIX32-NEXT: stw 17, 0(31) +; AIX32-NEXT: stw 16, 0(31) ; AIX32-NEXT: bl .__atomic_fetch_nand_8[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: nand 3, 4, 29 @@ -3856,7 +3857,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: lbz 23, 0(26) ; AIX32-NEXT: addi 28, 1, 80 ; AIX32-NEXT: addi 27, 1, 64 -; AIX32-NEXT: stw 17, 0(30) +; AIX32-NEXT: stw 16, 0(30) ; AIX32-NEXT: lwz 4, 12(29) ; AIX32-NEXT: lwz 5, 8(29) ; AIX32-NEXT: lwz 6, 4(29) @@ -3869,7 +3870,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: stw 7, 80(1) ; AIX32-NEXT: li 7, 5 ; AIX32-NEXT: li 8, 5 -; AIX32-NEXT: xor 3, 3, 17 +; AIX32-NEXT: xor 3, 3, 16 ; AIX32-NEXT: stw 6, 84(1) ; AIX32-NEXT: stw 5, 88(1) ; AIX32-NEXT: stw 4, 92(1) @@ -3878,9 +3879,9 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: mr 6, 27 ; AIX32-NEXT: stw 3, 76(1) ; AIX32-NEXT: li 3, 16 -; AIX32-NEXT: stw 17, 72(1) -; AIX32-NEXT: stw 17, 68(1) -; AIX32-NEXT: stw 17, 64(1) +; AIX32-NEXT: stw 16, 72(1) +; AIX32-NEXT: stw 16, 68(1) +; AIX32-NEXT: stw 16, 64(1) ; AIX32-NEXT: bl .__atomic_compare_exchange[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: lwz 4, 92(1) @@ -3891,11 +3892,11 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: beq 0, L..BB2_61 ; AIX32-NEXT: # %bb.62: # %atomicrmw.end ; AIX32-NEXT: and 3, 4, 23 -; AIX32-NEXT: stw 17, 0(29) +; AIX32-NEXT: stw 16, 0(29) ; AIX32-NEXT: lbz 23, 0(26) -; AIX32-NEXT: stw 17, 4(29) -; AIX32-NEXT: stw 17, 8(29) -; AIX32-NEXT: xor 3, 3, 17 +; AIX32-NEXT: stw 16, 4(29) +; AIX32-NEXT: stw 16, 8(29) +; AIX32-NEXT: xor 3, 3, 16 ; AIX32-NEXT: addi 28, 1, 80 ; AIX32-NEXT: addi 27, 1, 64 ; AIX32-NEXT: stw 3, 12(29) @@ -3911,7 +3912,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: stw 7, 80(1) ; AIX32-NEXT: li 7, 5 ; AIX32-NEXT: li 8, 5 -; AIX32-NEXT: xor 3, 3, 17 +; AIX32-NEXT: xor 3, 3, 16 ; AIX32-NEXT: stw 6, 84(1) ; AIX32-NEXT: stw 5, 88(1) ; AIX32-NEXT: stw 4, 92(1) @@ -3920,9 +3921,9 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: mr 6, 27 ; AIX32-NEXT: stw 3, 76(1) ; AIX32-NEXT: li 3, 16 -; AIX32-NEXT: stw 17, 72(1) -; AIX32-NEXT: stw 17, 68(1) -; AIX32-NEXT: stw 17, 64(1) +; AIX32-NEXT: stw 16, 72(1) +; AIX32-NEXT: stw 16, 68(1) +; AIX32-NEXT: stw 16, 64(1) ; AIX32-NEXT: bl .__atomic_compare_exchange[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: lwz 4, 92(1) @@ 
-3934,10 +3935,10 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: # %bb.64: # %atomicrmw.end1 ; AIX32-NEXT: and 3, 4, 23 ; AIX32-NEXT: li 5, 255 -; AIX32-NEXT: xor 3, 3, 17 -; AIX32-NEXT: stw 17, 0(29) -; AIX32-NEXT: stw 17, 4(29) -; AIX32-NEXT: stw 17, 8(29) +; AIX32-NEXT: xor 3, 3, 16 +; AIX32-NEXT: stw 16, 0(29) +; AIX32-NEXT: stw 16, 4(29) +; AIX32-NEXT: stw 16, 8(29) ; AIX32-NEXT: slw 5, 5, 24 ; AIX32-NEXT: stw 3, 12(29) ; AIX32-NEXT: lbz 3, 0(26) @@ -3959,11 +3960,10 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: clrlwi 4, 4, 24 ; AIX32-NEXT: slw 5, 5, 21 ; AIX32-NEXT: and 3, 4, 3 -; AIX32-NEXT: lwz 4, 60(1) # 4-byte Folded Reload -; AIX32-NEXT: stb 3, 0(4) +; AIX32-NEXT: stb 3, 0(20) ; AIX32-NEXT: lbz 3, 0(26) ; AIX32-NEXT: sync -; AIX32-NEXT: lwz 9, 56(1) # 4-byte Folded Reload +; AIX32-NEXT: lwz 9, 60(1) # 4-byte Folded Reload ; AIX32-NEXT: slw 4, 3, 21 ; AIX32-NEXT: L..BB2_67: # %atomicrmw.end1 ; AIX32-NEXT: # @@ -3985,14 +3985,15 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: stb 3, 0(26) ; AIX32-NEXT: sync ; AIX32-NEXT: slw 4, 3, 18 +; AIX32-NEXT: lwz 10, 56(1) # 4-byte Folded Reload ; AIX32-NEXT: L..BB2_69: # %atomicrmw.end1 ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 6, 0, 16 +; AIX32-NEXT: lwarx 6, 0, 17 ; AIX32-NEXT: and 7, 4, 6 ; AIX32-NEXT: andc 8, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: or 7, 7, 8 -; AIX32-NEXT: stwcx. 7, 0, 16 +; AIX32-NEXT: stwcx. 7, 0, 17 ; AIX32-NEXT: bne 0, L..BB2_69 ; AIX32-NEXT: # %bb.70: # %atomicrmw.end1 ; AIX32-NEXT: srw 4, 6, 18 @@ -4020,7 +4021,7 @@ define dso_local void @test_op_and_fetch() local_unnamed_addr #0 { ; AIX32-NEXT: lwsync ; AIX32-NEXT: clrlwi 4, 4, 16 ; AIX32-NEXT: and 3, 4, 3 -; AIX32-NEXT: sth 3, 0(20) +; AIX32-NEXT: sth 3, 0(10) ; AIX32-NEXT: lbz 3, 0(26) ; AIX32-NEXT: sync ; AIX32-NEXT: L..BB2_73: # %atomicrmw.end1 @@ -4647,32 +4648,32 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwz 29, L..C1(2) # @uc ; AIX32-NEXT: lbz 3, 0(29) ; AIX32-NEXT: rlwinm 5, 28, 3, 27, 28 -; AIX32-NEXT: stw 21, 84(1) # 4-byte Folded Spill +; AIX32-NEXT: stw 24, 96(1) # 4-byte Folded Spill ; AIX32-NEXT: lbz 4, 0(28) ; AIX32-NEXT: stw 17, 68(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 18, 72(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 19, 76(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 20, 80(1) # 4-byte Folded Spill -; AIX32-NEXT: xori 21, 5, 24 +; AIX32-NEXT: xori 24, 5, 24 +; AIX32-NEXT: stw 21, 84(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 22, 88(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 23, 92(1) # 4-byte Folded Spill -; AIX32-NEXT: stw 24, 96(1) # 4-byte Folded Spill -; AIX32-NEXT: slw 5, 3, 21 +; AIX32-NEXT: slw 5, 3, 24 ; AIX32-NEXT: li 3, 255 -; AIX32-NEXT: slw 4, 4, 21 +; AIX32-NEXT: slw 4, 4, 24 ; AIX32-NEXT: stw 25, 100(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 26, 104(1) # 4-byte Folded Spill -; AIX32-NEXT: slw 3, 3, 21 +; AIX32-NEXT: slw 3, 3, 24 ; AIX32-NEXT: stw 27, 108(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 30, 120(1) # 4-byte Folded Spill ; AIX32-NEXT: stw 31, 124(1) # 4-byte Folded Spill ; AIX32-NEXT: sync -; AIX32-NEXT: rlwinm 18, 28, 0, 0, 29 +; AIX32-NEXT: rlwinm 20, 28, 0, 0, 29 ; AIX32-NEXT: and 4, 4, 3 ; AIX32-NEXT: and 5, 5, 3 ; AIX32-NEXT: L..BB3_1: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 7, 0, 18 +; AIX32-NEXT: lwarx 7, 0, 20 ; AIX32-NEXT: and 6, 7, 3 ; AIX32-NEXT: cmpw 6, 5 ; AIX32-NEXT: bne 0, L..BB3_3 @@ -4680,26 +4681,26 @@ define dso_local void 
@test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: # ; AIX32-NEXT: andc 7, 7, 3 ; AIX32-NEXT: or 7, 7, 4 -; AIX32-NEXT: stwcx. 7, 0, 18 +; AIX32-NEXT: stwcx. 7, 0, 20 ; AIX32-NEXT: bne 0, L..BB3_1 ; AIX32-NEXT: L..BB3_3: # %entry ; AIX32-NEXT: rlwinm 5, 29, 3, 27, 28 -; AIX32-NEXT: srw 3, 6, 21 +; AIX32-NEXT: srw 3, 6, 24 ; AIX32-NEXT: lwsync ; AIX32-NEXT: lbz 4, 0(29) -; AIX32-NEXT: rlwinm 20, 29, 0, 0, 29 -; AIX32-NEXT: xori 25, 5, 24 -; AIX32-NEXT: slw 5, 3, 25 +; AIX32-NEXT: rlwinm 22, 29, 0, 0, 29 +; AIX32-NEXT: xori 26, 5, 24 +; AIX32-NEXT: slw 5, 3, 26 ; AIX32-NEXT: stb 3, 0(28) ; AIX32-NEXT: li 3, 255 ; AIX32-NEXT: sync -; AIX32-NEXT: slw 6, 4, 25 -; AIX32-NEXT: slw 3, 3, 25 +; AIX32-NEXT: slw 6, 4, 26 +; AIX32-NEXT: slw 3, 3, 26 ; AIX32-NEXT: and 4, 5, 3 ; AIX32-NEXT: and 5, 6, 3 ; AIX32-NEXT: L..BB3_4: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 7, 0, 20 +; AIX32-NEXT: lwarx 7, 0, 22 ; AIX32-NEXT: and 6, 7, 3 ; AIX32-NEXT: cmpw 6, 5 ; AIX32-NEXT: bne 0, L..BB3_6 @@ -4707,29 +4708,29 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: # ; AIX32-NEXT: andc 7, 7, 3 ; AIX32-NEXT: or 7, 7, 4 -; AIX32-NEXT: stwcx. 7, 0, 20 +; AIX32-NEXT: stwcx. 7, 0, 22 ; AIX32-NEXT: bne 0, L..BB3_4 ; AIX32-NEXT: L..BB3_6: # %entry ; AIX32-NEXT: lwsync -; AIX32-NEXT: srw 4, 6, 25 +; AIX32-NEXT: srw 4, 6, 26 ; AIX32-NEXT: lbz 3, 0(28) ; AIX32-NEXT: extsb 5, 3 ; AIX32-NEXT: lwz 3, L..C2(2) # @ss ; AIX32-NEXT: stb 4, 0(29) ; AIX32-NEXT: sync ; AIX32-NEXT: rlwinm 6, 3, 3, 27, 27 -; AIX32-NEXT: rlwinm 22, 3, 0, 0, 29 -; AIX32-NEXT: xori 26, 6, 16 -; AIX32-NEXT: slw 6, 4, 26 +; AIX32-NEXT: rlwinm 21, 3, 0, 0, 29 +; AIX32-NEXT: xori 25, 6, 16 +; AIX32-NEXT: slw 6, 4, 25 ; AIX32-NEXT: li 4, 0 -; AIX32-NEXT: slw 5, 5, 26 +; AIX32-NEXT: slw 5, 5, 25 ; AIX32-NEXT: ori 4, 4, 65535 -; AIX32-NEXT: slw 4, 4, 26 +; AIX32-NEXT: slw 4, 4, 25 ; AIX32-NEXT: and 5, 5, 4 ; AIX32-NEXT: and 6, 6, 4 ; AIX32-NEXT: L..BB3_7: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 8, 0, 22 +; AIX32-NEXT: lwarx 8, 0, 21 ; AIX32-NEXT: and 7, 8, 4 ; AIX32-NEXT: cmpw 7, 6 ; AIX32-NEXT: bne 0, L..BB3_9 @@ -4737,10 +4738,10 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: # ; AIX32-NEXT: andc 8, 8, 4 ; AIX32-NEXT: or 8, 8, 5 -; AIX32-NEXT: stwcx. 8, 0, 22 +; AIX32-NEXT: stwcx. 8, 0, 21 ; AIX32-NEXT: bne 0, L..BB3_7 ; AIX32-NEXT: L..BB3_9: # %entry -; AIX32-NEXT: srw 4, 7, 26 +; AIX32-NEXT: srw 4, 7, 25 ; AIX32-NEXT: lwsync ; AIX32-NEXT: sth 4, 0(3) ; AIX32-NEXT: lbz 3, 0(28) @@ -4750,12 +4751,12 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwz 3, L..C3(2) # @us ; AIX32-NEXT: rlwinm 6, 3, 3, 27, 27 ; AIX32-NEXT: rlwinm 19, 3, 0, 0, 29 -; AIX32-NEXT: xori 24, 6, 16 -; AIX32-NEXT: slw 6, 4, 24 +; AIX32-NEXT: xori 23, 6, 16 +; AIX32-NEXT: slw 6, 4, 23 ; AIX32-NEXT: li 4, 0 -; AIX32-NEXT: slw 5, 5, 24 +; AIX32-NEXT: slw 5, 5, 23 ; AIX32-NEXT: ori 4, 4, 65535 -; AIX32-NEXT: slw 4, 4, 24 +; AIX32-NEXT: slw 4, 4, 23 ; AIX32-NEXT: and 5, 5, 4 ; AIX32-NEXT: and 6, 6, 4 ; AIX32-NEXT: L..BB3_10: # %entry @@ -4771,7 +4772,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: stwcx. 
8, 0, 19 ; AIX32-NEXT: bne 0, L..BB3_10 ; AIX32-NEXT: L..BB3_12: # %entry -; AIX32-NEXT: srw 4, 7, 24 +; AIX32-NEXT: srw 4, 7, 23 ; AIX32-NEXT: lwsync ; AIX32-NEXT: lwz 17, L..C4(2) # @si ; AIX32-NEXT: sth 4, 0(3) @@ -4810,11 +4811,11 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwz 31, L..C6(2) # @sll ; AIX32-NEXT: stw 5, 0(27) ; AIX32-NEXT: lbz 3, 0(28) -; AIX32-NEXT: li 23, 0 +; AIX32-NEXT: li 18, 0 ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: li 7, 5 ; AIX32-NEXT: li 8, 5 -; AIX32-NEXT: stw 23, 56(1) +; AIX32-NEXT: stw 18, 56(1) ; AIX32-NEXT: extsb 6, 3 ; AIX32-NEXT: lbz 3, 0(29) ; AIX32-NEXT: srawi 5, 6, 31 @@ -4832,7 +4833,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: extsb 6, 4 ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: srawi 5, 6, 31 -; AIX32-NEXT: stw 23, 56(1) +; AIX32-NEXT: stw 18, 56(1) ; AIX32-NEXT: stw 3, 0(31) ; AIX32-NEXT: lbz 3, 0(29) ; AIX32-NEXT: stw 3, 60(1) @@ -4846,15 +4847,15 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: stw 3, 0(30) ; AIX32-NEXT: lbz 3, 0(29) ; AIX32-NEXT: sync -; AIX32-NEXT: slw 5, 4, 21 +; AIX32-NEXT: slw 5, 4, 24 ; AIX32-NEXT: li 4, 255 -; AIX32-NEXT: slw 6, 3, 21 -; AIX32-NEXT: slw 4, 4, 21 +; AIX32-NEXT: slw 6, 3, 24 +; AIX32-NEXT: slw 4, 4, 24 ; AIX32-NEXT: and 5, 5, 4 ; AIX32-NEXT: and 6, 6, 4 ; AIX32-NEXT: L..BB3_19: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 8, 0, 18 +; AIX32-NEXT: lwarx 8, 0, 20 ; AIX32-NEXT: and 7, 8, 4 ; AIX32-NEXT: cmpw 7, 6 ; AIX32-NEXT: bne 0, L..BB3_21 @@ -4862,27 +4863,27 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: # ; AIX32-NEXT: andc 8, 8, 4 ; AIX32-NEXT: or 8, 8, 5 -; AIX32-NEXT: stwcx. 8, 0, 18 +; AIX32-NEXT: stwcx. 8, 0, 20 ; AIX32-NEXT: bne 0, L..BB3_19 ; AIX32-NEXT: L..BB3_21: # %entry -; AIX32-NEXT: srw 4, 7, 21 +; AIX32-NEXT: srw 4, 7, 24 ; AIX32-NEXT: lwsync ; AIX32-NEXT: lbz 5, 0(28) ; AIX32-NEXT: cmpw 4, 3 ; AIX32-NEXT: li 3, 1 -; AIX32-NEXT: iseleq 4, 3, 23 -; AIX32-NEXT: slw 6, 5, 25 +; AIX32-NEXT: iseleq 4, 3, 18 +; AIX32-NEXT: slw 6, 5, 26 ; AIX32-NEXT: li 5, 255 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) -; AIX32-NEXT: slw 5, 5, 25 +; AIX32-NEXT: slw 5, 5, 26 ; AIX32-NEXT: sync -; AIX32-NEXT: slw 7, 4, 25 +; AIX32-NEXT: slw 7, 4, 26 ; AIX32-NEXT: and 6, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: L..BB3_22: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 9, 0, 20 +; AIX32-NEXT: lwarx 9, 0, 22 ; AIX32-NEXT: and 8, 9, 5 ; AIX32-NEXT: cmpw 8, 7 ; AIX32-NEXT: bne 0, L..BB3_24 @@ -4890,28 +4891,28 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: # ; AIX32-NEXT: andc 9, 9, 5 ; AIX32-NEXT: or 9, 9, 6 -; AIX32-NEXT: stwcx. 9, 0, 20 +; AIX32-NEXT: stwcx. 
9, 0, 22 ; AIX32-NEXT: bne 0, L..BB3_22 ; AIX32-NEXT: L..BB3_24: # %entry -; AIX32-NEXT: srw 5, 8, 25 +; AIX32-NEXT: srw 5, 8, 26 ; AIX32-NEXT: lwsync ; AIX32-NEXT: cmpw 5, 4 ; AIX32-NEXT: lbz 5, 0(28) -; AIX32-NEXT: iseleq 4, 3, 23 +; AIX32-NEXT: iseleq 4, 3, 18 ; AIX32-NEXT: extsb 5, 5 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) ; AIX32-NEXT: sync -; AIX32-NEXT: slw 6, 5, 26 +; AIX32-NEXT: slw 6, 5, 25 ; AIX32-NEXT: li 5, 0 -; AIX32-NEXT: slw 7, 4, 26 +; AIX32-NEXT: slw 7, 4, 25 ; AIX32-NEXT: ori 5, 5, 65535 -; AIX32-NEXT: slw 5, 5, 26 +; AIX32-NEXT: slw 5, 5, 25 ; AIX32-NEXT: and 6, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: L..BB3_25: # %entry ; AIX32-NEXT: # -; AIX32-NEXT: lwarx 9, 0, 22 +; AIX32-NEXT: lwarx 9, 0, 21 ; AIX32-NEXT: and 8, 9, 5 ; AIX32-NEXT: cmpw 8, 7 ; AIX32-NEXT: bne 0, L..BB3_27 @@ -4919,23 +4920,23 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: # ; AIX32-NEXT: andc 9, 9, 5 ; AIX32-NEXT: or 9, 9, 6 -; AIX32-NEXT: stwcx. 9, 0, 22 +; AIX32-NEXT: stwcx. 9, 0, 21 ; AIX32-NEXT: bne 0, L..BB3_25 ; AIX32-NEXT: L..BB3_27: # %entry -; AIX32-NEXT: srw 5, 8, 26 +; AIX32-NEXT: srw 5, 8, 25 ; AIX32-NEXT: lwsync ; AIX32-NEXT: cmpw 5, 4 ; AIX32-NEXT: lbz 5, 0(28) -; AIX32-NEXT: iseleq 4, 3, 23 +; AIX32-NEXT: iseleq 4, 3, 18 ; AIX32-NEXT: extsb 5, 5 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) ; AIX32-NEXT: sync -; AIX32-NEXT: slw 6, 5, 24 +; AIX32-NEXT: slw 6, 5, 23 ; AIX32-NEXT: li 5, 0 -; AIX32-NEXT: slw 7, 4, 24 +; AIX32-NEXT: slw 7, 4, 23 ; AIX32-NEXT: ori 5, 5, 65535 -; AIX32-NEXT: slw 5, 5, 24 +; AIX32-NEXT: slw 5, 5, 23 ; AIX32-NEXT: and 6, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: L..BB3_28: # %entry @@ -4951,11 +4952,11 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: stwcx. 
9, 0, 19 ; AIX32-NEXT: bne 0, L..BB3_28 ; AIX32-NEXT: L..BB3_30: # %entry -; AIX32-NEXT: srw 5, 8, 24 +; AIX32-NEXT: srw 5, 8, 23 ; AIX32-NEXT: lwsync ; AIX32-NEXT: cmpw 5, 4 ; AIX32-NEXT: lbz 5, 0(28) -; AIX32-NEXT: iseleq 4, 3, 23 +; AIX32-NEXT: iseleq 4, 3, 18 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) ; AIX32-NEXT: sync @@ -4971,7 +4972,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: bne 0, L..BB3_31 ; AIX32-NEXT: L..BB3_33: # %entry ; AIX32-NEXT: lwsync -; AIX32-NEXT: isel 4, 3, 23, 6 +; AIX32-NEXT: isel 4, 3, 18, 6 ; AIX32-NEXT: lbz 5, 0(28) ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) @@ -4988,13 +4989,13 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: bne 0, L..BB3_34 ; AIX32-NEXT: L..BB3_36: # %entry ; AIX32-NEXT: lwsync -; AIX32-NEXT: isel 3, 3, 23, 6 +; AIX32-NEXT: isel 3, 3, 18, 6 ; AIX32-NEXT: li 7, 5 ; AIX32-NEXT: li 8, 5 ; AIX32-NEXT: lbz 4, 0(28) ; AIX32-NEXT: stw 3, 0(27) ; AIX32-NEXT: lbz 3, 0(29) -; AIX32-NEXT: stw 23, 56(1) +; AIX32-NEXT: stw 18, 56(1) ; AIX32-NEXT: extsb 6, 4 ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: stw 3, 60(1) @@ -5011,7 +5012,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: stw 3, 60(1) ; AIX32-NEXT: mr 3, 30 -; AIX32-NEXT: stw 23, 56(1) +; AIX32-NEXT: stw 18, 56(1) ; AIX32-NEXT: srawi 5, 6, 31 ; AIX32-NEXT: bl .__atomic_compare_exchange_8[PR] ; AIX32-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/csr-split.ll b/llvm/test/CodeGen/PowerPC/csr-split.ll index e24c7e3eaab6b9..78a2296854b435 100644 --- a/llvm/test/CodeGen/PowerPC/csr-split.ll +++ b/llvm/test/CodeGen/PowerPC/csr-split.ll @@ -108,9 +108,9 @@ define dso_local signext i32 @test2(ptr %p1) local_unnamed_addr { ; CHECK-PWR9-NEXT: cmpldi r30, 0 ; CHECK-PWR9-NEXT: beq cr0, .LBB1_3 ; CHECK-PWR9-NEXT: # %bb.1: # %if.end -; CHECK-PWR9-NEXT: addis r4, r2, a@toc@ha -; CHECK-PWR9-NEXT: lwa r4, a@toc@l(r4) -; CHECK-PWR9-NEXT: cmpld r4, r30 +; CHECK-PWR9-NEXT: addis r5, r2, a@toc@ha +; CHECK-PWR9-NEXT: lwa r5, a@toc@l(r5) +; CHECK-PWR9-NEXT: cmpld r5, r30 ; CHECK-PWR9-NEXT: bne cr0, .LBB1_3 ; CHECK-PWR9-NEXT: # %bb.2: # %if.then2 ; CHECK-PWR9-NEXT: bl callVoid @@ -139,9 +139,9 @@ define dso_local signext i32 @test2(ptr %p1) local_unnamed_addr { ; CHECK-NEXT: cmpldi r30, 0 ; CHECK-NEXT: beq cr0, .LBB1_3 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: addis r4, r2, a@toc@ha -; CHECK-NEXT: lwa r4, a@toc@l(r4) -; CHECK-NEXT: cmpld r4, r30 +; CHECK-NEXT: addis r5, r2, a@toc@ha +; CHECK-NEXT: lwa r5, a@toc@l(r5) +; CHECK-NEXT: cmpld r5, r30 ; CHECK-NEXT: bne cr0, .LBB1_3 ; CHECK-NEXT: # %bb.2: # %if.then2 ; CHECK-NEXT: bl callVoid diff --git a/llvm/test/CodeGen/PowerPC/inc-of-add.ll b/llvm/test/CodeGen/PowerPC/inc-of-add.ll index c6d6f6a17b1b50..b4512062e80da0 100644 --- a/llvm/test/CodeGen/PowerPC/inc-of-add.ll +++ b/llvm/test/CodeGen/PowerPC/inc-of-add.ll @@ -66,26 +66,23 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32: # %bb.0: ; PPC32-NEXT: stwu 1, -64(1) ; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 21, 123(1) ; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: lbz 4, 115(1) ; PPC32-NEXT: lbz 22, 119(1) -; PPC32-NEXT: lbz 21, 123(1) -; PPC32-NEXT: add 4, 4, 5 -; PPC32-NEXT: add 5, 22, 6 -; PPC32-NEXT: lbz 22, 131(1) -; PPC32-NEXT: add 6, 21, 7 +; PPC32-NEXT: add 7, 21, 7 +; PPC32-NEXT: lbz 4, 115(1) ; PPC32-NEXT: lbz 21, 135(1) -; PPC32-NEXT: addi 6, 6, 1 -; 
PPC32-NEXT: stw 20, 16(1) # 4-byte Folded Spill -; PPC32-NEXT: add 9, 22, 9 -; PPC32-NEXT: lbz 20, 127(1) +; PPC32-NEXT: add 6, 22, 6 +; PPC32-NEXT: lbz 22, 131(1) +; PPC32-NEXT: add 4, 4, 5 +; PPC32-NEXT: lbz 5, 127(1) ; PPC32-NEXT: add 10, 21, 10 ; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: addi 5, 5, 1 +; PPC32-NEXT: add 9, 22, 9 ; PPC32-NEXT: lbz 25, 83(1) -; PPC32-NEXT: add 7, 20, 8 +; PPC32-NEXT: add 5, 5, 8 ; PPC32-NEXT: lbz 21, 147(1) -; PPC32-NEXT: addi 7, 7, 1 +; PPC32-NEXT: addi 5, 5, 1 ; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill ; PPC32-NEXT: addi 4, 4, 1 ; PPC32-NEXT: lbz 24, 79(1) @@ -140,14 +137,16 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32-NEXT: addi 11, 25, 1 ; PPC32-NEXT: stb 8, 6(3) ; PPC32-NEXT: addi 8, 10, 1 +; PPC32-NEXT: stb 5, 3(3) +; PPC32-NEXT: addi 5, 7, 1 ; PPC32-NEXT: stb 11, 8(3) ; PPC32-NEXT: addi 11, 24, 1 ; PPC32-NEXT: stb 8, 5(3) ; PPC32-NEXT: addi 8, 9, 1 +; PPC32-NEXT: stb 5, 2(3) +; PPC32-NEXT: addi 5, 6, 1 ; PPC32-NEXT: stb 11, 7(3) ; PPC32-NEXT: stb 8, 4(3) -; PPC32-NEXT: stb 7, 3(3) -; PPC32-NEXT: stb 6, 2(3) ; PPC32-NEXT: stb 5, 1(3) ; PPC32-NEXT: stb 4, 0(3) ; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload @@ -160,7 +159,6 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 20, 16(1) # 4-byte Folded Reload ; PPC32-NEXT: addi 1, 1, 64 ; PPC32-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir b/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir index b9c541feae5acf..202d9eb24f8af0 100644 --- a/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir +++ b/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir @@ -97,7 +97,7 @@ body: | ; CHECK-NEXT: renamable $g8p4 = LQARX $x5, $x6 ; CHECK-NEXT: STD killed $x8, -160, $x1 ; CHECK-NEXT: STD killed $x9, -152, $x1 - ; CHECK-NEXT: renamable $g8p13 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p11 = LQARX $x3, renamable $x4 ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4 ; CHECK-NEXT: STD killed $x8, -176, $x1 ; CHECK-NEXT: STD killed $x9, -168, $x1 @@ -115,17 +115,17 @@ body: | ; CHECK-NEXT: renamable $g8p8 = LQARX $x3, renamable $x4 ; CHECK-NEXT: renamable $g8p7 = LQARX $x3, renamable $x4 ; CHECK-NEXT: renamable $g8p15 = LQARX $x3, renamable $x4 - ; CHECK-NEXT: renamable $g8p11 = LQARX $x3, renamable $x4 ; CHECK-NEXT: renamable $g8p12 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p13 = LQARX $x3, renamable $x4 ; CHECK-NEXT: renamable $g8p14 = LQARX $x3, renamable $x4 ; CHECK-NEXT: renamable $g8p5 = LQARX $x3, renamable $x4 ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK-NEXT: $x3 = OR8 $x27, $x27 + ; CHECK-NEXT: $x3 = OR8 $x23, $x23 ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 ; CHECK-NEXT: STQCX killed renamable $g8p5, renamable $x7, renamable $x4, implicit-def dead $cr0 ; CHECK-NEXT: STQCX killed renamable $g8p14, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p13, renamable $x7, renamable $x4, implicit-def dead $cr0 ; CHECK-NEXT: STQCX killed renamable $g8p12, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK-NEXT: STQCX killed renamable $g8p11, renamable $x7, renamable $x4, implicit-def dead $cr0 ; CHECK-NEXT: STQCX killed renamable $g8p15, renamable $x7, renamable $x4, implicit-def dead $cr0 ; 
CHECK-NEXT: STQCX killed renamable $g8p7, renamable $x7, renamable $x4, implicit-def dead $cr0 ; CHECK-NEXT: STQCX killed renamable $g8p8, renamable $x7, renamable $x4, implicit-def dead $cr0 @@ -143,7 +143,7 @@ body: | ; CHECK-NEXT: $x8 = LD -176, $x1 ; CHECK-NEXT: $x9 = LD -168, $x1 ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK-NEXT: STQCX killed renamable $g8p13, killed renamable $x7, killed renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p11, killed renamable $x7, killed renamable $x4, implicit-def dead $cr0 ; CHECK-NEXT: $x8 = LD -160, $x1 ; CHECK-NEXT: $x9 = LD -152, $x1 ; CHECK-NEXT: STQCX killed renamable $g8p4, $x5, $x6, implicit-def dead $cr0 diff --git a/llvm/test/CodeGen/PowerPC/licm-tocReg.ll b/llvm/test/CodeGen/PowerPC/licm-tocReg.ll index 7b531087501923..af3ef7829cd618 100644 --- a/llvm/test/CodeGen/PowerPC/licm-tocReg.ll +++ b/llvm/test/CodeGen/PowerPC/licm-tocReg.ll @@ -71,19 +71,19 @@ define signext i32 @test(ptr nocapture %FP) local_unnamed_addr #0 { ; CHECKLX-NEXT: addis 3, 2, .LC0@toc@ha ; CHECKLX-NEXT: addis 4, 2, .LC1@toc@ha ; CHECKLX-NEXT: ld 3, .LC0@toc@l(3) -; CHECKLX-NEXT: ld 5, .LC1@toc@l(4) +; CHECKLX-NEXT: ld 4, .LC1@toc@l(4) ; CHECKLX-NEXT: lwz 6, 0(3) ; CHECKLX-NEXT: .p2align 5 ; CHECKLX-NEXT: .LBB0_1: # %if.end ; CHECKLX-NEXT: # -; CHECKLX-NEXT: lwz 7, 0(5) -; CHECKLX-NEXT: lwz 4, 0(3) +; CHECKLX-NEXT: lwz 7, 0(4) +; CHECKLX-NEXT: lwz 5, 0(3) ; CHECKLX-NEXT: cmpw 6, 7 ; CHECKLX-NEXT: bgt 0, .LBB0_3 ; CHECKLX-NEXT: # %bb.2: # %if.end ; CHECKLX-NEXT: # -; CHECKLX-NEXT: addi 4, 4, 1 -; CHECKLX-NEXT: stw 4, 0(3) +; CHECKLX-NEXT: addi 5, 5, 1 +; CHECKLX-NEXT: stw 5, 0(3) ; CHECKLX-NEXT: lwz 6, 0(3) ; CHECKLX-NEXT: b .LBB0_1 ; CHECKLX-NEXT: .LBB0_3: # %if.then @@ -94,7 +94,7 @@ define signext i32 @test(ptr nocapture %FP) local_unnamed_addr #0 { ; CHECKLX-NEXT: .cfi_def_cfa_offset 32 ; CHECKLX-NEXT: .cfi_offset lr, 16 ; CHECKLX-NEXT: mtctr 12 -; CHECKLX-NEXT: extsw 3, 4 +; CHECKLX-NEXT: extsw 3, 5 ; CHECKLX-NEXT: bctrl ; CHECKLX-NEXT: ld 2, 24(1) ; CHECKLX-NEXT: addi 1, 1, 32 @@ -104,30 +104,30 @@ define signext i32 @test(ptr nocapture %FP) local_unnamed_addr #0 { ; ; CHECKAIX-LABEL: test: ; CHECKAIX: # %bb.0: # %entry -; CHECKAIX-NEXT: ld 5, L..C0(2) # @ga -; CHECKAIX-NEXT: ld 6, L..C1(2) # @gb +; CHECKAIX-NEXT: ld 4, L..C0(2) # @ga +; CHECKAIX-NEXT: ld 5, L..C1(2) # @gb ; CHECKAIX-NEXT: L..BB0_1: # %if.end ; CHECKAIX-NEXT: # -; CHECKAIX-NEXT: lwz 4, 0(5) -; CHECKAIX-NEXT: lwz 7, 0(6) -; CHECKAIX-NEXT: cmpw 4, 7 -; CHECKAIX-NEXT: lwz 4, 0(5) +; CHECKAIX-NEXT: lwz 6, 0(4) +; CHECKAIX-NEXT: lwz 7, 0(5) +; CHECKAIX-NEXT: cmpw 6, 7 +; CHECKAIX-NEXT: lwz 6, 0(4) ; CHECKAIX-NEXT: bgt 0, L..BB0_3 ; CHECKAIX-NEXT: # %bb.2: # %if.end ; CHECKAIX-NEXT: # -; CHECKAIX-NEXT: addi 4, 4, 1 -; CHECKAIX-NEXT: stw 4, 0(5) +; CHECKAIX-NEXT: addi 6, 6, 1 +; CHECKAIX-NEXT: stw 6, 0(4) ; CHECKAIX-NEXT: b L..BB0_1 ; CHECKAIX-NEXT: L..BB0_3: # %if.then ; CHECKAIX-NEXT: mflr 0 ; CHECKAIX-NEXT: stdu 1, -112(1) -; CHECKAIX-NEXT: ld 5, 0(3) +; CHECKAIX-NEXT: ld 4, 0(3) ; CHECKAIX-NEXT: std 0, 128(1) ; CHECKAIX-NEXT: ld 11, 16(3) ; CHECKAIX-NEXT: std 2, 40(1) ; CHECKAIX-NEXT: ld 2, 8(3) -; CHECKAIX-NEXT: extsw 3, 4 -; CHECKAIX-NEXT: mtctr 5 +; CHECKAIX-NEXT: extsw 3, 6 +; CHECKAIX-NEXT: mtctr 4 ; CHECKAIX-NEXT: bctrl ; CHECKAIX-NEXT: ld 2, 40(1) ; CHECKAIX-NEXT: addi 1, 1, 112 diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll index 
900069c6216bf6..59d6d44dd6ad41 100644 --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -105,27 +105,27 @@ define i64 @test_ds_prep(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: cmplwi r4, 0 ; CHECK-NEXT: beq cr0, .LBB1_4 ; CHECK-NEXT: # %bb.1: # %bb3.preheader -; CHECK-NEXT: cmpldi r4, 1 -; CHECK-NEXT: li r5, 1 ; CHECK-NEXT: addi r6, r3, 4002 +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r3, 1 ; CHECK-NEXT: li r7, -1 -; CHECK-NEXT: iselgt r3, r4, r5 -; CHECK-NEXT: mtctr r3 -; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: iselgt r5, r4, r3 +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: li r5, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_2: # %bb3 ; CHECK-NEXT: # ; CHECK-NEXT: ldx r8, r6, r7 ; CHECK-NEXT: ld r9, 0(r6) -; CHECK-NEXT: ldx r10, r6, r5 +; CHECK-NEXT: ldx r10, r6, r3 ; CHECK-NEXT: ld r11, 4(r6) ; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: mulld r8, r9, r8 ; CHECK-NEXT: mulld r8, r8, r10 -; CHECK-NEXT: maddld r3, r8, r11, r3 +; CHECK-NEXT: maddld r5, r8, r11, r5 ; CHECK-NEXT: bdnz .LBB1_2 ; CHECK-NEXT: # %bb.3: # %bb25 -; CHECK-NEXT: add r3, r3, r4 +; CHECK-NEXT: add r3, r5, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: addi r3, r4, 0 @@ -194,50 +194,50 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: cmplwi r4, 0 ; CHECK-NEXT: beq cr0, .LBB2_4 ; CHECK-NEXT: # %bb.1: # %bb3.preheader +; CHECK-NEXT: addi r9, r3, 4002 ; CHECK-NEXT: cmpldi r4, 1 -; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: addi r10, r3, 4002 +; CHECK-NEXT: li r3, 1 ; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill ; CHECK-NEXT: li r6, -1 ; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill ; CHECK-NEXT: li r7, 3 ; CHECK-NEXT: li r8, 5 -; CHECK-NEXT: li r9, 9 +; CHECK-NEXT: li r10, 9 ; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; CHECK-NEXT: iselgt r3, r4, r5 -; CHECK-NEXT: mtctr r3 -; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: iselgt r5, r4, r3 +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: li r5, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB2_2: # %bb3 ; CHECK-NEXT: # -; CHECK-NEXT: ldx r11, r10, r6 -; CHECK-NEXT: ld r12, 0(r10) -; CHECK-NEXT: ldx r0, r10, r5 -; CHECK-NEXT: ldx r30, r10, r7 +; CHECK-NEXT: ldx r11, r9, r6 +; CHECK-NEXT: ld r12, 0(r9) +; CHECK-NEXT: ldx r0, r9, r3 +; CHECK-NEXT: ldx r30, r9, r7 ; CHECK-NEXT: mulld r11, r12, r11 -; CHECK-NEXT: ld r29, 4(r10) -; CHECK-NEXT: ldx r28, r10, r8 -; CHECK-NEXT: ld r27, 12(r10) -; CHECK-NEXT: ld r26, 8(r10) -; CHECK-NEXT: ldx r25, r10, r9 -; CHECK-NEXT: addi r10, r10, 1 +; CHECK-NEXT: ld r29, 4(r9) +; CHECK-NEXT: ldx r28, r9, r8 +; CHECK-NEXT: ld r27, 12(r9) +; CHECK-NEXT: ld r26, 8(r9) +; CHECK-NEXT: ldx r25, r9, r10 +; CHECK-NEXT: addi r9, r9, 1 ; CHECK-NEXT: mulld r11, r11, r0 ; CHECK-NEXT: mulld r11, r11, r30 ; CHECK-NEXT: mulld r11, r11, r29 ; CHECK-NEXT: mulld r11, r11, r28 ; CHECK-NEXT: mulld r11, r11, r27 ; CHECK-NEXT: mulld r11, r11, r26 -; CHECK-NEXT: maddld r3, r11, r25, r3 +; CHECK-NEXT: maddld r5, r11, r25, r5 ; CHECK-NEXT: bdnz .LBB2_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r3, r3, r4 +; CHECK-NEXT: add r3, r5, r4 ; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r25, 
-56(r1) # 8-byte Folded Reload ; CHECK-NEXT: blr @@ -314,25 +314,25 @@ define dso_local i64 @test_update_ds_prep_interact(ptr %arg, i32 signext %arg1) ; CHECK-NEXT: beq cr0, .LBB3_4 ; CHECK-NEXT: # %bb.1: # %bb3.preheader ; CHECK-NEXT: cmpldi r4, 1 -; CHECK-NEXT: li r6, 1 +; CHECK-NEXT: li r5, 1 ; CHECK-NEXT: addi r3, r3, 3998 ; CHECK-NEXT: li r7, -1 -; CHECK-NEXT: iselgt r5, r4, r6 -; CHECK-NEXT: mtctr r5 -; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: iselgt r6, r4, r5 +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: li r6, 0 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB3_2: # %bb3 ; CHECK-NEXT: # ; CHECK-NEXT: ldu r8, 4(r3) ; CHECK-NEXT: ldx r9, r3, r7 -; CHECK-NEXT: ldx r10, r3, r6 +; CHECK-NEXT: ldx r10, r3, r5 ; CHECK-NEXT: ld r11, 4(r3) ; CHECK-NEXT: mulld r8, r8, r9 ; CHECK-NEXT: mulld r8, r8, r10 -; CHECK-NEXT: maddld r5, r8, r11, r5 +; CHECK-NEXT: maddld r6, r8, r11, r6 ; CHECK-NEXT: bdnz .LBB3_2 ; CHECK-NEXT: # %bb.3: # %bb26 -; CHECK-NEXT: add r3, r5, r4 +; CHECK-NEXT: add r3, r6, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB3_4: ; CHECK-NEXT: addi r3, r4, 0 diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll index 7eef9a4644db18..169a32953eafe0 100644 --- a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll @@ -85,21 +85,21 @@ define zeroext i8 @foo1(ptr %p, i32 signext %n, i32 signext %count) { ; CHECK-NEXT: cmpwi r4, 0 ; CHECK-NEXT: ble cr0, .LBB1_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader +; CHECK-NEXT: addi r4, r4, -1 ; CHECK-NEXT: sub r3, r3, r5 -; CHECK-NEXT: addi r6, r3, 1000 -; CHECK-NEXT: addi r3, r4, -1 -; CHECK-NEXT: clrldi r3, r3, 32 -; CHECK-NEXT: addi r3, r3, 1 -; CHECK-NEXT: mtctr r3 -; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: addi r3, r3, 1000 +; CHECK-NEXT: addi r4, r4, 1 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: li r4, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: lbzux r4, r6, r5 -; CHECK-NEXT: add r3, r4, r3 +; CHECK-NEXT: lbzux r6, r3, r5 +; CHECK-NEXT: add r4, r6, r4 ; CHECK-NEXT: bdnz .LBB1_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup -; CHECK-NEXT: clrldi r3, r3, 56 +; CHECK-NEXT: clrldi r3, r4, 56 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: li r3, 0 diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll index 9f62477ae01df2..58b15eea30aec8 100644 --- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll @@ -10,8 +10,8 @@ target triple = "powerpc64le-unknown-linux-gnu" define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.vy02, ptr %.vy03, ptr %.vy04, ptr %.vy05, ptr %.vy06, ptr %.vy07, ptr %.vy08, ptr %.vy09, ptr %.vy0a, ptr %.vy0b, ptr %.vy0c, ptr %.vy21, ptr %.vy22, ptr %.vy23, ptr %.vy24, ptr %.vy25, ptr %.vy26, ptr %.vy27, ptr %.vy28, ptr %.vy29, ptr %.vy2a, ptr %.vy2b, ptr %.vy2c) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lwz 4, 0(4) -; CHECK-NEXT: cmpwi 4, 1 +; CHECK-NEXT: lwz 0, 0(4) +; CHECK-NEXT: cmpwi 0, 1 ; CHECK-NEXT: bltlr 0 ; CHECK-NEXT: # %bb.1: # %_loop_1_do_.lr.ph ; CHECK-NEXT: lwz 3, 0(3) @@ -56,186 +56,185 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. 
; CHECK-NEXT: .cfi_offset v29, -240 ; CHECK-NEXT: .cfi_offset v30, -224 ; CHECK-NEXT: .cfi_offset v31, -208 +; CHECK-NEXT: ld 4, 848(1) ; CHECK-NEXT: std 22, 464(1) # 8-byte Folded Spill ; CHECK-NEXT: std 23, 472(1) # 8-byte Folded Spill -; CHECK-NEXT: mr 22, 5 -; CHECK-NEXT: ld 5, 848(1) +; CHECK-NEXT: mr 23, 5 +; CHECK-NEXT: lwa 5, 0(7) ; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: mr 11, 7 -; CHECK-NEXT: ld 23, 688(1) -; CHECK-NEXT: ld 7, 728(1) -; CHECK-NEXT: std 18, 432(1) # 8-byte Folded Spill -; CHECK-NEXT: std 19, 440(1) # 8-byte Folded Spill -; CHECK-NEXT: mr 18, 6 -; CHECK-NEXT: li 6, 9 -; CHECK-NEXT: ld 19, 768(1) -; CHECK-NEXT: ld 2, 760(1) -; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 7, 720(1) +; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill +; CHECK-NEXT: mr 24, 6 +; CHECK-NEXT: ld 6, 688(1) +; CHECK-NEXT: ld 22, 784(1) +; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 28, 816(1) +; CHECK-NEXT: ld 25, 792(1) ; CHECK-NEXT: cmpldi 3, 9 -; CHECK-NEXT: ld 27, 816(1) -; CHECK-NEXT: ld 26, 808(1) +; CHECK-NEXT: std 20, 448(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 456(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 20, 768(1) +; CHECK-NEXT: ld 21, 776(1) ; CHECK-NEXT: std 14, 400(1) # 8-byte Folded Spill ; CHECK-NEXT: std 15, 408(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 15, 736(1) -; CHECK-NEXT: lxv 39, 0(8) +; CHECK-NEXT: ld 12, 728(1) +; CHECK-NEXT: ld 2, 712(1) ; CHECK-NEXT: std 30, 528(1) # 8-byte Folded Spill ; CHECK-NEXT: std 31, 536(1) # 8-byte Folded Spill ; CHECK-NEXT: ld 30, 704(1) -; CHECK-NEXT: lxv 38, 0(9) -; CHECK-NEXT: std 20, 448(1) # 8-byte Folded Spill -; CHECK-NEXT: std 21, 456(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 21, 784(1) -; CHECK-NEXT: ld 20, 776(1) -; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill -; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill -; CHECK-NEXT: iselgt 3, 3, 6 -; CHECK-NEXT: ld 6, 720(1) -; CHECK-NEXT: ld 24, 792(1) -; CHECK-NEXT: std 10, 72(1) # 8-byte Folded Spill -; CHECK-NEXT: std 7, 80(1) # 8-byte Folded Spill -; CHECK-NEXT: addi 3, 3, -2 -; CHECK-NEXT: lxv 6, 0(19) -; CHECK-NEXT: lxv 11, 0(7) -; CHECK-NEXT: std 5, 200(1) # 8-byte Folded Spill -; CHECK-NEXT: std 23, 40(1) # 8-byte Folded Spill -; CHECK-NEXT: std 6, 48(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 840(1) -; CHECK-NEXT: lxv 12, 0(6) -; CHECK-NEXT: rldicl 12, 3, 61, 3 -; CHECK-NEXT: std 19, 120(1) # 8-byte Folded Spill -; CHECK-NEXT: std 20, 128(1) # 8-byte Folded Spill -; CHECK-NEXT: std 21, 136(1) # 8-byte Folded Spill -; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 4, 0(21) -; CHECK-NEXT: ld 25, 800(1) -; CHECK-NEXT: lxv 33, 0(10) -; CHECK-NEXT: lxv 32, 0(23) -; CHECK-NEXT: lxv 36, 0(30) +; CHECK-NEXT: ld 29, 696(1) ; CHECK-NEXT: std 16, 416(1) # 8-byte Folded Spill ; CHECK-NEXT: std 17, 424(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 17, 752(1) -; CHECK-NEXT: ld 16, 744(1) -; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill -; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 29, 712(1) -; CHECK-NEXT: ld 28, 696(1) +; CHECK-NEXT: ld 17, 744(1) +; CHECK-NEXT: ld 16, 736(1) +; CHECK-NEXT: std 18, 432(1) # 8-byte Folded Spill +; CHECK-NEXT: std 19, 440(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 19, 760(1) +; CHECK-NEXT: ld 18, 752(1) +; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 504(1) # 8-byte 
Folded Spill +; CHECK-NEXT: ld 27, 808(1) +; CHECK-NEXT: ld 26, 800(1) ; CHECK-NEXT: std 8, 56(1) # 8-byte Folded Spill ; CHECK-NEXT: std 9, 64(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 37, 0(28) -; CHECK-NEXT: lxv 13, 0(29) -; CHECK-NEXT: mr 8, 29 +; CHECK-NEXT: lxv 33, 0(8) +; CHECK-NEXT: lxv 32, 0(9) +; CHECK-NEXT: std 10, 72(1) # 8-byte Folded Spill +; CHECK-NEXT: std 12, 80(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 36, 0(6) +; CHECK-NEXT: extswsli 14, 5, 3 +; CHECK-NEXT: sldi 15, 5, 4 +; CHECK-NEXT: lxv 6, 0(20) +; CHECK-NEXT: std 4, 200(1) # 8-byte Folded Spill +; CHECK-NEXT: std 6, 40(1) # 8-byte Folded Spill +; CHECK-NEXT: std 7, 48(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 4, 840(1) +; CHECK-NEXT: mulli 6, 5, 40 +; CHECK-NEXT: lxv 12, 0(7) +; CHECK-NEXT: lxv 4, 0(22) +; CHECK-NEXT: std 20, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 128(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 37, 0(10) +; CHECK-NEXT: lxv 35, 0(29) +; CHECK-NEXT: std 22, 136(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 144(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 34, 0(30) +; CHECK-NEXT: lxv 13, 0(2) +; CHECK-NEXT: mr 8, 2 ; CHECK-NEXT: mr 9, 30 -; CHECK-NEXT: mr 10, 28 -; CHECK-NEXT: std 25, 152(1) # 8-byte Folded Spill -; CHECK-NEXT: std 26, 160(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 10, 0(15) -; CHECK-NEXT: lxv 9, 0(16) -; CHECK-NEXT: li 28, 1 +; CHECK-NEXT: mr 10, 29 +; CHECK-NEXT: std 26, 152(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 160(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 11, 0(12) +; CHECK-NEXT: lxv 10, 0(16) +; CHECK-NEXT: li 29, 1 ; CHECK-NEXT: stfd 26, 544(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 27, 552(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 8, 0(17) -; CHECK-NEXT: lxv 7, 0(2) +; CHECK-NEXT: lxv 9, 0(17) +; CHECK-NEXT: lxv 8, 0(18) ; CHECK-NEXT: stfd 28, 560(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 29, 568(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 5, 0(20) -; CHECK-NEXT: lxv 3, 0(24) +; CHECK-NEXT: lxv 7, 0(19) +; CHECK-NEXT: lxv 5, 0(21) ; CHECK-NEXT: stfd 30, 576(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 31, 584(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 2, 0(25) -; CHECK-NEXT: lxv 1, 0(26) +; CHECK-NEXT: lxv 3, 0(25) +; CHECK-NEXT: lxv 2, 0(26) ; CHECK-NEXT: stxv 52, 208(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 53, 224(1) # 16-byte Folded Spill -; CHECK-NEXT: lxv 0, 0(27) +; CHECK-NEXT: lxv 1, 0(27) +; CHECK-NEXT: lxv 0, 0(28) ; CHECK-NEXT: stxv 54, 240(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 55, 256(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 56, 272(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 57, 288(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 58, 304(1) # 16-byte Folded Spill -; CHECK-NEXT: std 5, 192(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 832(1) ; CHECK-NEXT: stxv 59, 320(1) # 16-byte Folded Spill +; CHECK-NEXT: std 4, 192(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 4, 832(1) ; CHECK-NEXT: stxv 60, 336(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 61, 352(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 62, 368(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 63, 384(1) # 16-byte Folded Spill -; CHECK-NEXT: std 15, 88(1) # 8-byte Folded Spill -; CHECK-NEXT: std 16, 96(1) # 8-byte Folded Spill -; CHECK-NEXT: std 17, 104(1) # 8-byte Folded Spill -; CHECK-NEXT: std 2, 112(1) # 8-byte Folded Spill -; CHECK-NEXT: std 5, 184(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 824(1) -; CHECK-NEXT: std 5, 176(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 168(1) # 8-byte Folded Spill -; CHECK-NEXT: lwa 5, 0(11) -; CHECK-NEXT: li 27, 0 +; 
CHECK-NEXT: std 16, 88(1) # 8-byte Folded Spill +; CHECK-NEXT: std 17, 96(1) # 8-byte Folded Spill +; CHECK-NEXT: std 18, 104(1) # 8-byte Folded Spill +; CHECK-NEXT: std 19, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: std 4, 184(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 4, 824(1) +; CHECK-NEXT: std 4, 176(1) # 8-byte Folded Spill +; CHECK-NEXT: std 28, 168(1) # 8-byte Folded Spill +; CHECK-NEXT: li 4, 9 +; CHECK-NEXT: li 28, 0 ; CHECK-NEXT: ld 7, 176(1) # 8-byte Folded Reload -; CHECK-NEXT: mulli 6, 5, 40 -; CHECK-NEXT: sldi 0, 5, 4 -; CHECK-NEXT: extswsli 14, 5, 3 -; CHECK-NEXT: lxv 40, 0(7) -; CHECK-NEXT: ld 7, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: add 31, 14, 22 -; CHECK-NEXT: add 11, 0, 22 -; CHECK-NEXT: mr 26, 22 -; CHECK-NEXT: addi 3, 11, 32 -; CHECK-NEXT: addi 11, 12, 1 -; CHECK-NEXT: mulli 12, 5, 48 -; CHECK-NEXT: addi 31, 31, 32 -; CHECK-NEXT: add 19, 22, 6 +; CHECK-NEXT: iselgt 3, 3, 4 +; CHECK-NEXT: add 4, 14, 23 +; CHECK-NEXT: add 20, 23, 6 ; CHECK-NEXT: sldi 6, 5, 5 +; CHECK-NEXT: mr 27, 23 +; CHECK-NEXT: addi 11, 3, -2 +; CHECK-NEXT: addi 31, 4, 32 +; CHECK-NEXT: add 3, 15, 23 +; CHECK-NEXT: add 21, 23, 6 +; CHECK-NEXT: rldicl 11, 11, 61, 3 +; CHECK-NEXT: addi 3, 3, 32 +; CHECK-NEXT: lxv 38, 0(7) +; CHECK-NEXT: ld 7, 184(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 4, 11, 1 +; CHECK-NEXT: mulli 11, 5, 48 ; CHECK-NEXT: mulli 5, 5, 24 -; CHECK-NEXT: lxv 41, 0(7) -; CHECK-NEXT: add 20, 22, 6 -; CHECK-NEXT: add 21, 22, 5 +; CHECK-NEXT: add 22, 23, 5 ; CHECK-NEXT: ld 5, 192(1) # 8-byte Folded Reload -; CHECK-NEXT: lxv 43, 0(5) +; CHECK-NEXT: lxv 39, 0(7) +; CHECK-NEXT: lxv 41, 0(5) ; CHECK-NEXT: ld 5, 200(1) # 8-byte Folded Reload -; CHECK-NEXT: lxv 42, 0(5) +; CHECK-NEXT: lxv 40, 0(5) ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: # %_loop_2_do_.lr.ph ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_4 Depth 2 -; CHECK-NEXT: maddld 5, 12, 27, 0 -; CHECK-NEXT: mr 6, 18 -; CHECK-NEXT: mr 29, 21 -; CHECK-NEXT: mr 30, 20 -; CHECK-NEXT: mr 2, 19 -; CHECK-NEXT: mtctr 11 -; CHECK-NEXT: add 25, 22, 5 -; CHECK-NEXT: maddld 5, 12, 27, 14 -; CHECK-NEXT: add 24, 22, 5 -; CHECK-NEXT: mr 5, 26 +; CHECK-NEXT: maddld 5, 11, 28, 15 +; CHECK-NEXT: mr 6, 24 +; CHECK-NEXT: mr 30, 22 +; CHECK-NEXT: mr 2, 21 +; CHECK-NEXT: mr 12, 20 +; CHECK-NEXT: mtctr 4 +; CHECK-NEXT: add 26, 23, 5 +; CHECK-NEXT: maddld 5, 11, 28, 14 +; CHECK-NEXT: add 25, 23, 5 +; CHECK-NEXT: mr 5, 27 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_4: # %_loop_2_do_ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lxvp 34, 0(6) +; CHECK-NEXT: lxvp 42, 0(6) ; CHECK-NEXT: lxvp 44, 0(5) -; CHECK-NEXT: xvmaddadp 39, 45, 35 -; CHECK-NEXT: lxvp 46, 0(24) -; CHECK-NEXT: xvmaddadp 38, 47, 35 -; CHECK-NEXT: lxvp 48, 0(25) -; CHECK-NEXT: lxvp 50, 0(29) -; CHECK-NEXT: lxvp 62, 0(30) -; CHECK-NEXT: lxvp 60, 0(2) +; CHECK-NEXT: xvmaddadp 33, 45, 43 +; CHECK-NEXT: lxvp 46, 0(25) +; CHECK-NEXT: xvmaddadp 32, 47, 43 +; CHECK-NEXT: lxvp 48, 0(26) +; CHECK-NEXT: lxvp 50, 0(30) +; CHECK-NEXT: lxvp 62, 0(2) +; CHECK-NEXT: lxvp 60, 0(12) ; CHECK-NEXT: lxvp 58, 32(6) ; CHECK-NEXT: lxvp 56, 32(5) -; CHECK-NEXT: lxvp 54, 32(24) -; CHECK-NEXT: lxvp 52, 32(25) -; CHECK-NEXT: lxvp 30, 32(29) -; CHECK-NEXT: lxvp 28, 32(30) -; CHECK-NEXT: lxvp 26, 32(2) -; CHECK-NEXT: xvmaddadp 33, 49, 35 -; CHECK-NEXT: xvmaddadp 32, 51, 35 -; CHECK-NEXT: xvmaddadp 37, 63, 35 -; CHECK-NEXT: xvmaddadp 36, 61, 35 -; CHECK-NEXT: xvmaddadp 13, 44, 34 -; CHECK-NEXT: xvmaddadp 12, 46, 34 -; 
CHECK-NEXT: xvmaddadp 11, 48, 34 -; CHECK-NEXT: xvmaddadp 10, 50, 34 -; CHECK-NEXT: xvmaddadp 9, 62, 34 -; CHECK-NEXT: xvmaddadp 8, 60, 34 +; CHECK-NEXT: lxvp 54, 32(25) +; CHECK-NEXT: lxvp 52, 32(26) +; CHECK-NEXT: lxvp 30, 32(30) +; CHECK-NEXT: lxvp 28, 32(2) +; CHECK-NEXT: lxvp 26, 32(12) +; CHECK-NEXT: xvmaddadp 37, 49, 43 +; CHECK-NEXT: xvmaddadp 36, 51, 43 +; CHECK-NEXT: xvmaddadp 35, 63, 43 +; CHECK-NEXT: xvmaddadp 34, 61, 43 +; CHECK-NEXT: xvmaddadp 13, 44, 42 +; CHECK-NEXT: xvmaddadp 12, 46, 42 +; CHECK-NEXT: xvmaddadp 11, 48, 42 +; CHECK-NEXT: xvmaddadp 10, 50, 42 +; CHECK-NEXT: xvmaddadp 9, 62, 42 +; CHECK-NEXT: xvmaddadp 8, 60, 42 ; CHECK-NEXT: xvmaddadp 7, 57, 59 ; CHECK-NEXT: xvmaddadp 6, 55, 59 ; CHECK-NEXT: xvmaddadp 5, 53, 59 @@ -244,34 +243,34 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: xvmaddadp 2, 27, 59 ; CHECK-NEXT: xvmaddadp 1, 56, 58 ; CHECK-NEXT: xvmaddadp 0, 54, 58 -; CHECK-NEXT: xvmaddadp 40, 52, 58 -; CHECK-NEXT: xvmaddadp 41, 30, 58 -; CHECK-NEXT: xvmaddadp 43, 28, 58 -; CHECK-NEXT: xvmaddadp 42, 26, 58 +; CHECK-NEXT: xvmaddadp 38, 52, 58 +; CHECK-NEXT: xvmaddadp 39, 30, 58 +; CHECK-NEXT: xvmaddadp 41, 28, 58 +; CHECK-NEXT: xvmaddadp 40, 26, 58 ; CHECK-NEXT: addi 6, 6, 64 ; CHECK-NEXT: addi 5, 5, 64 -; CHECK-NEXT: addi 24, 24, 64 ; CHECK-NEXT: addi 25, 25, 64 -; CHECK-NEXT: addi 29, 29, 64 +; CHECK-NEXT: addi 26, 26, 64 ; CHECK-NEXT: addi 30, 30, 64 ; CHECK-NEXT: addi 2, 2, 64 +; CHECK-NEXT: addi 12, 12, 64 ; CHECK-NEXT: bdnz .LBB0_4 ; CHECK-NEXT: # %bb.5: # %_loop_2_endl_ ; CHECK-NEXT: # -; CHECK-NEXT: addi 28, 28, 6 -; CHECK-NEXT: add 26, 26, 12 -; CHECK-NEXT: add 31, 31, 12 -; CHECK-NEXT: add 19, 19, 12 -; CHECK-NEXT: add 3, 3, 12 -; CHECK-NEXT: add 20, 20, 12 -; CHECK-NEXT: add 21, 21, 12 -; CHECK-NEXT: addi 27, 27, 1 -; CHECK-NEXT: cmpld 28, 4 +; CHECK-NEXT: addi 29, 29, 6 +; CHECK-NEXT: add 27, 27, 11 +; CHECK-NEXT: add 31, 31, 11 +; CHECK-NEXT: add 20, 20, 11 +; CHECK-NEXT: add 3, 3, 11 +; CHECK-NEXT: add 21, 21, 11 +; CHECK-NEXT: add 22, 22, 11 +; CHECK-NEXT: addi 28, 28, 1 +; CHECK-NEXT: cmpld 29, 0 ; CHECK-NEXT: ble 0, .LBB0_3 ; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit ; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload ; CHECK-NEXT: lxv 63, 384(1) # 16-byte Folded Reload -; CHECK-NEXT: stxv 39, 0(3) +; CHECK-NEXT: stxv 33, 0(3) ; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload ; CHECK-NEXT: lxv 62, 368(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 61, 352(1) # 16-byte Folded Reload @@ -284,7 +283,7 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: lxv 54, 240(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 53, 224(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 52, 208(1) # 16-byte Folded Reload -; CHECK-NEXT: stxv 38, 0(3) +; CHECK-NEXT: stxv 32, 0(3) ; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 31, 584(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 30, 576(1) # 8-byte Folded Reload @@ -297,7 +296,7 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. 
; CHECK-NEXT: ld 29, 520(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 28, 512(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 27, 504(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 33, 0(3) +; CHECK-NEXT: stxv 37, 0(3) ; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 26, 496(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 25, 488(1) # 8-byte Folded Reload @@ -310,10 +309,10 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: ld 18, 432(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 17, 424(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 16, 416(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 32, 0(3) +; CHECK-NEXT: stxv 36, 0(3) ; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 37, 0(10) -; CHECK-NEXT: stxv 36, 0(9) +; CHECK-NEXT: stxv 35, 0(10) +; CHECK-NEXT: stxv 34, 0(9) ; CHECK-NEXT: stxv 13, 0(8) ; CHECK-NEXT: ld 15, 408(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 14, 400(1) # 8-byte Folded Reload @@ -343,13 +342,13 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: ld 3, 168(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 0, 0(3) ; CHECK-NEXT: ld 3, 176(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 40, 0(3) +; CHECK-NEXT: stxv 38, 0(3) ; CHECK-NEXT: ld 3, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 41, 0(3) +; CHECK-NEXT: stxv 39, 0(3) ; CHECK-NEXT: ld 3, 192(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 43, 0(3) +; CHECK-NEXT: stxv 41, 0(3) ; CHECK-NEXT: ld 3, 200(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 42, 0(3) +; CHECK-NEXT: stxv 40, 0(3) ; CHECK-NEXT: addi 1, 1, 592 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll b/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll index 799ba63a4df274..cbf3e96e9ce697 100644 --- a/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll +++ b/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll @@ -6,18 +6,18 @@ define signext i32 @test(ptr noalias %PtrA, ptr noalias %PtrB, i32 signext %LenA ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi 6, 3, 4 ; CHECK-NEXT: addi 4, 4, -4 -; CHECK-NEXT: li 8, 0 ; CHECK-NEXT: li 7, 0 +; CHECK-NEXT: li 8, 0 ; CHECK-NEXT: .LBB0_1: # %block3 ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_2 Depth 2 -; CHECK-NEXT: extsw 9, 8 -; CHECK-NEXT: addi 8, 8, 1 -; CHECK-NEXT: extsw 7, 7 -; CHECK-NEXT: cmpw 8, 5 -; CHECK-NEXT: sldi 10, 7, 2 -; CHECK-NEXT: sldi 9, 9, 2 +; CHECK-NEXT: extsw 9, 7 ; CHECK-NEXT: addi 7, 7, 1 +; CHECK-NEXT: extsw 8, 8 +; CHECK-NEXT: cmpw 7, 5 +; CHECK-NEXT: sldi 10, 8, 2 +; CHECK-NEXT: sldi 9, 9, 2 +; CHECK-NEXT: addi 8, 8, 1 ; CHECK-NEXT: add 10, 4, 10 ; CHECK-NEXT: crnot 20, 0 ; CHECK-NEXT: bc 12, 20, .LBB0_5 @@ -32,7 +32,7 @@ define signext i32 @test(ptr noalias %PtrA, ptr noalias %PtrB, i32 signext %LenA ; CHECK-NEXT: # %bb.3: # %if.then4 ; CHECK-NEXT: # ; CHECK-NEXT: lwzx 12, 6, 9 -; CHECK-NEXT: addi 7, 7, 1 +; CHECK-NEXT: addi 8, 8, 1 ; CHECK-NEXT: stw 12, 8(10) ; CHECK-NEXT: mr 10, 11 ; CHECK-NEXT: bc 4, 20, .LBB0_2 diff --git a/llvm/test/CodeGen/PowerPC/p10-handle-split-promote-vec.ll b/llvm/test/CodeGen/PowerPC/p10-handle-split-promote-vec.ll index 82641312666bcb..303e170f4ab0cb 100644 --- a/llvm/test/CodeGen/PowerPC/p10-handle-split-promote-vec.ll +++ b/llvm/test/CodeGen/PowerPC/p10-handle-split-promote-vec.ll @@ -11,52 +11,52 @@ define i32 @SplitPromoteVectorTest(i32 %Opc) align 2 { ; CHECK-NEXT: plxv v2, .LCPI0_0@PCREL(0), 1 ; CHECK-NEXT: plxv v4, .LCPI0_1@PCREL(0), 1 
; CHECK-NEXT: mtvsrws v3, r3 -; CHECK-NEXT: li r5, 12 +; CHECK-NEXT: li r4, 12 ; CHECK-NEXT: li r8, 0 ; CHECK-NEXT: vcmpequw v2, v3, v2 ; CHECK-NEXT: plxv v5, .LCPI0_2@PCREL(0), 1 ; CHECK-NEXT: vcmpequw v4, v3, v4 ; CHECK-NEXT: vcmpequw v5, v3, v5 -; CHECK-NEXT: vextubrx r4, r5, v2 -; CHECK-NEXT: vextubrx r6, r5, v4 -; CHECK-NEXT: or r9, r6, r4 +; CHECK-NEXT: vextubrx r5, r4, v2 +; CHECK-NEXT: vextubrx r6, r4, v4 +; CHECK-NEXT: or r9, r6, r5 ; CHECK-NEXT: li r6, 4 -; CHECK-NEXT: vextubrx r4, r8, v5 +; CHECK-NEXT: vextubrx r5, r8, v5 ; CHECK-NEXT: vextubrx r7, r6, v5 -; CHECK-NEXT: rlwimi r4, r7, 1, 30, 30 +; CHECK-NEXT: rlwimi r5, r7, 1, 30, 30 ; CHECK-NEXT: li r7, 8 ; CHECK-NEXT: vextubrx r10, r7, v5 -; CHECK-NEXT: rlwimi r4, r10, 2, 29, 29 -; CHECK-NEXT: vextubrx r10, r5, v5 +; CHECK-NEXT: rlwimi r5, r10, 2, 29, 29 +; CHECK-NEXT: vextubrx r10, r4, v5 ; CHECK-NEXT: plxv v5, .LCPI0_3@PCREL(0), 1 -; CHECK-NEXT: rlwimi r4, r10, 3, 28, 28 +; CHECK-NEXT: rlwimi r5, r10, 3, 28, 28 ; CHECK-NEXT: vcmpequw v5, v3, v5 ; CHECK-NEXT: vextubrx r10, r8, v5 -; CHECK-NEXT: rlwimi r4, r10, 4, 27, 27 +; CHECK-NEXT: rlwimi r5, r10, 4, 27, 27 ; CHECK-NEXT: vextubrx r10, r6, v5 -; CHECK-NEXT: rlwimi r4, r10, 5, 26, 26 +; CHECK-NEXT: rlwimi r5, r10, 5, 26, 26 ; CHECK-NEXT: vextubrx r10, r7, v5 -; CHECK-NEXT: rlwimi r4, r10, 6, 25, 25 -; CHECK-NEXT: vextubrx r10, r5, v5 +; CHECK-NEXT: rlwimi r5, r10, 6, 25, 25 +; CHECK-NEXT: vextubrx r10, r4, v5 ; CHECK-NEXT: plxv v5, .LCPI0_4@PCREL(0), 1 -; CHECK-NEXT: rlwimi r4, r10, 7, 24, 24 +; CHECK-NEXT: rlwimi r5, r10, 7, 24, 24 ; CHECK-NEXT: vcmpequw v5, v3, v5 ; CHECK-NEXT: vextubrx r10, r8, v5 -; CHECK-NEXT: rlwimi r4, r10, 8, 23, 23 +; CHECK-NEXT: rlwimi r5, r10, 8, 23, 23 ; CHECK-NEXT: vextubrx r10, r6, v5 -; CHECK-NEXT: rlwimi r4, r10, 9, 22, 22 +; CHECK-NEXT: rlwimi r5, r10, 9, 22, 22 ; CHECK-NEXT: vextubrx r10, r7, v5 -; CHECK-NEXT: rlwimi r4, r10, 10, 21, 21 -; CHECK-NEXT: vextubrx r10, r5, v5 -; CHECK-NEXT: rlwimi r4, r10, 11, 20, 20 +; CHECK-NEXT: rlwimi r5, r10, 10, 21, 21 +; CHECK-NEXT: vextubrx r10, r4, v5 +; CHECK-NEXT: rlwimi r5, r10, 11, 20, 20 ; CHECK-NEXT: vextubrx r10, r8, v4 -; CHECK-NEXT: rlwimi r4, r10, 12, 19, 19 +; CHECK-NEXT: rlwimi r5, r10, 12, 19, 19 ; CHECK-NEXT: vextubrx r10, r6, v4 -; CHECK-NEXT: rlwimi r4, r10, 13, 18, 18 +; CHECK-NEXT: rlwimi r5, r10, 13, 18, 18 ; CHECK-NEXT: vextubrx r10, r7, v4 ; CHECK-NEXT: plxv v4, .LCPI0_5@PCREL(0), 1 -; CHECK-NEXT: rlwimi r4, r10, 14, 17, 17 -; CHECK-NEXT: rlwimi r4, r9, 15, 0, 16 +; CHECK-NEXT: rlwimi r5, r10, 14, 17, 17 +; CHECK-NEXT: rlwimi r5, r9, 15, 0, 16 ; CHECK-NEXT: vcmpequw v4, v3, v4 ; CHECK-NEXT: vextubrx r10, r8, v4 ; CHECK-NEXT: vextubrx r9, r6, v4 @@ -64,7 +64,7 @@ define i32 @SplitPromoteVectorTest(i32 %Opc) align 2 { ; CHECK-NEXT: rlwimi r10, r9, 1, 30, 30 ; CHECK-NEXT: vextubrx r9, r7, v4 ; CHECK-NEXT: rlwimi r10, r9, 2, 29, 29 -; CHECK-NEXT: vextubrx r9, r5, v4 +; CHECK-NEXT: vextubrx r9, r4, v4 ; CHECK-NEXT: plxv v4, .LCPI0_6@PCREL(0), 1 ; CHECK-NEXT: rlwimi r10, r9, 3, 28, 28 ; CHECK-NEXT: vcmpequw v4, v3, v4 @@ -74,25 +74,25 @@ define i32 @SplitPromoteVectorTest(i32 %Opc) align 2 { ; CHECK-NEXT: rlwimi r10, r9, 5, 26, 26 ; CHECK-NEXT: vextubrx r9, r7, v4 ; CHECK-NEXT: rlwimi r10, r9, 6, 25, 25 -; CHECK-NEXT: vextubrx r9, r5, v4 +; CHECK-NEXT: vextubrx r9, r4, v4 ; CHECK-NEXT: plxv v4, .LCPI0_7@PCREL(0), 1 ; CHECK-NEXT: rlwimi r10, r9, 7, 24, 24 ; CHECK-NEXT: vcmpequw v3, v3, v4 ; CHECK-NEXT: vextubrx r9, r8, v3 -; CHECK-NEXT: vextubrx r5, r5, v3 +; CHECK-NEXT: vextubrx r4, r4, 
v3 ; CHECK-NEXT: rlwimi r10, r9, 8, 23, 23 ; CHECK-NEXT: vextubrx r9, r6, v3 ; CHECK-NEXT: rlwimi r10, r9, 9, 22, 22 ; CHECK-NEXT: vextubrx r9, r7, v3 ; CHECK-NEXT: rlwimi r10, r9, 10, 21, 21 -; CHECK-NEXT: rlwimi r10, r5, 11, 20, 20 -; CHECK-NEXT: vextubrx r5, r8, v2 -; CHECK-NEXT: rlwimi r10, r5, 12, 19, 19 -; CHECK-NEXT: vextubrx r5, r6, v2 -; CHECK-NEXT: rlwimi r10, r5, 13, 18, 18 -; CHECK-NEXT: vextubrx r5, r7, v2 -; CHECK-NEXT: rlwimi r10, r5, 14, 17, 17 -; CHECK-NEXT: or r4, r4, r10 +; CHECK-NEXT: rlwimi r10, r4, 11, 20, 20 +; CHECK-NEXT: vextubrx r4, r8, v2 +; CHECK-NEXT: rlwimi r10, r4, 12, 19, 19 +; CHECK-NEXT: vextubrx r4, r6, v2 +; CHECK-NEXT: rlwimi r10, r4, 13, 18, 18 +; CHECK-NEXT: vextubrx r4, r7, v2 +; CHECK-NEXT: rlwimi r10, r4, 14, 17, 17 +; CHECK-NEXT: or r4, r5, r10 ; CHECK-NEXT: andi. r4, r4, 65535 ; CHECK-NEXT: iseleq r3, 0, r3 ; CHECK-NEXT: blr @@ -101,98 +101,98 @@ define i32 @SplitPromoteVectorTest(i32 %Opc) align 2 { ; CHECK-AIX: # %bb.0: # %entry ; CHECK-AIX-NEXT: ld 4, L..C0(2) # %const.0 ; CHECK-AIX-NEXT: mtvsrws 34, 3 -; CHECK-AIX-NEXT: li 8, 15 +; CHECK-AIX-NEXT: li 6, 15 ; CHECK-AIX-NEXT: li 5, 11 ; CHECK-AIX-NEXT: lxv 35, 0(4) ; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 -; CHECK-AIX-NEXT: vextublx 4, 8, 3 -; CHECK-AIX-NEXT: vextublx 6, 5, 3 +; CHECK-AIX-NEXT: vextublx 4, 6, 3 +; CHECK-AIX-NEXT: vextublx 7, 5, 3 ; CHECK-AIX-NEXT: clrlwi 4, 4, 31 -; CHECK-AIX-NEXT: rlwimi 4, 6, 1, 30, 30 -; CHECK-AIX-NEXT: li 6, 7 -; CHECK-AIX-NEXT: vextublx 7, 6, 3 -; CHECK-AIX-NEXT: rlwimi 4, 7, 2, 29, 29 -; CHECK-AIX-NEXT: li 7, 3 -; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 4, 7, 1, 30, 30 +; CHECK-AIX-NEXT: li 7, 7 +; CHECK-AIX-NEXT: vextublx 8, 7, 3 +; CHECK-AIX-NEXT: rlwimi 4, 8, 2, 29, 29 +; CHECK-AIX-NEXT: li 8, 3 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 3, 28, 28 ; CHECK-AIX-NEXT: ld 9, L..C1(2) # %const.1 ; CHECK-AIX-NEXT: lxv 35, 0(9) ; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 -; CHECK-AIX-NEXT: vextublx 9, 8, 3 +; CHECK-AIX-NEXT: vextublx 9, 6, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 4, 27, 27 ; CHECK-AIX-NEXT: vextublx 9, 5, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 5, 26, 26 -; CHECK-AIX-NEXT: vextublx 9, 6, 3 -; CHECK-AIX-NEXT: rlwimi 4, 9, 6, 25, 25 ; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 6, 25, 25 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 7, 24, 24 ; CHECK-AIX-NEXT: ld 9, L..C2(2) # %const.2 ; CHECK-AIX-NEXT: lxv 35, 0(9) ; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 -; CHECK-AIX-NEXT: vextublx 9, 8, 3 +; CHECK-AIX-NEXT: vextublx 9, 6, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 8, 23, 23 ; CHECK-AIX-NEXT: vextublx 9, 5, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 9, 22, 22 -; CHECK-AIX-NEXT: vextublx 9, 6, 3 -; CHECK-AIX-NEXT: rlwimi 4, 9, 10, 21, 21 ; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 10, 21, 21 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 11, 20, 20 ; CHECK-AIX-NEXT: ld 9, L..C3(2) # %const.3 ; CHECK-AIX-NEXT: lxv 35, 0(9) ; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 -; CHECK-AIX-NEXT: vextublx 9, 8, 3 +; CHECK-AIX-NEXT: vextublx 9, 6, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 12, 19, 19 ; CHECK-AIX-NEXT: vextublx 9, 5, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 13, 18, 18 -; CHECK-AIX-NEXT: vextublx 9, 6, 3 -; CHECK-AIX-NEXT: rlwimi 4, 9, 14, 17, 17 ; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 14, 17, 17 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 ; CHECK-AIX-NEXT: rlwimi 4, 9, 15, 16, 16 ; CHECK-AIX-NEXT: ld 9, L..C4(2) # %const.4 ; CHECK-AIX-NEXT: lxv 35, 0(9) ; CHECK-AIX-NEXT: 
vcmpequw 3, 2, 3 -; CHECK-AIX-NEXT: vextublx 9, 8, 3 -; CHECK-AIX-NEXT: vextublx 10, 5, 3 -; CHECK-AIX-NEXT: clrlwi 9, 9, 31 -; CHECK-AIX-NEXT: rlwimi 9, 10, 1, 30, 30 ; CHECK-AIX-NEXT: vextublx 10, 6, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 2, 29, 29 -; CHECK-AIX-NEXT: vextublx 10, 7, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 3, 28, 28 -; CHECK-AIX-NEXT: ld 10, L..C5(2) # %const.5 -; CHECK-AIX-NEXT: lxv 35, 0(10) +; CHECK-AIX-NEXT: vextublx 9, 5, 3 +; CHECK-AIX-NEXT: clrlwi 10, 10, 31 +; CHECK-AIX-NEXT: rlwimi 10, 9, 1, 30, 30 +; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 2, 29, 29 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 3, 28, 28 +; CHECK-AIX-NEXT: ld 9, L..C5(2) # %const.5 +; CHECK-AIX-NEXT: lxv 35, 0(9) ; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 -; CHECK-AIX-NEXT: vextublx 10, 8, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 4, 27, 27 -; CHECK-AIX-NEXT: vextublx 10, 5, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 5, 26, 26 -; CHECK-AIX-NEXT: vextublx 10, 6, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 6, 25, 25 -; CHECK-AIX-NEXT: vextublx 10, 7, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 7, 24, 24 -; CHECK-AIX-NEXT: ld 10, L..C6(2) # %const.6 -; CHECK-AIX-NEXT: lxv 35, 0(10) +; CHECK-AIX-NEXT: vextublx 9, 6, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 4, 27, 27 +; CHECK-AIX-NEXT: vextublx 9, 5, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 5, 26, 26 +; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 6, 25, 25 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 7, 24, 24 +; CHECK-AIX-NEXT: ld 9, L..C6(2) # %const.6 +; CHECK-AIX-NEXT: lxv 35, 0(9) ; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 -; CHECK-AIX-NEXT: vextublx 10, 8, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 8, 23, 23 -; CHECK-AIX-NEXT: vextublx 10, 5, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 9, 22, 22 -; CHECK-AIX-NEXT: vextublx 10, 6, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 10, 21, 21 -; CHECK-AIX-NEXT: vextublx 10, 7, 3 -; CHECK-AIX-NEXT: rlwimi 9, 10, 11, 20, 20 -; CHECK-AIX-NEXT: ld 10, L..C7(2) # %const.7 -; CHECK-AIX-NEXT: lxv 35, 0(10) +; CHECK-AIX-NEXT: vextublx 9, 6, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 8, 23, 23 +; CHECK-AIX-NEXT: vextublx 9, 5, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 9, 22, 22 +; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 10, 21, 21 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 +; CHECK-AIX-NEXT: rlwimi 10, 9, 11, 20, 20 +; CHECK-AIX-NEXT: ld 9, L..C7(2) # %const.7 +; CHECK-AIX-NEXT: lxv 35, 0(9) ; CHECK-AIX-NEXT: vcmpequw 2, 2, 3 -; CHECK-AIX-NEXT: vextublx 8, 8, 2 +; CHECK-AIX-NEXT: vextublx 6, 6, 2 ; CHECK-AIX-NEXT: vextublx 5, 5, 2 -; CHECK-AIX-NEXT: rlwimi 9, 8, 12, 19, 19 -; CHECK-AIX-NEXT: rlwimi 9, 5, 13, 18, 18 -; CHECK-AIX-NEXT: vextublx 5, 6, 2 -; CHECK-AIX-NEXT: rlwimi 9, 5, 14, 17, 17 +; CHECK-AIX-NEXT: rlwimi 10, 6, 12, 19, 19 +; CHECK-AIX-NEXT: rlwimi 10, 5, 13, 18, 18 ; CHECK-AIX-NEXT: vextublx 5, 7, 2 -; CHECK-AIX-NEXT: rlwimi 9, 5, 15, 16, 16 -; CHECK-AIX-NEXT: or 4, 9, 4 +; CHECK-AIX-NEXT: rlwimi 10, 5, 14, 17, 17 +; CHECK-AIX-NEXT: vextublx 5, 8, 2 +; CHECK-AIX-NEXT: rlwimi 10, 5, 15, 16, 16 +; CHECK-AIX-NEXT: or 4, 10, 4 ; CHECK-AIX-NEXT: andi. 
4, 4, 65535 ; CHECK-AIX-NEXT: iseleq 3, 0, 3 ; CHECK-AIX-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll index ac9641ff35b0cb..b2fee2cd5518fb 100644 --- a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll +++ b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll @@ -61,16 +61,16 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 { ; CHECK-NEXT: cmpwi cr6, r4, 0 ; CHECK-NEXT: crand 4*cr5+gt, 4*cr2+gt, 4*cr1+lt ; CHECK-NEXT: crand 4*cr5+lt, 4*cr3+gt, 4*cr5+un -; CHECK-NEXT: # implicit-def: $x3 +; CHECK-NEXT: # implicit-def: $x4 ; CHECK-NEXT: bc 4, 4*cr5+gt, .LBB0_10 ; CHECK-NEXT: # %bb.9: # %bb34 -; CHECK-NEXT: ld r3, 0(r3) +; CHECK-NEXT: ld r4, 0(r3) ; CHECK-NEXT: .LBB0_10: # %bb36 ; CHECK-NEXT: cmpwi cr2, r5, 0 -; CHECK-NEXT: # implicit-def: $x4 +; CHECK-NEXT: # implicit-def: $x3 ; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_12 ; CHECK-NEXT: # %bb.11: # %bb38 -; CHECK-NEXT: ld r4, 0(r3) +; CHECK-NEXT: ld r3, 0(r3) ; CHECK-NEXT: .LBB0_12: # %bb40 ; CHECK-NEXT: crand 4*cr6+gt, 4*cr7+lt, 4*cr1+lt ; CHECK-NEXT: crand 4*cr6+lt, 4*cr6+lt, 4*cr5+un @@ -81,7 +81,7 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 { ; CHECK-NEXT: ld r6, 0(r3) ; CHECK-NEXT: .LBB0_14: # %bb50 ; CHECK-NEXT: cmpwi cr3, r5, -1 -; CHECK-NEXT: crand 4*cr7+lt, 4*cr2+lt, 4*cr6+un +; CHECK-NEXT: crand 4*cr7+gt, 4*cr2+lt, 4*cr6+un ; CHECK-NEXT: # implicit-def: $r5 ; CHECK-NEXT: bc 4, 4*cr6+gt, .LBB0_16 ; CHECK-NEXT: # %bb.15: # %bb52 @@ -90,7 +90,7 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 { ; CHECK-NEXT: mfocrf r7, 128 ; CHECK-NEXT: stw r7, -4(r1) ; CHECK-NEXT: # implicit-def: $r7 -; CHECK-NEXT: bc 4, 4*cr7+lt, .LBB0_18 +; CHECK-NEXT: bc 4, 4*cr7+gt, .LBB0_18 ; CHECK-NEXT: # %bb.17: # %bb56 ; CHECK-NEXT: lwz r7, 0(r3) ; CHECK-NEXT: .LBB0_18: # %bb58 @@ -98,12 +98,12 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 { ; CHECK-NEXT: crand 4*cr7+un, 4*cr3+gt, 4*cr6+un ; CHECK-NEXT: cmpwi cr3, r5, 1 ; CHECK-NEXT: cmpwi cr4, r7, 1 -; CHECK-NEXT: crand 4*cr7+gt, 4*cr7+eq, 4*cr1+lt +; CHECK-NEXT: crand 4*cr7+lt, 4*cr7+eq, 4*cr1+lt ; CHECK-NEXT: # implicit-def: $x5 ; CHECK-NEXT: crand 4*cr6+un, 4*cr2+eq, 4*cr6+un ; CHECK-NEXT: crand 4*cr5+un, 4*cr6+eq, 4*cr5+un ; CHECK-NEXT: crand 4*cr6+gt, 4*cr3+lt, 4*cr6+gt -; CHECK-NEXT: crand 4*cr7+lt, 4*cr4+lt, 4*cr7+lt +; CHECK-NEXT: crand 4*cr7+gt, 4*cr4+lt, 4*cr7+gt ; CHECK-NEXT: cmpwi r6, 1 ; CHECK-NEXT: crand 4*cr6+lt, lt, 4*cr6+lt ; CHECK-NEXT: bc 4, 4*cr6+gt, .LBB0_20 @@ -119,27 +119,27 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 { ; CHECK-NEXT: setbc r8, 4*cr5+un ; CHECK-NEXT: lwz r12, 8(r1) ; CHECK-NEXT: xxlxor f2, f2, f2 -; CHECK-NEXT: isel r3, r3, r5, 4*cr5+gt -; CHECK-NEXT: setbc r5, 4*cr7+gt +; CHECK-NEXT: isel r4, r4, r5, 4*cr5+gt +; CHECK-NEXT: setbc r5, 4*cr7+lt ; CHECK-NEXT: crnor 4*cr5+gt, 4*cr6+gt, 4*cr5+gt -; CHECK-NEXT: crnor 4*cr6+gt, 4*cr7+lt, 4*cr7+eq +; CHECK-NEXT: crnor 4*cr6+gt, 4*cr7+gt, 4*cr7+eq ; CHECK-NEXT: crnor 4*cr5+lt, 4*cr6+lt, 4*cr5+lt ; CHECK-NEXT: add r5, r7, r5 ; CHECK-NEXT: add r5, r8, r5 -; CHECK-NEXT: isel r3, 0, r3, 4*cr5+gt -; CHECK-NEXT: isel r4, 0, r4, 4*cr5+lt +; CHECK-NEXT: isel r4, 0, r4, 4*cr5+gt +; CHECK-NEXT: isel r3, 0, r3, 4*cr5+lt ; CHECK-NEXT: isel r6, 0, r6, 4*cr6+gt ; CHECK-NEXT: mtocrf 128, r9 ; CHECK-NEXT: mtfprd f0, r5 -; CHECK-NEXT: isel r4, 0, r4, 4*cr5+eq +; CHECK-NEXT: isel r3, 0, r3, 4*cr5+eq ; CHECK-NEXT: mtocrf 32, r12 ; CHECK-NEXT: mtocrf 16, 
r12 ; CHECK-NEXT: mtocrf 8, r12 -; CHECK-NEXT: iseleq r3, 0, r3 +; CHECK-NEXT: iseleq r4, 0, r4 ; CHECK-NEXT: isel r6, 0, r6, 4*cr1+eq ; CHECK-NEXT: xscvsxddp f0, f0 -; CHECK-NEXT: add r3, r6, r3 -; CHECK-NEXT: add r3, r4, r3 +; CHECK-NEXT: add r4, r6, r4 +; CHECK-NEXT: add r3, r3, r4 ; CHECK-NEXT: mtfprd f1, r3 ; CHECK-NEXT: xsmuldp f0, f0, f2 ; CHECK-NEXT: xscvsxddp f1, f1 diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll index 7a6640fea2d1e4..cf1f6a0bfda401 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll @@ -280,24 +280,24 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: std r25, -56(r1) # 8-byte Folded Spill ; CHECK-PWR9-LE-NEXT: clrlwi r6, r6, 24 ; CHECK-PWR9-LE-NEXT: clrlwi r3, r3, 24 -; CHECK-PWR9-LE-NEXT: clrlwi r8, r8, 24 -; CHECK-PWR9-LE-NEXT: clrlwi r5, r5, 24 ; CHECK-PWR9-LE-NEXT: vextubrx r7, r4, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r4, r4, v3 +; CHECK-PWR9-LE-NEXT: clrlwi r8, r8, 24 ; CHECK-PWR9-LE-NEXT: sub r3, r6, r3 -; CHECK-PWR9-LE-NEXT: sub r6, r8, r5 +; CHECK-PWR9-LE-NEXT: clrlwi r5, r5, 24 ; CHECK-PWR9-LE-NEXT: clrlwi r7, r7, 24 ; CHECK-PWR9-LE-NEXT: clrlwi r4, r4, 24 +; CHECK-PWR9-LE-NEXT: sub r5, r8, r5 ; CHECK-PWR9-LE-NEXT: sub r4, r7, r4 -; CHECK-PWR9-LE-NEXT: srawi r5, r3, 31 +; CHECK-PWR9-LE-NEXT: srawi r6, r3, 31 ; CHECK-PWR9-LE-NEXT: srawi r7, r4, 31 -; CHECK-PWR9-LE-NEXT: xor r3, r3, r5 +; CHECK-PWR9-LE-NEXT: xor r3, r3, r6 ; CHECK-PWR9-LE-NEXT: xor r4, r4, r7 -; CHECK-PWR9-LE-NEXT: sub r5, r3, r5 -; CHECK-PWR9-LE-NEXT: srawi r3, r6, 31 +; CHECK-PWR9-LE-NEXT: sub r3, r3, r6 +; CHECK-PWR9-LE-NEXT: srawi r6, r5, 31 ; CHECK-PWR9-LE-NEXT: sub r4, r4, r7 -; CHECK-PWR9-LE-NEXT: xor r6, r6, r3 -; CHECK-PWR9-LE-NEXT: sub r3, r6, r3 +; CHECK-PWR9-LE-NEXT: xor r5, r5, r6 +; CHECK-PWR9-LE-NEXT: sub r5, r5, r6 ; CHECK-PWR9-LE-NEXT: li r6, 3 ; CHECK-PWR9-LE-NEXT: vextubrx r7, r6, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r6, r6, v3 @@ -411,10 +411,10 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: li r26, 15 ; CHECK-PWR9-LE-NEXT: vextubrx r25, r26, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r26, r26, v3 -; CHECK-PWR9-LE-NEXT: mtvsrd v2, r5 +; CHECK-PWR9-LE-NEXT: mtvsrd v2, r3 ; CHECK-PWR9-LE-NEXT: mtvsrd v3, r4 ; CHECK-PWR9-LE-NEXT: vmrghb v2, v3, v2 -; CHECK-PWR9-LE-NEXT: mtvsrd v3, r3 +; CHECK-PWR9-LE-NEXT: mtvsrd v3, r5 ; CHECK-PWR9-LE-NEXT: clrlwi r25, r25, 24 ; CHECK-PWR9-LE-NEXT: clrlwi r26, r26, 24 ; CHECK-PWR9-LE-NEXT: vmrghb v3, v4, v3 @@ -656,91 +656,83 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: std r26, -48(r1) # 8-byte Folded Spill -; CHECK-PWR8-NEXT: mffprd r11, f0 -; CHECK-PWR8-NEXT: mffprd r8, f1 +; CHECK-PWR8-NEXT: mffprd r10, f0 +; CHECK-PWR8-NEXT: mffprd r11, f1 ; CHECK-PWR8-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: std r25, -56(r1) # 8-byte Folded Spill -; CHECK-PWR8-NEXT: clrldi r3, r11, 56 -; CHECK-PWR8-NEXT: clrldi r4, r8, 56 -; CHECK-PWR8-NEXT: rldicl r5, r11, 56, 56 -; CHECK-PWR8-NEXT: rldicl r6, r8, 56, 56 -; CHECK-PWR8-NEXT: rldicl r7, r11, 48, 56 -; CHECK-PWR8-NEXT: rldicl r9, r8, 48, 56 -; CHECK-PWR8-NEXT: rldicl r0, r11, 32, 56 -; CHECK-PWR8-NEXT: rldicl r30, r8, 32, 56 -; CHECK-PWR8-NEXT: rldicl r29, r11, 24, 56 -; CHECK-PWR8-NEXT: rldicl r28, r8, 
24, 56 -; CHECK-PWR8-NEXT: rldicl r10, r11, 40, 56 -; CHECK-PWR8-NEXT: rldicl r12, r8, 40, 56 -; CHECK-PWR8-NEXT: rldicl r27, r11, 16, 56 -; CHECK-PWR8-NEXT: rldicl r11, r11, 8, 56 +; CHECK-PWR8-NEXT: clrldi r3, r10, 56 +; CHECK-PWR8-NEXT: clrldi r4, r11, 56 +; CHECK-PWR8-NEXT: rldicl r5, r10, 56, 56 +; CHECK-PWR8-NEXT: rldicl r6, r11, 56, 56 +; CHECK-PWR8-NEXT: rldicl r7, r10, 48, 56 +; CHECK-PWR8-NEXT: rldicl r8, r11, 48, 56 +; CHECK-PWR8-NEXT: rldicl r9, r10, 40, 56 +; CHECK-PWR8-NEXT: rldicl r12, r11, 40, 56 +; CHECK-PWR8-NEXT: rldicl r0, r10, 32, 56 +; CHECK-PWR8-NEXT: rldicl r30, r11, 32, 56 +; CHECK-PWR8-NEXT: rldicl r29, r10, 24, 56 +; CHECK-PWR8-NEXT: rldicl r28, r11, 24, 56 +; CHECK-PWR8-NEXT: rldicl r27, r10, 16, 56 +; CHECK-PWR8-NEXT: rldicl r10, r10, 8, 56 ; CHECK-PWR8-NEXT: std r24, -64(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: clrlwi r3, r3, 24 ; CHECK-PWR8-NEXT: clrlwi r4, r4, 24 ; CHECK-PWR8-NEXT: clrlwi r5, r5, 24 ; CHECK-PWR8-NEXT: clrlwi r6, r6, 24 ; CHECK-PWR8-NEXT: clrlwi r7, r7, 24 -; CHECK-PWR8-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR8-NEXT: clrlwi r8, r8, 24 ; CHECK-PWR8-NEXT: sub r3, r3, r4 +; CHECK-PWR8-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR8-NEXT: clrlwi r12, r12, 24 ; CHECK-PWR8-NEXT: clrlwi r0, r0, 24 ; CHECK-PWR8-NEXT: clrlwi r30, r30, 24 ; CHECK-PWR8-NEXT: sub r4, r5, r6 -; CHECK-PWR8-NEXT: sub r5, r7, r9 +; CHECK-PWR8-NEXT: sub r5, r7, r8 +; CHECK-PWR8-NEXT: sub r6, r9, r12 +; CHECK-PWR8-NEXT: sub r7, r0, r30 ; CHECK-PWR8-NEXT: clrlwi r29, r29, 24 ; CHECK-PWR8-NEXT: clrlwi r28, r28, 24 -; CHECK-PWR8-NEXT: sub r7, r0, r30 -; CHECK-PWR8-NEXT: sub r9, r29, r28 +; CHECK-PWR8-NEXT: sub r8, r29, r28 ; CHECK-PWR8-NEXT: clrlwi r10, r10, 24 -; CHECK-PWR8-NEXT: clrlwi r12, r12, 24 -; CHECK-PWR8-NEXT: sub r6, r10, r12 ; CHECK-PWR8-NEXT: clrlwi r27, r27, 24 -; CHECK-PWR8-NEXT: clrlwi r11, r11, 24 -; CHECK-PWR8-NEXT: srawi r0, r5, 31 +; CHECK-PWR8-NEXT: srawi r9, r3, 31 ; CHECK-PWR8-NEXT: srawi r29, r7, 31 +; CHECK-PWR8-NEXT: srawi r0, r5, 31 ; CHECK-PWR8-NEXT: srawi r12, r4, 31 -; CHECK-PWR8-NEXT: srawi r28, r9, 31 +; CHECK-PWR8-NEXT: srawi r28, r8, 31 ; CHECK-PWR8-NEXT: srawi r30, r6, 31 -; CHECK-PWR8-NEXT: srawi r10, r3, 31 -; CHECK-PWR8-NEXT: xor r5, r5, r0 +; CHECK-PWR8-NEXT: xor r3, r3, r9 ; CHECK-PWR8-NEXT: xor r26, r7, r29 -; CHECK-PWR8-NEXT: sub r7, r5, r0 -; CHECK-PWR8-NEXT: rldicl r5, r8, 16, 56 -; CHECK-PWR8-NEXT: rldicl r8, r8, 8, 56 +; CHECK-PWR8-NEXT: sub r7, r3, r9 +; CHECK-PWR8-NEXT: sub r3, r26, r29 +; CHECK-PWR8-NEXT: rldicl r9, r11, 16, 56 +; CHECK-PWR8-NEXT: rldicl r11, r11, 8, 56 +; CHECK-PWR8-NEXT: xor r5, r5, r0 +; CHECK-PWR8-NEXT: sub r5, r5, r0 +; CHECK-PWR8-NEXT: mfvsrd r0, v3 ; CHECK-PWR8-NEXT: xor r4, r4, r12 -; CHECK-PWR8-NEXT: xor r25, r9, r28 -; CHECK-PWR8-NEXT: sub r9, r4, r12 -; CHECK-PWR8-NEXT: sub r4, r26, r29 -; CHECK-PWR8-NEXT: mtvsrd v1, r9 -; CHECK-PWR8-NEXT: clrlwi r5, r5, 24 -; CHECK-PWR8-NEXT: sub r5, r27, r5 -; CHECK-PWR8-NEXT: clrlwi r8, r8, 24 -; CHECK-PWR8-NEXT: sub r8, r11, r8 +; CHECK-PWR8-NEXT: xor r25, r8, r28 +; CHECK-PWR8-NEXT: sub r8, r4, r12 +; CHECK-PWR8-NEXT: sub r4, r25, r28 +; CHECK-PWR8-NEXT: mtvsrd v1, r8 +; CHECK-PWR8-NEXT: mtvsrd v7, r4 +; CHECK-PWR8-NEXT: clrlwi r11, r11, 24 +; CHECK-PWR8-NEXT: sub r10, r10, r11 +; CHECK-PWR8-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR8-NEXT: sub r9, r27, r9 ; CHECK-PWR8-NEXT: xor r6, r6, r30 ; CHECK-PWR8-NEXT: sub r6, r6, r30 -; CHECK-PWR8-NEXT: xor r3, r3, r10 -; CHECK-PWR8-NEXT: sub r10, r3, r10 -; CHECK-PWR8-NEXT: sub r3, r25, r28 ; CHECK-PWR8-NEXT: mtvsrd v6, r6 -; 
CHECK-PWR8-NEXT: mtvsrd v7, r3 -; CHECK-PWR8-NEXT: srawi r12, r5, 31 -; CHECK-PWR8-NEXT: srawi r11, r8, 31 -; CHECK-PWR8-NEXT: xor r5, r5, r12 -; CHECK-PWR8-NEXT: xor r8, r8, r11 -; CHECK-PWR8-NEXT: sub r5, r5, r12 -; CHECK-PWR8-NEXT: sub r8, r8, r11 -; CHECK-PWR8-NEXT: mfvsrd r11, v2 -; CHECK-PWR8-NEXT: mfvsrd r12, v3 -; CHECK-PWR8-NEXT: mtvsrd v8, r8 -; CHECK-PWR8-NEXT: clrldi r0, r11, 56 -; CHECK-PWR8-NEXT: clrldi r30, r12, 56 -; CHECK-PWR8-NEXT: rldicl r29, r12, 56, 56 -; CHECK-PWR8-NEXT: rldicl r28, r12, 48, 56 -; CHECK-PWR8-NEXT: rldicl r27, r12, 40, 56 -; CHECK-PWR8-NEXT: rldicl r26, r12, 32, 56 -; CHECK-PWR8-NEXT: rldicl r25, r12, 24, 56 -; CHECK-PWR8-NEXT: rldicl r24, r12, 16, 56 -; CHECK-PWR8-NEXT: rldicl r12, r12, 8, 56 -; CHECK-PWR8-NEXT: clrlwi r0, r0, 24 +; CHECK-PWR8-NEXT: clrldi r30, r0, 56 +; CHECK-PWR8-NEXT: rldicl r29, r0, 56, 56 +; CHECK-PWR8-NEXT: rldicl r28, r0, 48, 56 +; CHECK-PWR8-NEXT: rldicl r27, r0, 40, 56 +; CHECK-PWR8-NEXT: rldicl r26, r0, 32, 56 +; CHECK-PWR8-NEXT: rldicl r25, r0, 24, 56 +; CHECK-PWR8-NEXT: rldicl r24, r0, 16, 56 +; CHECK-PWR8-NEXT: rldicl r0, r0, 8, 56 +; CHECK-PWR8-NEXT: srawi r11, r10, 31 +; CHECK-PWR8-NEXT: srawi r12, r9, 31 ; CHECK-PWR8-NEXT: clrlwi r30, r30, 24 ; CHECK-PWR8-NEXT: clrlwi r29, r29, 24 ; CHECK-PWR8-NEXT: clrlwi r28, r28, 24 @@ -748,14 +740,22 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: clrlwi r26, r26, 24 ; CHECK-PWR8-NEXT: clrlwi r25, r25, 24 ; CHECK-PWR8-NEXT: clrlwi r24, r24, 24 +; CHECK-PWR8-NEXT: clrlwi r0, r0, 24 +; CHECK-PWR8-NEXT: xor r10, r10, r11 +; CHECK-PWR8-NEXT: sub r10, r10, r11 +; CHECK-PWR8-NEXT: mfvsrd r11, v2 +; CHECK-PWR8-NEXT: mtvsrd v8, r10 +; CHECK-PWR8-NEXT: xor r9, r9, r12 +; CHECK-PWR8-NEXT: sub r9, r9, r12 +; CHECK-PWR8-NEXT: clrldi r12, r11, 56 ; CHECK-PWR8-NEXT: clrlwi r12, r12, 24 -; CHECK-PWR8-NEXT: sub r0, r0, r30 -; CHECK-PWR8-NEXT: srawi r30, r0, 31 -; CHECK-PWR8-NEXT: xor r0, r0, r30 -; CHECK-PWR8-NEXT: sub r0, r0, r30 +; CHECK-PWR8-NEXT: sub r12, r12, r30 +; CHECK-PWR8-NEXT: srawi r30, r12, 31 +; CHECK-PWR8-NEXT: xor r12, r12, r30 +; CHECK-PWR8-NEXT: sub r12, r12, r30 ; CHECK-PWR8-NEXT: rldicl r30, r11, 56, 56 ; CHECK-PWR8-NEXT: clrlwi r30, r30, 24 -; CHECK-PWR8-NEXT: mtvsrd v2, r0 +; CHECK-PWR8-NEXT: mtvsrd v2, r12 ; CHECK-PWR8-NEXT: sub r30, r30, r29 ; CHECK-PWR8-NEXT: srawi r29, r30, 31 ; CHECK-PWR8-NEXT: xor r30, r30, r29 @@ -795,13 +795,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: mtvsrd v5, r26 ; CHECK-PWR8-NEXT: ld r26, -48(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: sub r25, r25, r24 -; CHECK-PWR8-NEXT: sub r11, r11, r12 +; CHECK-PWR8-NEXT: sub r11, r11, r0 ; CHECK-PWR8-NEXT: srawi r24, r25, 31 -; CHECK-PWR8-NEXT: srawi r12, r11, 31 +; CHECK-PWR8-NEXT: srawi r0, r11, 31 ; CHECK-PWR8-NEXT: xor r25, r25, r24 -; CHECK-PWR8-NEXT: xor r11, r11, r12 +; CHECK-PWR8-NEXT: xor r11, r11, r0 ; CHECK-PWR8-NEXT: sub r25, r25, r24 -; CHECK-PWR8-NEXT: sub r11, r11, r12 +; CHECK-PWR8-NEXT: sub r11, r11, r0 ; CHECK-PWR8-NEXT: ld r24, -64(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: mtvsrd v0, r11 ; CHECK-PWR8-NEXT: vmrghb v2, v3, v2 @@ -815,16 +815,16 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: mtvsrd v5, r25 ; CHECK-PWR8-NEXT: ld r25, -56(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: vmrghb v5, v0, v5 -; CHECK-PWR8-NEXT: mtvsrd v0, r10 +; CHECK-PWR8-NEXT: mtvsrd v0, r7 ; CHECK-PWR8-NEXT: vmrglh v3, v5, v4 ; 
CHECK-PWR8-NEXT: xxmrglw vs0, v3, v2 ; CHECK-PWR8-NEXT: vmrghb v0, v1, v0 -; CHECK-PWR8-NEXT: mtvsrd v1, r7 +; CHECK-PWR8-NEXT: mtvsrd v1, r5 ; CHECK-PWR8-NEXT: vmrghb v1, v6, v1 -; CHECK-PWR8-NEXT: mtvsrd v6, r4 +; CHECK-PWR8-NEXT: mtvsrd v6, r3 ; CHECK-PWR8-NEXT: vmrglh v4, v1, v0 ; CHECK-PWR8-NEXT: vmrghb v6, v7, v6 -; CHECK-PWR8-NEXT: mtvsrd v7, r5 +; CHECK-PWR8-NEXT: mtvsrd v7, r9 ; CHECK-PWR8-NEXT: vmrghb v7, v8, v7 ; CHECK-PWR8-NEXT: vmrglh v5, v7, v6 ; CHECK-PWR8-NEXT: xxmrglw vs1, v5, v4 diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll index f699ea54192d88..4740fefcece6e7 100644 --- a/llvm/test/CodeGen/PowerPC/sat-add.ll +++ b/llvm/test/CodeGen/PowerPC/sat-add.ll @@ -739,27 +739,27 @@ declare <4 x i128> @llvm.sadd.sat.v4i128(<4 x i128> %a, <4 x i128> %b); define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { ; CHECK-LABEL: sadd: ; CHECK: # %bb.0: -; CHECK-NEXT: vadduqm 0, 2, 6 +; CHECK-NEXT: vadduqm 1, 2, 6 ; CHECK-NEXT: vadduqm 10, 4, 8 ; CHECK-NEXT: mfocrf 12, 32 ; CHECK-NEXT: stw 12, 8(1) ; CHECK-NEXT: xxswapd 0, 34 ; CHECK-NEXT: xxswapd 4, 36 -; CHECK-NEXT: vadduqm 1, 3, 7 -; CHECK-NEXT: vadduqm 11, 5, 9 +; CHECK-NEXT: vadduqm 11, 3, 7 +; CHECK-NEXT: vadduqm 0, 5, 9 ; CHECK-NEXT: mffprd 3, 0 ; CHECK-NEXT: mffprd 6, 4 ; CHECK-NEXT: lwz 12, 8(1) ; CHECK-NEXT: xxswapd 2, 35 ; CHECK-NEXT: xxswapd 5, 37 ; CHECK-NEXT: mffprd 4, 2 -; CHECK-NEXT: xxswapd 1, 32 +; CHECK-NEXT: xxswapd 1, 33 ; CHECK-NEXT: xxswapd 6, 42 ; CHECK-NEXT: mffprd 5, 1 ; CHECK-NEXT: cmpld 6, 5, 3 ; CHECK-NEXT: mffprd 7, 6 -; CHECK-NEXT: xxswapd 3, 33 -; CHECK-NEXT: xxswapd 7, 43 +; CHECK-NEXT: xxswapd 3, 43 +; CHECK-NEXT: xxswapd 7, 32 ; CHECK-NEXT: mffprd 3, 3 ; CHECK-NEXT: cmpld 5, 7, 6 ; CHECK-NEXT: mffprd 6, 5 @@ -768,12 +768,12 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { ; CHECK-NEXT: cmpld 3, 4 ; CHECK-NEXT: mfvsrd 3, 34 ; CHECK-NEXT: cmpld 1, 7, 6 -; CHECK-NEXT: mfvsrd 7, 32 +; CHECK-NEXT: mfvsrd 7, 33 ; CHECK-NEXT: mfvsrd 4, 35 ; CHECK-NEXT: mfvsrd 6, 37 ; CHECK-NEXT: cmpld 7, 7, 3 ; CHECK-NEXT: cmpd 2, 7, 3 -; CHECK-NEXT: mfvsrd 3, 33 +; CHECK-NEXT: mfvsrd 3, 43 ; CHECK-NEXT: crandc 21, 8, 30 ; CHECK-NEXT: crand 22, 30, 24 ; CHECK-NEXT: cmpld 6, 3, 4 @@ -786,7 +786,7 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { ; CHECK-NEXT: crand 24, 26, 0 ; CHECK-NEXT: cmpld 4, 5 ; CHECK-NEXT: cmpd 7, 4, 5 -; CHECK-NEXT: mfvsrd 5, 43 +; CHECK-NEXT: mfvsrd 5, 32 ; CHECK-NEXT: crnor 22, 24, 23 ; CHECK-NEXT: mtfprd 5, 3 ; CHECK-NEXT: sradi 4, 4, 63 @@ -844,14 +844,14 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { ; CHECK-NEXT: xxswapd 37, 9 ; CHECK-NEXT: xxlxor 2, 39, 37 ; CHECK-NEXT: xxlxor 3, 40, 37 -; CHECK-NEXT: xxsel 34, 32, 2, 0 -; CHECK-NEXT: xxsel 35, 33, 3, 1 +; CHECK-NEXT: xxsel 34, 33, 2, 0 +; CHECK-NEXT: xxsel 35, 43, 3, 1 ; CHECK-NEXT: xxlxor 0, 36, 47 ; CHECK-NEXT: xxlxor 1, 45, 37 ; CHECK-NEXT: xxsel 36, 42, 1, 0 ; CHECK-NEXT: xxlxor 0, 38, 48 ; CHECK-NEXT: xxlxor 1, 46, 37 -; CHECK-NEXT: xxsel 37, 43, 1, 0 +; CHECK-NEXT: xxsel 37, 32, 1, 0 ; CHECK-NEXT: blr %c = call <4 x i128> @llvm.sadd.sat.v4i128(<4 x i128> %a, <4 x i128> %b) ret <4 x i128> %c diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll index 4cd60c69da30e8..b6a87e0f23b986 100644 --- a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll @@ -19,34 +19,34 @@ define void @phi3(i32*) nounwind { ; CHECK-NEXT: mr 29, 3 ; 
CHECK-NEXT: bl malloc ; CHECK-NEXT: nop -; CHECK-NEXT: addi 7, 30, -4 +; CHECK-NEXT: addi 6, 30, -4 ; CHECK-NEXT: mtctr 3 ; CHECK-NEXT: addi 4, 29, -8 ; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: lwzu 8, 4(7) +; CHECK-NEXT: lwzu 8, 4(6) ; CHECK-NEXT: bdz .LBB0_5 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: extswsli 6, 5, 5 +; CHECK-NEXT: extswsli 7, 5, 5 ; CHECK-NEXT: add 5, 8, 5 -; CHECK-NEXT: lwzu 8, 4(7) +; CHECK-NEXT: lwzu 8, 4(6) ; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: add 6, 3, 6 -; CHECK-NEXT: stdu 6, 8(4) -; CHECK-NEXT: extswsli 6, 5, 5 +; CHECK-NEXT: add 7, 3, 7 +; CHECK-NEXT: stdu 7, 8(4) +; CHECK-NEXT: extswsli 7, 5, 5 ; CHECK-NEXT: add 5, 8, 5 -; CHECK-NEXT: lwzu 8, 4(7) +; CHECK-NEXT: lwzu 8, 4(6) ; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: add 9, 3, 6 -; CHECK-NEXT: extswsli 6, 5, 5 +; CHECK-NEXT: add 9, 3, 7 +; CHECK-NEXT: extswsli 7, 5, 5 ; CHECK-NEXT: add 5, 8, 5 -; CHECK-NEXT: lwzu 8, 4(7) +; CHECK-NEXT: lwzu 8, 4(6) ; CHECK-NEXT: stdu 9, 8(4) ; CHECK-NEXT: bdnz .LBB0_3 ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: add 6, 3, 6 +; CHECK-NEXT: add 6, 3, 7 ; CHECK-NEXT: stdu 6, 8(4) ; CHECK-NEXT: .LBB0_5: ; CHECK-NEXT: extswsli 5, 5, 5 diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll index df55b92997765d..a263b56ce70ceb 100644 --- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -647,27 +647,27 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; ; P8BE-LABEL: combine_srem_sdiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r5, -21386 -; P8BE-NEXT: ori r5, r5, 37253 -; P8BE-NEXT: clrldi r3, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: extsh r8, r3 +; P8BE-NEXT: mfvsrd r3, v2 +; P8BE-NEXT: lis r4, -21386 +; P8BE-NEXT: ori r4, r4, 37253 +; P8BE-NEXT: clrldi r5, r3, 48 +; P8BE-NEXT: rldicl r6, r3, 48, 48 +; P8BE-NEXT: rldicl r7, r3, 32, 48 +; P8BE-NEXT: rldicl r3, r3, 16, 48 +; P8BE-NEXT: extsh r8, r5 ; P8BE-NEXT: extsh r9, r6 ; P8BE-NEXT: extsh r10, r7 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: mulhw r11, r8, r5 +; P8BE-NEXT: extsh r3, r3 +; P8BE-NEXT: mulhw r11, r8, r4 ; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: mulhw r11, r9, r5 +; P8BE-NEXT: mulhw r11, r9, r4 ; P8BE-NEXT: add r9, r11, r9 -; P8BE-NEXT: mulhw r11, r10, r5 -; P8BE-NEXT: mulhw r5, r4, r5 +; P8BE-NEXT: mulhw r11, r10, r4 +; P8BE-NEXT: mulhw r4, r3, r4 ; P8BE-NEXT: add r10, r11, r10 ; P8BE-NEXT: srwi r11, r8, 31 ; P8BE-NEXT: srawi r8, r8, 6 -; P8BE-NEXT: add r5, r5, r4 +; P8BE-NEXT: add r4, r4, r3 ; P8BE-NEXT: add r8, r8, r11 ; P8BE-NEXT: srwi r11, r9, 31 ; P8BE-NEXT: srawi r9, r9, 6 @@ -676,30 +676,30 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; P8BE-NEXT: srawi r10, r10, 6 ; P8BE-NEXT: mtvsrwz v3, r8 ; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: srwi r11, r5, 31 -; P8BE-NEXT: srawi r5, r5, 6 +; P8BE-NEXT: srwi r11, r4, 31 +; P8BE-NEXT: srawi r4, r4, 6 ; P8BE-NEXT: mtvsrwz v4, r9 -; P8BE-NEXT: add r5, r5, r11 +; P8BE-NEXT: add r4, r4, r11 ; P8BE-NEXT: mulli r11, r8, 95 -; P8BE-NEXT: sub r3, r3, r11 +; P8BE-NEXT: sub r5, r5, r11 ; P8BE-NEXT: mulli r11, r9, 95 -; P8BE-NEXT: mtvsrwz v5, r3 +; P8BE-NEXT: mtvsrwz v5, r5 ; P8BE-NEXT: sub r6, r6, r11 ; P8BE-NEXT: mulli r11, r10, 95 ; P8BE-NEXT: mtvsrwz v0, r6 ; P8BE-NEXT: sub r7, r7, r11 -; P8BE-NEXT: mulli r11, r5, 95 +; P8BE-NEXT: mulli r11, r4, 95 ; P8BE-NEXT: mtvsrwz v1, r7 -; P8BE-NEXT: sub r4, r4, r11 
+; P8BE-NEXT: sub r3, r3, r11 ; P8BE-NEXT: addis r11, r2, .LCPI2_0@toc@ha ; P8BE-NEXT: addi r11, r11, .LCPI2_0@toc@l ; P8BE-NEXT: lxvw4x v2, 0, r11 ; P8BE-NEXT: vperm v5, v0, v5, v2 -; P8BE-NEXT: mtvsrwz v0, r4 +; P8BE-NEXT: mtvsrwz v0, r3 ; P8BE-NEXT: vperm v3, v4, v3, v2 ; P8BE-NEXT: mtvsrwz v4, r10 ; P8BE-NEXT: vperm v0, v0, v1, v2 -; P8BE-NEXT: mtvsrwz v1, r5 +; P8BE-NEXT: mtvsrwz v1, r4 ; P8BE-NEXT: vperm v2, v1, v4, v2 ; P8BE-NEXT: xxmrghw v4, v0, v5 ; P8BE-NEXT: xxmrghw v2, v2, v3 diff --git a/llvm/test/CodeGen/PowerPC/sub-of-not.ll b/llvm/test/CodeGen/PowerPC/sub-of-not.ll index 9cd2ec55108862..bcd5a98fdcd9c1 100644 --- a/llvm/test/CodeGen/PowerPC/sub-of-not.ll +++ b/llvm/test/CodeGen/PowerPC/sub-of-not.ll @@ -65,26 +65,23 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32: # %bb.0: ; PPC32-NEXT: stwu 1, -64(1) ; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 21, 123(1) ; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: lbz 4, 115(1) ; PPC32-NEXT: lbz 22, 119(1) -; PPC32-NEXT: lbz 21, 123(1) -; PPC32-NEXT: add 4, 4, 5 -; PPC32-NEXT: add 5, 22, 6 -; PPC32-NEXT: lbz 22, 131(1) -; PPC32-NEXT: add 6, 21, 7 +; PPC32-NEXT: add 7, 21, 7 +; PPC32-NEXT: lbz 4, 115(1) ; PPC32-NEXT: lbz 21, 135(1) -; PPC32-NEXT: addi 6, 6, 1 -; PPC32-NEXT: stw 20, 16(1) # 4-byte Folded Spill -; PPC32-NEXT: add 9, 22, 9 -; PPC32-NEXT: lbz 20, 127(1) +; PPC32-NEXT: add 6, 22, 6 +; PPC32-NEXT: lbz 22, 131(1) +; PPC32-NEXT: add 4, 4, 5 +; PPC32-NEXT: lbz 5, 127(1) ; PPC32-NEXT: add 10, 21, 10 ; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: addi 5, 5, 1 +; PPC32-NEXT: add 9, 22, 9 ; PPC32-NEXT: lbz 25, 83(1) -; PPC32-NEXT: add 7, 20, 8 +; PPC32-NEXT: add 5, 5, 8 ; PPC32-NEXT: lbz 21, 147(1) -; PPC32-NEXT: addi 7, 7, 1 +; PPC32-NEXT: addi 5, 5, 1 ; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill ; PPC32-NEXT: addi 4, 4, 1 ; PPC32-NEXT: lbz 24, 79(1) @@ -139,14 +136,16 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32-NEXT: addi 11, 25, 1 ; PPC32-NEXT: stb 8, 6(3) ; PPC32-NEXT: addi 8, 10, 1 +; PPC32-NEXT: stb 5, 3(3) +; PPC32-NEXT: addi 5, 7, 1 ; PPC32-NEXT: stb 11, 8(3) ; PPC32-NEXT: addi 11, 24, 1 ; PPC32-NEXT: stb 8, 5(3) ; PPC32-NEXT: addi 8, 9, 1 +; PPC32-NEXT: stb 5, 2(3) +; PPC32-NEXT: addi 5, 6, 1 ; PPC32-NEXT: stb 11, 7(3) ; PPC32-NEXT: stb 8, 4(3) -; PPC32-NEXT: stb 7, 3(3) -; PPC32-NEXT: stb 6, 2(3) ; PPC32-NEXT: stb 5, 1(3) ; PPC32-NEXT: stb 4, 0(3) ; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload @@ -159,7 +158,6 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 20, 16(1) # 4-byte Folded Reload ; PPC32-NEXT: addi 1, 1, 64 ; PPC32-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll index 48098e3a277c18..3a6eff802559d9 100644 --- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll @@ -35,69 +35,69 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; PPC32-NEXT: stwu 1, -64(1) ; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill ; PPC32-NEXT: mulhwu. 
26, 7, 6 -; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill +; PPC32-NEXT: mcrf 6, 0 ; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill ; PPC32-NEXT: cmpwi 7, 5, 0 -; PPC32-NEXT: cmpwi 2, 7, 0 +; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill ; PPC32-NEXT: mulhwu. 26, 5, 8 ; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: crnor 20, 30, 10 -; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: cmpwi 7, 9, 0 -; PPC32-NEXT: mulhwu. 26, 3, 10 -; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: cmpwi 2, 3, 0 ; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: crnor 21, 30, 10 -; PPC32-NEXT: mulhwu. 26, 9, 4 +; PPC32-NEXT: cmpwi 2, 7, 0 ; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: crorc 20, 20, 6 +; PPC32-NEXT: crnor 20, 30, 10 +; PPC32-NEXT: mulhwu. 26, 3, 10 +; PPC32-NEXT: mcrf 1, 0 ; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: crorc 21, 21, 26 +; PPC32-NEXT: cmpwi 7, 9, 0 ; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; PPC32-NEXT: mulhwu 30, 6, 10 +; PPC32-NEXT: cmpwi 2, 3, 0 +; PPC32-NEXT: mulhwu. 26, 9, 4 +; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: crnor 21, 30, 10 +; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill +; PPC32-NEXT: crorc 21, 21, 6 ; PPC32-NEXT: stw 12, 20(1) -; PPC32-NEXT: crorc 20, 20, 22 +; PPC32-NEXT: mulhwu 12, 6, 10 ; PPC32-NEXT: crorc 21, 21, 2 ; PPC32-NEXT: li 11, 0 +; PPC32-NEXT: crorc 20, 20, 26 +; PPC32-NEXT: crorc 20, 20, 22 ; PPC32-NEXT: mullw 26, 5, 10 -; PPC32-NEXT: addc 30, 26, 30 -; PPC32-NEXT: mulhwu 29, 5, 10 -; PPC32-NEXT: addze 29, 29 +; PPC32-NEXT: addc 12, 26, 12 +; PPC32-NEXT: mulhwu 0, 5, 10 +; PPC32-NEXT: addze 0, 0 ; PPC32-NEXT: mullw 23, 5, 8 ; PPC32-NEXT: mullw 22, 7, 6 -; PPC32-NEXT: mulhwu 0, 6, 9 -; PPC32-NEXT: mulhwu 12, 5, 9 -; PPC32-NEXT: mulhwu 27, 8, 6 +; PPC32-NEXT: mulhwu 30, 6, 9 +; PPC32-NEXT: mulhwu 29, 5, 9 ; PPC32-NEXT: mullw 25, 6, 9 ; PPC32-NEXT: mullw 24, 5, 9 ; PPC32-NEXT: mullw 5, 9, 4 ; PPC32-NEXT: add 9, 22, 23 -; PPC32-NEXT: add 9, 27, 9 -; PPC32-NEXT: cmplw 1, 9, 27 -; PPC32-NEXT: cror 20, 20, 4 ; PPC32-NEXT: mullw 23, 3, 10 ; PPC32-NEXT: add 26, 23, 5 -; PPC32-NEXT: addc 5, 25, 30 -; PPC32-NEXT: addze 0, 0 +; PPC32-NEXT: addc 5, 25, 12 +; PPC32-NEXT: addze 30, 30 ; PPC32-NEXT: or. 3, 4, 3 -; PPC32-NEXT: mulhwu 28, 4, 10 +; PPC32-NEXT: mulhwu 27, 4, 10 ; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: addc 3, 29, 0 -; PPC32-NEXT: add 26, 28, 26 -; PPC32-NEXT: cmplw 6, 26, 28 -; PPC32-NEXT: cror 21, 21, 24 -; PPC32-NEXT: mullw 30, 4, 10 +; PPC32-NEXT: addc 3, 0, 30 +; PPC32-NEXT: add 26, 27, 26 +; PPC32-NEXT: mullw 12, 4, 10 ; PPC32-NEXT: or. 
4, 8, 7 ; PPC32-NEXT: addze 4, 11 ; PPC32-NEXT: addc 7, 24, 3 ; PPC32-NEXT: crnor 22, 2, 6 -; PPC32-NEXT: mullw 27, 8, 6 -; PPC32-NEXT: adde 8, 12, 4 -; PPC32-NEXT: addc 3, 30, 27 +; PPC32-NEXT: mulhwu 28, 8, 6 +; PPC32-NEXT: add 9, 28, 9 +; PPC32-NEXT: cmplw 6, 9, 28 +; PPC32-NEXT: cror 20, 20, 24 +; PPC32-NEXT: cmplw 6, 26, 27 +; PPC32-NEXT: cror 21, 21, 24 +; PPC32-NEXT: mullw 28, 8, 6 +; PPC32-NEXT: adde 8, 29, 4 +; PPC32-NEXT: addc 3, 12, 28 ; PPC32-NEXT: adde 9, 26, 9 ; PPC32-NEXT: addc 4, 7, 3 ; PPC32-NEXT: adde 3, 8, 9 diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll index a2ad2946cc8ec1..98314a02c23fe8 100644 --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -897,31 +897,31 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; P8LE-NEXT: mfvsrd r6, v2 ; P8LE-NEXT: mfvsrd r8, v3 ; P8LE-NEXT: ori r3, r3, 51289 +; P8LE-NEXT: mffprd r4, f0 ; P8LE-NEXT: ori r5, r5, 42889 -; P8LE-NEXT: rldic r4, r3, 36, 1 -; P8LE-NEXT: mffprd r3, f0 +; P8LE-NEXT: rldic r3, r3, 36, 1 ; P8LE-NEXT: rldic r5, r5, 35, 1 ; P8LE-NEXT: rldicl r7, r6, 63, 1 -; P8LE-NEXT: oris r4, r4, 45590 +; P8LE-NEXT: oris r3, r3, 45590 ; P8LE-NEXT: oris r5, r5, 1603 -; P8LE-NEXT: ori r4, r4, 17097 +; P8LE-NEXT: ori r3, r3, 17097 ; P8LE-NEXT: ori r5, r5, 21445 -; P8LE-NEXT: mulhdu r4, r3, r4 +; P8LE-NEXT: mulhdu r3, r4, r3 ; P8LE-NEXT: mulhdu r5, r7, r5 -; P8LE-NEXT: sub r7, r3, r4 +; P8LE-NEXT: sub r7, r4, r3 ; P8LE-NEXT: rldicl r5, r5, 57, 7 ; P8LE-NEXT: rldicl r7, r7, 63, 1 ; P8LE-NEXT: mulli r5, r5, 654 -; P8LE-NEXT: add r4, r7, r4 +; P8LE-NEXT: add r3, r7, r3 ; P8LE-NEXT: lis r7, -16037 ; P8LE-NEXT: ori r7, r7, 28749 -; P8LE-NEXT: rldicl r4, r4, 60, 4 +; P8LE-NEXT: rldicl r3, r3, 60, 4 ; P8LE-NEXT: sub r5, r6, r5 ; P8LE-NEXT: rldic r7, r7, 32, 0 -; P8LE-NEXT: mulli r4, r4, 23 +; P8LE-NEXT: mulli r3, r3, 23 ; P8LE-NEXT: oris r7, r7, 52170 ; P8LE-NEXT: ori r7, r7, 12109 -; P8LE-NEXT: sub r3, r3, r4 +; P8LE-NEXT: sub r3, r4, r3 ; P8LE-NEXT: mulhdu r7, r8, r7 ; P8LE-NEXT: mtfprd f1, r3 ; P8LE-NEXT: li r3, 0 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll index cc38f921b117bf..f7fa20ec7fdc3a 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll @@ -169,50 +169,50 @@ define <8 x i16> @test8elt(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs0 +; CHECK-P8-NEXT: xxswapd v2, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v2, vs2 +; CHECK-P8-NEXT: xxswapd v3, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; 
CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 ; CHECK-P8-NEXT: vmrghh v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghh v3, v3, v5 +; CHECK-P8-NEXT: vmrghh v2, v2, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghh v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xxmrglw vs0, v3, v4 -; CHECK-P8-NEXT: vmrghh v2, v2, v0 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 +; CHECK-P8-NEXT: vmrghh v3, v3, v0 +; CHECK-P8-NEXT: xxmrglw vs1, v3, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -329,48 +329,48 @@ entry: define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs1, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-P8-NEXT: li r6, 32 ; CHECK-P8-NEXT: li r5, 16 ; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs2, r4, r5 ; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 -; CHECK-P8-NEXT: xxswapd v3, vs1 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: xxswapd v3, vs0 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 -; CHECK-P8-NEXT: xscvspdpn f2, vs2 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: xscvspdpn f2, vs3 -; CHECK-P8-NEXT: xxswapd v2, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs3 +; CHECK-P8-NEXT: xxswapd v4, vs2 ; CHECK-P8-NEXT: xxswapd v5, vs6 -; CHECK-P8-NEXT: xxswapd v4, vs8 +; CHECK-P8-NEXT: xxswapd v2, vs8 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvspdpn f0, vs2 +; CHECK-P8-NEXT: xxsldwi vs4, v4, v4, 3 +; CHECK-P8-NEXT: xxsldwi vs5, v4, v4, 1 ; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 ; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 -; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v2, v2, 1 ; CHECK-P8-NEXT: mtvsrd v3, r4 -; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; 
CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: vmrghh v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghh v3, v3, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 @@ -380,7 +380,7 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs6 -; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mtvsrd v4, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs7 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 @@ -390,7 +390,7 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: vmrghh v2, v2, v6 +; CHECK-P8-NEXT: vmrghh v4, v4, v6 ; CHECK-P8-NEXT: mtvsrd v6, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs9 @@ -406,27 +406,27 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs8 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v1 +; CHECK-P8-NEXT: xxmrglw vs1, v4, v1 ; CHECK-P8-NEXT: vmrghh v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 ; CHECK-P8-NEXT: xxmrglw vs2, v5, v6 -; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: mtvsrd v2, r4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: vmrghh v7, v8, v7 ; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xxmrglw vs0, v3, v0 +; CHECK-P8-NEXT: vmrghh v2, v2, v8 +; CHECK-P8-NEXT: xxmrglw vs3, v2, v7 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 -; CHECK-P8-NEXT: xxswapd vs1, v2 -; CHECK-P8-NEXT: vmrghh v4, v4, v8 -; CHECK-P8-NEXT: xxmrglw vs3, v4, v7 ; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 +; CHECK-P8-NEXT: xxswapd vs1, v2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 ; CHECK-P8-NEXT: stxvd2x vs1, 0, r3 @@ -534,38 +534,38 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs3, 16(r4) -; CHECK-BE-NEXT: lxv vs2, 0(r4) +; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: addis r5, r2, .LCPI3_0@toc@ha -; CHECK-BE-NEXT: lxv vs1, 48(r4) +; CHECK-BE-NEXT: lxv vs0, 48(r4) ; CHECK-BE-NEXT: addi r5, r5, .LCPI3_0@toc@l -; CHECK-BE-NEXT: lxv vs0, 0(r5) -; CHECK-BE-NEXT: xscvspdpn f6, vs3 -; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 -; CHECK-BE-NEXT: xscvspdpn f9, vs2 -; CHECK-BE-NEXT: xxswapd vs5, vs3 -; CHECK-BE-NEXT: xxsldwi vs3, vs3, vs3, 1 -; CHECK-BE-NEXT: xxsldwi vs7, vs2, vs2, 3 -; CHECK-BE-NEXT: xxswapd vs8, vs2 +; CHECK-BE-NEXT: lxv vs3, 0(r5) +; CHECK-BE-NEXT: xscvspdpn f6, vs2 +; CHECK-BE-NEXT: xxsldwi vs4, vs2, vs2, 3 +; CHECK-BE-NEXT: xscvspdpn f9, vs1 +; CHECK-BE-NEXT: xxswapd vs5, vs2 ; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-BE-NEXT: xxsldwi vs10, vs1, vs1, 3 -; CHECK-BE-NEXT: xxswapd vs11, vs1 +; CHECK-BE-NEXT: xxsldwi vs7, vs1, vs1, 3 +; CHECK-BE-NEXT: xxswapd vs8, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: xxsldwi vs10, vs0, vs0, 3 +; CHECK-BE-NEXT: xxswapd 
vs11, vs0 ; CHECK-BE-NEXT: xscvdpsxws f6, f6 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 ; CHECK-BE-NEXT: xscvdpsxws f9, f9 ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvspdpn f7, vs7 ; CHECK-BE-NEXT: xscvspdpn f8, vs8 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: xscvspdpn f10, vs10 ; CHECK-BE-NEXT: xscvspdpn f11, vs11 ; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: xscvdpsxws f8, f8 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: xscvdpsxws f10, f10 ; CHECK-BE-NEXT: xscvdpsxws f11, f11 ; CHECK-BE-NEXT: mffprwz r5, f6 @@ -576,50 +576,50 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-BE-NEXT: mtfprwz f4, r5 ; CHECK-BE-NEXT: mffprwz r5, f5 ; CHECK-BE-NEXT: mtfprwz f5, r5 -; CHECK-BE-NEXT: mffprwz r5, f3 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 -; CHECK-BE-NEXT: xscvspdpn f5, vs1 -; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 -; CHECK-BE-NEXT: mtfprwz f3, r5 +; CHECK-BE-NEXT: mffprwz r5, f2 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 +; CHECK-BE-NEXT: xscvspdpn f5, vs0 +; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 +; CHECK-BE-NEXT: mtfprwz f2, r5 ; CHECK-BE-NEXT: mffprwz r5, f7 ; CHECK-BE-NEXT: mtfprwz f7, r5 ; CHECK-BE-NEXT: mffprwz r5, f8 -; CHECK-BE-NEXT: xxperm vs3, vs6, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xxperm vs2, vs6, vs3 +; CHECK-BE-NEXT: xscvspdpn f0, vs0 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 ; CHECK-BE-NEXT: mtfprwz f8, r5 -; CHECK-BE-NEXT: mffprwz r5, f2 -; CHECK-BE-NEXT: xxmrghw vs3, vs3, vs4 +; CHECK-BE-NEXT: mffprwz r5, f1 +; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs4 ; CHECK-BE-NEXT: lxv vs4, 32(r4) -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: mtfprwz f2, r5 -; CHECK-BE-NEXT: xxperm vs7, vs8, vs0 +; CHECK-BE-NEXT: xscvdpsxws f0, f0 +; CHECK-BE-NEXT: mtfprwz f1, r5 +; CHECK-BE-NEXT: xxperm vs7, vs8, vs3 ; CHECK-BE-NEXT: mffprwz r5, f10 -; CHECK-BE-NEXT: xxperm vs2, vs9, vs0 +; CHECK-BE-NEXT: xxperm vs1, vs9, vs3 ; CHECK-BE-NEXT: mtfprwz f10, r5 ; CHECK-BE-NEXT: mffprwz r5, f11 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f11, r5 -; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs7 +; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs7 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs10, vs11, vs0 -; CHECK-BE-NEXT: mffprwz r4, f1 -; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 -; CHECK-BE-NEXT: xxsldwi vs3, vs4, vs4, 3 -; CHECK-BE-NEXT: mtfprwz f1, r4 -; CHECK-BE-NEXT: xxperm vs1, vs5, vs0 +; CHECK-BE-NEXT: xxperm vs10, vs11, vs3 +; CHECK-BE-NEXT: mffprwz r4, f0 +; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 +; CHECK-BE-NEXT: xxsldwi vs2, vs4, vs4, 3 +; CHECK-BE-NEXT: mtfprwz f0, r4 +; CHECK-BE-NEXT: xxperm vs0, vs5, vs3 ; CHECK-BE-NEXT: xxswapd vs5, vs4 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: stxv vs2, 0(r3) +; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: stxv vs1, 0(r3) ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs10 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xxmrghw vs0, vs0, vs10 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: mffprwz r4, f3 -; CHECK-BE-NEXT: mtfprwz f3, r4 +; CHECK-BE-NEXT: mffprwz r4, f2 +; CHECK-BE-NEXT: mtfprwz f2, r4 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs3, vs5, vs0 +; 
CHECK-BE-NEXT: xxperm vs2, vs5, vs3 ; CHECK-BE-NEXT: xscvspdpn f5, vs4 ; CHECK-BE-NEXT: xxsldwi vs4, vs4, vs4, 1 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 @@ -629,9 +629,9 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-BE-NEXT: mtfprwz f5, r4 ; CHECK-BE-NEXT: mffprwz r4, f4 ; CHECK-BE-NEXT: mtfprwz f4, r4 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 -; CHECK-BE-NEXT: xxmrghw vs0, vs4, vs3 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 +; CHECK-BE-NEXT: xxmrghw vs2, vs4, vs2 +; CHECK-BE-NEXT: xxmrghd vs0, vs2, vs0 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: blr entry: @@ -801,50 +801,50 @@ define <8 x i16> @test8elt_signed(ptr nocapture readonly) local_unnamed_addr #2 ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs0 +; CHECK-P8-NEXT: xxswapd v2, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v2, vs2 +; CHECK-P8-NEXT: xxswapd v3, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 ; CHECK-P8-NEXT: vmrghh v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghh v3, v3, v5 +; CHECK-P8-NEXT: vmrghh v2, v2, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghh v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xxmrglw vs0, v3, v4 -; CHECK-P8-NEXT: vmrghh v2, v2, v0 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 +; CHECK-P8-NEXT: vmrghh v3, v3, v0 +; CHECK-P8-NEXT: xxmrglw vs1, v3, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -961,48 +961,48 @@ entry: define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs1, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-P8-NEXT: li r6, 32 ; CHECK-P8-NEXT: li r5, 16 ; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs2, r4, r5 ; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 -; CHECK-P8-NEXT: xxswapd v3, vs1 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: 
xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: xxswapd v3, vs0 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 -; CHECK-P8-NEXT: xscvspdpn f2, vs2 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: xscvspdpn f2, vs3 -; CHECK-P8-NEXT: xxswapd v2, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs3 +; CHECK-P8-NEXT: xxswapd v4, vs2 ; CHECK-P8-NEXT: xxswapd v5, vs6 -; CHECK-P8-NEXT: xxswapd v4, vs8 +; CHECK-P8-NEXT: xxswapd v2, vs8 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvspdpn f0, vs2 +; CHECK-P8-NEXT: xxsldwi vs4, v4, v4, 3 +; CHECK-P8-NEXT: xxsldwi vs5, v4, v4, 1 ; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 ; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 -; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v2, v2, 1 ; CHECK-P8-NEXT: mtvsrd v3, r4 -; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: vmrghh v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghh v3, v3, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 @@ -1012,7 +1012,7 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs6 -; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mtvsrd v4, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs7 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 @@ -1022,7 +1022,7 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: vmrghh v2, v2, v6 +; CHECK-P8-NEXT: vmrghh v4, v4, v6 ; CHECK-P8-NEXT: mtvsrd v6, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs9 @@ -1038,27 +1038,27 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs8 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v1 +; CHECK-P8-NEXT: xxmrglw vs1, v4, v1 ; CHECK-P8-NEXT: vmrghh v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 ; CHECK-P8-NEXT: xxmrglw vs2, v5, v6 -; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: mtvsrd v2, r4 ; CHECK-P8-NEXT: 
xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: vmrghh v7, v8, v7 ; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xxmrglw vs0, v3, v0 +; CHECK-P8-NEXT: vmrghh v2, v2, v8 +; CHECK-P8-NEXT: xxmrglw vs3, v2, v7 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 -; CHECK-P8-NEXT: xxswapd vs1, v2 -; CHECK-P8-NEXT: vmrghh v4, v4, v8 -; CHECK-P8-NEXT: xxmrglw vs3, v4, v7 ; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 +; CHECK-P8-NEXT: xxswapd vs1, v2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 ; CHECK-P8-NEXT: stxvd2x vs1, 0, r3 @@ -1166,38 +1166,38 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs3, 16(r4) -; CHECK-BE-NEXT: lxv vs2, 0(r4) +; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: addis r5, r2, .LCPI7_0@toc@ha -; CHECK-BE-NEXT: lxv vs1, 48(r4) +; CHECK-BE-NEXT: lxv vs0, 48(r4) ; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l -; CHECK-BE-NEXT: lxv vs0, 0(r5) -; CHECK-BE-NEXT: xscvspdpn f6, vs3 -; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 -; CHECK-BE-NEXT: xscvspdpn f9, vs2 -; CHECK-BE-NEXT: xxswapd vs5, vs3 -; CHECK-BE-NEXT: xxsldwi vs3, vs3, vs3, 1 -; CHECK-BE-NEXT: xxsldwi vs7, vs2, vs2, 3 -; CHECK-BE-NEXT: xxswapd vs8, vs2 +; CHECK-BE-NEXT: lxv vs3, 0(r5) +; CHECK-BE-NEXT: xscvspdpn f6, vs2 +; CHECK-BE-NEXT: xxsldwi vs4, vs2, vs2, 3 +; CHECK-BE-NEXT: xscvspdpn f9, vs1 +; CHECK-BE-NEXT: xxswapd vs5, vs2 ; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-BE-NEXT: xxsldwi vs10, vs1, vs1, 3 -; CHECK-BE-NEXT: xxswapd vs11, vs1 +; CHECK-BE-NEXT: xxsldwi vs7, vs1, vs1, 3 +; CHECK-BE-NEXT: xxswapd vs8, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: xxsldwi vs10, vs0, vs0, 3 +; CHECK-BE-NEXT: xxswapd vs11, vs0 ; CHECK-BE-NEXT: xscvdpsxws f6, f6 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 ; CHECK-BE-NEXT: xscvdpsxws f9, f9 ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvspdpn f7, vs7 ; CHECK-BE-NEXT: xscvspdpn f8, vs8 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: xscvspdpn f10, vs10 ; CHECK-BE-NEXT: xscvspdpn f11, vs11 ; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: xscvdpsxws f8, f8 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: xscvdpsxws f10, f10 ; CHECK-BE-NEXT: xscvdpsxws f11, f11 ; CHECK-BE-NEXT: mffprwz r5, f6 @@ -1208,50 +1208,50 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-BE-NEXT: mtfprwz f4, r5 ; CHECK-BE-NEXT: mffprwz r5, f5 ; CHECK-BE-NEXT: mtfprwz f5, r5 -; CHECK-BE-NEXT: mffprwz r5, f3 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 -; CHECK-BE-NEXT: xscvspdpn f5, vs1 -; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 -; CHECK-BE-NEXT: mtfprwz f3, r5 +; CHECK-BE-NEXT: mffprwz r5, f2 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 +; CHECK-BE-NEXT: xscvspdpn f5, vs0 +; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 +; CHECK-BE-NEXT: mtfprwz f2, r5 ; CHECK-BE-NEXT: mffprwz r5, f7 ; CHECK-BE-NEXT: mtfprwz f7, r5 ; CHECK-BE-NEXT: mffprwz r5, f8 -; CHECK-BE-NEXT: xxperm vs3, vs6, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xxperm vs2, vs6, vs3 +; CHECK-BE-NEXT: xscvspdpn f0, vs0 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 ; CHECK-BE-NEXT: mtfprwz f8, r5 -; CHECK-BE-NEXT: mffprwz 
r5, f2 -; CHECK-BE-NEXT: xxmrghw vs3, vs3, vs4 +; CHECK-BE-NEXT: mffprwz r5, f1 +; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs4 ; CHECK-BE-NEXT: lxv vs4, 32(r4) -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: mtfprwz f2, r5 -; CHECK-BE-NEXT: xxperm vs7, vs8, vs0 +; CHECK-BE-NEXT: xscvdpsxws f0, f0 +; CHECK-BE-NEXT: mtfprwz f1, r5 +; CHECK-BE-NEXT: xxperm vs7, vs8, vs3 ; CHECK-BE-NEXT: mffprwz r5, f10 -; CHECK-BE-NEXT: xxperm vs2, vs9, vs0 +; CHECK-BE-NEXT: xxperm vs1, vs9, vs3 ; CHECK-BE-NEXT: mtfprwz f10, r5 ; CHECK-BE-NEXT: mffprwz r5, f11 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f11, r5 -; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs7 +; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs7 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs10, vs11, vs0 -; CHECK-BE-NEXT: mffprwz r4, f1 -; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 -; CHECK-BE-NEXT: xxsldwi vs3, vs4, vs4, 3 -; CHECK-BE-NEXT: mtfprwz f1, r4 -; CHECK-BE-NEXT: xxperm vs1, vs5, vs0 +; CHECK-BE-NEXT: xxperm vs10, vs11, vs3 +; CHECK-BE-NEXT: mffprwz r4, f0 +; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 +; CHECK-BE-NEXT: xxsldwi vs2, vs4, vs4, 3 +; CHECK-BE-NEXT: mtfprwz f0, r4 +; CHECK-BE-NEXT: xxperm vs0, vs5, vs3 ; CHECK-BE-NEXT: xxswapd vs5, vs4 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: stxv vs2, 0(r3) +; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: stxv vs1, 0(r3) ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs10 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xxmrghw vs0, vs0, vs10 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: mffprwz r4, f3 -; CHECK-BE-NEXT: mtfprwz f3, r4 +; CHECK-BE-NEXT: mffprwz r4, f2 +; CHECK-BE-NEXT: mtfprwz f2, r4 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs3, vs5, vs0 +; CHECK-BE-NEXT: xxperm vs2, vs5, vs3 ; CHECK-BE-NEXT: xscvspdpn f5, vs4 ; CHECK-BE-NEXT: xxsldwi vs4, vs4, vs4, 1 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 @@ -1261,9 +1261,9 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-BE-NEXT: mtfprwz f5, r4 ; CHECK-BE-NEXT: mffprwz r4, f4 ; CHECK-BE-NEXT: mtfprwz f4, r4 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 -; CHECK-BE-NEXT: xxmrghw vs0, vs4, vs3 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 +; CHECK-BE-NEXT: xxmrghw vs2, vs4, vs2 +; CHECK-BE-NEXT: xxmrghd vs0, vs2, vs0 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll index c6e808d145ebb3..26ae8cc38f0cf6 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll @@ -178,51 +178,51 @@ define i64 @test8elt(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs0 +; CHECK-P8-NEXT: xxswapd v2, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; 
CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v2, vs2 +; CHECK-P8-NEXT: xxswapd v3, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 ; CHECK-P8-NEXT: vmrghb v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghb v3, v3, v5 +; CHECK-P8-NEXT: vmrghb v2, v2, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: vmrglh v3, v3, v4 +; CHECK-P8-NEXT: vmrglh v2, v2, v4 ; CHECK-P8-NEXT: vmrghb v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: vmrghb v2, v2, v0 -; CHECK-P8-NEXT: vmrglh v2, v2, v5 -; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 +; CHECK-P8-NEXT: vmrghb v3, v3, v0 +; CHECK-P8-NEXT: vmrglh v3, v3, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v3, v2 ; CHECK-P8-NEXT: xxswapd vs0, vs0 ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: blr @@ -343,47 +343,47 @@ entry: define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 +; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 ; CHECK-P8-NEXT: li r4, 32 ; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 48 ; CHECK-P8-NEXT: lxvd2x vs8, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs1 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: xxswapd v3, vs0 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v0, r3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 -; CHECK-P8-NEXT: xscvspdpn f2, vs2 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xxswapd v2, vs0 ; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: xscvspdpn f2, vs3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 +; CHECK-P8-NEXT: xscvspdpn f0, vs2 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs3 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xxswapd v4, vs2 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: xscvspdpn f1, vs4 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 ; CHECK-P8-NEXT: xxswapd v5, vs6 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: xxswapd v4, vs8 +; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 
-; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xxsldwi vs4, v4, v4, 3 +; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xxsldwi vs5, v4, v4, 1 ; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 +; CHECK-P8-NEXT: xxswapd v2, vs8 ; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v2, v2, 1 ; CHECK-P8-NEXT: vmrghb v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 @@ -398,7 +398,7 @@ define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs6 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: vmrglh v3, v3, v0 @@ -410,36 +410,36 @@ define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: vmrghb v2, v2, v6 +; CHECK-P8-NEXT: vmrghb v4, v4, v6 ; CHECK-P8-NEXT: mtvsrd v6, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs10 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mtvsrd v5, r3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs8 -; CHECK-P8-NEXT: vmrglh v2, v2, v1 +; CHECK-P8-NEXT: vmrglh v4, v4, v1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v6, v6, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: vmrghb v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 ; CHECK-P8-NEXT: vmrglh v5, v5, v6 -; CHECK-P8-NEXT: mtvsrd v4, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghb v7, v8, v7 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 -; CHECK-P8-NEXT: vmrghb v4, v4, v8 -; CHECK-P8-NEXT: vmrglh v4, v4, v7 -; CHECK-P8-NEXT: xxmrglw vs1, v4, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v4, v3 +; CHECK-P8-NEXT: vmrghb v2, v2, v8 +; CHECK-P8-NEXT: vmrglh v2, v2, v7 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -818,51 +818,51 @@ define i64 @test8elt_signed(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs0 +; CHECK-P8-NEXT: xxswapd v2, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; 
CHECK-P8-NEXT: xxswapd v2, vs2 +; CHECK-P8-NEXT: xxswapd v3, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 ; CHECK-P8-NEXT: vmrghb v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghb v3, v3, v5 +; CHECK-P8-NEXT: vmrghb v2, v2, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: vmrglh v3, v3, v4 +; CHECK-P8-NEXT: vmrglh v2, v2, v4 ; CHECK-P8-NEXT: vmrghb v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: vmrghb v2, v2, v0 -; CHECK-P8-NEXT: vmrglh v2, v2, v5 -; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 +; CHECK-P8-NEXT: vmrghb v3, v3, v0 +; CHECK-P8-NEXT: vmrglh v3, v3, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v3, v2 ; CHECK-P8-NEXT: xxswapd vs0, vs0 ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: blr @@ -983,47 +983,47 @@ entry: define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 +; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 ; CHECK-P8-NEXT: li r4, 32 ; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 48 ; CHECK-P8-NEXT: lxvd2x vs8, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs1 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: xxswapd v3, vs0 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v0, r3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 -; CHECK-P8-NEXT: xscvspdpn f2, vs2 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xxswapd v2, vs0 ; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: xscvspdpn f2, vs3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 +; CHECK-P8-NEXT: xscvspdpn f0, vs2 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs3 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xxswapd v4, vs2 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: xscvspdpn f1, vs4 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 ; CHECK-P8-NEXT: xxswapd v5, vs6 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: xxswapd v4, vs8 +; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: 
xxsldwi vs4, v4, v4, 3 +; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xxsldwi vs5, v4, v4, 1 ; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 +; CHECK-P8-NEXT: xxswapd v2, vs8 ; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v2, v2, 1 ; CHECK-P8-NEXT: vmrghb v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 @@ -1038,7 +1038,7 @@ define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs6 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: vmrglh v3, v3, v0 @@ -1050,36 +1050,36 @@ define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #3 ; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: vmrghb v2, v2, v6 +; CHECK-P8-NEXT: vmrghb v4, v4, v6 ; CHECK-P8-NEXT: mtvsrd v6, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs10 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mtvsrd v5, r3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs8 -; CHECK-P8-NEXT: vmrglh v2, v2, v1 +; CHECK-P8-NEXT: vmrglh v4, v4, v1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v6, v6, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: vmrghb v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 ; CHECK-P8-NEXT: vmrglh v5, v5, v6 -; CHECK-P8-NEXT: mtvsrd v4, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghb v7, v8, v7 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 -; CHECK-P8-NEXT: vmrghb v4, v4, v8 -; CHECK-P8-NEXT: vmrglh v4, v4, v7 -; CHECK-P8-NEXT: xxmrglw vs1, v4, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v4, v3 +; CHECK-P8-NEXT: vmrghb v2, v2, v8 +; CHECK-P8-NEXT: vmrglh v2, v2, v7 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll index 00ca205e859725..5dcb2e4be3e37f 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll @@ -288,92 +288,92 @@ entry: define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs5, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs2, 0, r4 ; CHECK-P8-NEXT: li r5, 80 ; CHECK-P8-NEXT: li r6, 32 -; CHECK-P8-NEXT: lxvd2x vs3, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 ; CHECK-P8-NEXT: li r5, 16 -; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 ; CHECK-P8-NEXT: li r6, 64 -; CHECK-P8-NEXT: lxvd2x vs7, r4, r5 -; CHECK-P8-NEXT: lxvd2x vs11, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs4, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 ; 
CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs12, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 ; CHECK-P8-NEXT: li r6, 96 -; CHECK-P8-NEXT: lxvd2x vs2, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs12, r4, r6 ; CHECK-P8-NEXT: li r6, 112 -; CHECK-P8-NEXT: lxvd2x vs0, r4, r6 -; CHECK-P8-NEXT: xxswapd vs6, vs5 -; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: mffprwz r4, f5 -; CHECK-P8-NEXT: mtvsrd v4, r4 -; CHECK-P8-NEXT: xxswapd vs10, vs9 -; CHECK-P8-NEXT: xscvdpsxws f9, f9 -; CHECK-P8-NEXT: xxswapd vs4, vs3 +; CHECK-P8-NEXT: lxvd2x v2, r4, r6 +; CHECK-P8-NEXT: xxswapd vs3, vs2 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 -; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: xxswapd vs7, vs6 +; CHECK-P8-NEXT: xscvdpsxws f6, f6 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: xscvdpsxws f7, f7 -; CHECK-P8-NEXT: mffprwz r4, f7 -; CHECK-P8-NEXT: xxswapd vs13, vs11 -; CHECK-P8-NEXT: xscvdpsxws f11, f11 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xxswapd vs5, vs4 +; CHECK-P8-NEXT: xscvdpsxws f4, f4 +; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: xxswapd vs10, vs8 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: xscvdpsxws f5, f5 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 ; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: xxswapd v2, vs12 -; CHECK-P8-NEXT: xscvdpsxws f12, f12 -; CHECK-P8-NEXT: mffprwz r4, f9 -; CHECK-P8-NEXT: xscvdpsxws v2, v2 +; CHECK-P8-NEXT: xxswapd vs11, vs9 +; CHECK-P8-NEXT: xscvdpsxws f9, f9 +; CHECK-P8-NEXT: mffprwz r4, f6 +; CHECK-P8-NEXT: xscvdpsxws f11, f11 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: mffprwz r4, f12 +; CHECK-P8-NEXT: mffprwz r4, f9 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f11 -; CHECK-P8-NEXT: xxswapd v3, vs2 -; CHECK-P8-NEXT: xscvdpsxws v3, v3 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: mffprwz r4, f8 +; CHECK-P8-NEXT: xxswapd vs13, vs12 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 ; CHECK-P8-NEXT: mtvsrd v6, r4 -; CHECK-P8-NEXT: mffprwz r4, f3 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, f12 ; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: mffprwz r4, f6 +; CHECK-P8-NEXT: mffprwz r4, f3 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f8 -; CHECK-P8-NEXT: xxswapd vs1, vs0 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: xxswapd v3, v2 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f10 +; CHECK-P8-NEXT: mffprwz r4, f7 ; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mfvsrwz r4, v2 -; CHECK-P8-NEXT: mtvsrd v2, r4 -; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: mffprwz r4, f11 ; CHECK-P8-NEXT: vmrghh v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: mffprwz r4, f10 ; CHECK-P8-NEXT: vmrghh v5, v9, v5 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mfvsrwz r4, v3 -; CHECK-P8-NEXT: vmrghh v0, v10, v0 -; CHECK-P8-NEXT: vmrghh v2, v2, v1 -; CHECK-P8-NEXT: vmrghh v3, v8, v6 -; CHECK-P8-NEXT: mtvsrd v6, r4 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: vmrghh v1, v9, v7 -; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xxmrglw vs2, v1, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v0 -; CHECK-P8-NEXT: vmrghh v6, v6, v7 -; CHECK-P8-NEXT: mtvsrd v7, r4 +; 
CHECK-P8-NEXT: vmrghh v0, v10, v0 +; CHECK-P8-NEXT: mtvsrd v10, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: vmrghh v1, v8, v1 +; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, v3 +; CHECK-P8-NEXT: xxmrglw vs1, v1, v0 +; CHECK-P8-NEXT: vmrghh v6, v9, v6 +; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, v2 +; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: vmrghh v7, v10, v7 +; CHECK-P8-NEXT: xxmrglw vs2, v7, v6 +; CHECK-P8-NEXT: vmrghh v8, v8, v9 ; CHECK-P8-NEXT: xxmrglw vs0, v5, v4 -; CHECK-P8-NEXT: mtvsrd v8, r4 +; CHECK-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-P8-NEXT: xxmrglw vs3, v2, v8 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 -; CHECK-P8-NEXT: xxswapd vs1, v2 -; CHECK-P8-NEXT: vmrghh v7, v7, v8 -; CHECK-P8-NEXT: xxmrglw vs3, v7, v6 ; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 +; CHECK-P8-NEXT: xxswapd vs1, v2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 ; CHECK-P8-NEXT: stxvd2x vs1, 0, r3 @@ -835,92 +835,92 @@ entry: define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs5, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs2, 0, r4 ; CHECK-P8-NEXT: li r5, 80 ; CHECK-P8-NEXT: li r6, 32 -; CHECK-P8-NEXT: lxvd2x vs3, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 ; CHECK-P8-NEXT: li r5, 16 -; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 ; CHECK-P8-NEXT: li r6, 64 -; CHECK-P8-NEXT: lxvd2x vs7, r4, r5 -; CHECK-P8-NEXT: lxvd2x vs11, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs4, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs12, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 ; CHECK-P8-NEXT: li r6, 96 -; CHECK-P8-NEXT: lxvd2x vs2, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs12, r4, r6 ; CHECK-P8-NEXT: li r6, 112 -; CHECK-P8-NEXT: lxvd2x vs0, r4, r6 -; CHECK-P8-NEXT: xxswapd vs6, vs5 -; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: mffprwz r4, f5 -; CHECK-P8-NEXT: mtvsrd v4, r4 -; CHECK-P8-NEXT: xxswapd vs10, vs9 -; CHECK-P8-NEXT: xscvdpsxws f9, f9 -; CHECK-P8-NEXT: xxswapd vs4, vs3 +; CHECK-P8-NEXT: lxvd2x v2, r4, r6 +; CHECK-P8-NEXT: xxswapd vs3, vs2 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 -; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: xxswapd vs7, vs6 +; CHECK-P8-NEXT: xscvdpsxws f6, f6 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: xscvdpsxws f7, f7 -; CHECK-P8-NEXT: mffprwz r4, f7 -; CHECK-P8-NEXT: xxswapd vs13, vs11 -; CHECK-P8-NEXT: xscvdpsxws f11, f11 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xxswapd vs5, vs4 +; CHECK-P8-NEXT: xscvdpsxws f4, f4 +; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: xxswapd vs10, vs8 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: xscvdpsxws f5, f5 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 ; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: xxswapd v2, vs12 -; CHECK-P8-NEXT: xscvdpsxws f12, f12 -; CHECK-P8-NEXT: mffprwz r4, f9 -; CHECK-P8-NEXT: xscvdpsxws v2, v2 +; CHECK-P8-NEXT: xxswapd vs11, vs9 +; CHECK-P8-NEXT: xscvdpsxws f9, f9 +; CHECK-P8-NEXT: mffprwz r4, f6 +; 
CHECK-P8-NEXT: xscvdpsxws f11, f11 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: mffprwz r4, f12 +; CHECK-P8-NEXT: mffprwz r4, f9 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f11 -; CHECK-P8-NEXT: xxswapd v3, vs2 -; CHECK-P8-NEXT: xscvdpsxws v3, v3 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: mffprwz r4, f8 +; CHECK-P8-NEXT: xxswapd vs13, vs12 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 ; CHECK-P8-NEXT: mtvsrd v6, r4 -; CHECK-P8-NEXT: mffprwz r4, f3 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, f12 ; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: mffprwz r4, f6 +; CHECK-P8-NEXT: mffprwz r4, f3 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f8 -; CHECK-P8-NEXT: xxswapd vs1, vs0 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: xxswapd v3, v2 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f10 +; CHECK-P8-NEXT: mffprwz r4, f7 ; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mfvsrwz r4, v2 -; CHECK-P8-NEXT: mtvsrd v2, r4 -; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: mffprwz r4, f11 ; CHECK-P8-NEXT: vmrghh v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: mffprwz r4, f10 ; CHECK-P8-NEXT: vmrghh v5, v9, v5 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mfvsrwz r4, v3 -; CHECK-P8-NEXT: vmrghh v0, v10, v0 -; CHECK-P8-NEXT: vmrghh v2, v2, v1 -; CHECK-P8-NEXT: vmrghh v3, v8, v6 -; CHECK-P8-NEXT: mtvsrd v6, r4 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: vmrghh v1, v9, v7 -; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xxmrglw vs2, v1, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v0 -; CHECK-P8-NEXT: vmrghh v6, v6, v7 -; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: vmrghh v0, v10, v0 +; CHECK-P8-NEXT: mtvsrd v10, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: vmrghh v1, v8, v1 +; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, v3 +; CHECK-P8-NEXT: xxmrglw vs1, v1, v0 +; CHECK-P8-NEXT: vmrghh v6, v9, v6 +; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, v2 +; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: vmrghh v7, v10, v7 +; CHECK-P8-NEXT: xxmrglw vs2, v7, v6 +; CHECK-P8-NEXT: vmrghh v8, v8, v9 ; CHECK-P8-NEXT: xxmrglw vs0, v5, v4 -; CHECK-P8-NEXT: mtvsrd v8, r4 +; CHECK-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-P8-NEXT: xxmrglw vs3, v2, v8 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 -; CHECK-P8-NEXT: xxswapd vs1, v2 -; CHECK-P8-NEXT: vmrghh v7, v7, v8 -; CHECK-P8-NEXT: xxmrglw vs3, v7, v6 ; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 +; CHECK-P8-NEXT: xxswapd vs1, v2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 ; CHECK-P8-NEXT: stxvd2x vs1, 0, r3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll index 770689ba98049b..dd5cb59a48bf02 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll @@ -303,90 +303,90 @@ define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: li r4, 80 -; CHECK-P8-NEXT: lxvd2x vs4, 0, r3 -; CHECK-P8-NEXT: lxvd2x vs3, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: li r4, 48 -; CHECK-P8-NEXT: lxvd2x 
vs6, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs3, r3, r4 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs7, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs4, r3, r4 ; CHECK-P8-NEXT: li r4, 32 -; CHECK-P8-NEXT: lxvd2x vs9, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 64 -; CHECK-P8-NEXT: lxvd2x vs12, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs9, r3, r4 ; CHECK-P8-NEXT: li r4, 96 -; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs12, r3, r4 ; CHECK-P8-NEXT: li r4, 112 -; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 -; CHECK-P8-NEXT: xxswapd vs5, vs4 -; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r3, f4 -; CHECK-P8-NEXT: xscvdpsxws f5, f5 +; CHECK-P8-NEXT: lxvd2x v2, r3, r4 +; CHECK-P8-NEXT: xxswapd vs2, vs1 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: mtvsrd v4, r3 -; CHECK-P8-NEXT: xxswapd vs13, vs3 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 -; CHECK-P8-NEXT: xxswapd vs10, vs6 -; CHECK-P8-NEXT: xscvdpsxws f6, f6 +; CHECK-P8-NEXT: xxswapd vs10, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: xscvdpsxws f10, f10 -; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xxswapd vs7, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 ; CHECK-P8-NEXT: xscvdpsxws f7, f7 -; CHECK-P8-NEXT: mffprwz r4, f7 -; CHECK-P8-NEXT: xscvdpsxws f8, f8 +; CHECK-P8-NEXT: xxswapd vs5, vs4 +; CHECK-P8-NEXT: xscvdpsxws f4, f4 +; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: xscvdpsxws f5, f5 ; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: mffprwz r4, f6 -; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f3 +; CHECK-P8-NEXT: mtvsrd v1, r4 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xxswapd vs8, vs6 +; CHECK-P8-NEXT: xscvdpsxws f6, f6 +; CHECK-P8-NEXT: mffprwz r3, f6 +; CHECK-P8-NEXT: mtvsrd v0, r3 +; CHECK-P8-NEXT: xscvdpsxws f8, f8 +; CHECK-P8-NEXT: xscvdpsxws f0, f12 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f5 ; CHECK-P8-NEXT: xxswapd vs11, vs9 ; CHECK-P8-NEXT: xscvdpsxws f9, f9 ; CHECK-P8-NEXT: mffprwz r3, f9 -; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: mffprwz r4, f8 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: xxswapd v2, vs12 -; CHECK-P8-NEXT: xscvdpsxws f12, f12 -; CHECK-P8-NEXT: mffprwz r3, f12 ; CHECK-P8-NEXT: mtvsrd v6, r3 -; CHECK-P8-NEXT: mffprwz r3, f5 -; CHECK-P8-NEXT: xscvdpsxws v2, v2 +; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: xscvdpsxws f11, f11 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f11 -; CHECK-P8-NEXT: xxswapd v3, vs2 -; CHECK-P8-NEXT: xscvdpsxws v3, v3 -; CHECK-P8-NEXT: mffprwz r4, f10 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xxswapd vs1, vs0 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r3, f8 +; CHECK-P8-NEXT: xxswapd vs13, vs12 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: mffprwz r4, f7 +; CHECK-P8-NEXT: xxswapd v3, v2 ; CHECK-P8-NEXT: vmrghb v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mfvsrwz r3, v2 -; CHECK-P8-NEXT: mtvsrd v2, r4 -; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: mffprwz r3, f11 ; CHECK-P8-NEXT: vmrghb v5, v9, v5 +; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: mffprwz r4, f10 ; CHECK-P8-NEXT: vmrghb v0, v8, v0 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mfvsrwz r3, v3 -; CHECK-P8-NEXT: vmrglh v4, v5, v4 -; CHECK-P8-NEXT: mtvsrd v3, r4 -; CHECK-P8-NEXT: vmrghb v2, v2, v1 -; 
CHECK-P8-NEXT: vmrghb v1, v8, v6 -; CHECK-P8-NEXT: mtvsrd v6, r3 -; CHECK-P8-NEXT: mffprwz r3, f2 -; CHECK-P8-NEXT: vmrglh v2, v2, v0 -; CHECK-P8-NEXT: vmrghb v3, v3, v7 -; CHECK-P8-NEXT: mtvsrd v7, r3 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: vmrglh v3, v3, v1 -; CHECK-P8-NEXT: vmrghb v6, v6, v7 -; CHECK-P8-NEXT: mtvsrd v7, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 +; CHECK-P8-NEXT: mffprwz r3, f13 +; CHECK-P8-NEXT: vmrghb v1, v9, v1 +; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: vmrghb v6, v8, v6 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: vmrghb v7, v7, v8 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, v3 +; CHECK-P8-NEXT: vmrghb v7, v9, v7 +; CHECK-P8-NEXT: mtvsrd v9, r3 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, v2 +; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: vmrghb v8, v8, v9 +; CHECK-P8-NEXT: vmrghb v2, v3, v2 +; CHECK-P8-NEXT: vmrglh v3, v5, v4 +; CHECK-P8-NEXT: vmrglh v4, v1, v0 ; CHECK-P8-NEXT: vmrglh v5, v7, v6 -; CHECK-P8-NEXT: xxmrglw vs1, v5, v3 +; CHECK-P8-NEXT: vmrglh v2, v2, v8 +; CHECK-P8-NEXT: xxmrglw vs0, v4, v3 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -858,90 +858,90 @@ define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #2 ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: li r4, 80 -; CHECK-P8-NEXT: lxvd2x vs4, 0, r3 -; CHECK-P8-NEXT: lxvd2x vs3, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: li r4, 48 -; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs3, r3, r4 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs7, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs4, r3, r4 ; CHECK-P8-NEXT: li r4, 32 -; CHECK-P8-NEXT: lxvd2x vs9, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 64 -; CHECK-P8-NEXT: lxvd2x vs12, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs9, r3, r4 ; CHECK-P8-NEXT: li r4, 96 -; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs12, r3, r4 ; CHECK-P8-NEXT: li r4, 112 -; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 -; CHECK-P8-NEXT: xxswapd vs5, vs4 -; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r3, f4 -; CHECK-P8-NEXT: xscvdpsxws f5, f5 +; CHECK-P8-NEXT: lxvd2x v2, r3, r4 +; CHECK-P8-NEXT: xxswapd vs2, vs1 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: mtvsrd v4, r3 -; CHECK-P8-NEXT: xxswapd vs13, vs3 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 -; CHECK-P8-NEXT: xxswapd vs10, vs6 -; CHECK-P8-NEXT: xscvdpsxws f6, f6 +; CHECK-P8-NEXT: xxswapd vs10, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: xscvdpsxws f10, f10 -; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xxswapd vs7, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 ; CHECK-P8-NEXT: xscvdpsxws f7, f7 -; CHECK-P8-NEXT: mffprwz r4, f7 -; CHECK-P8-NEXT: xscvdpsxws f8, f8 +; CHECK-P8-NEXT: xxswapd vs5, vs4 +; CHECK-P8-NEXT: xscvdpsxws f4, f4 +; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: xscvdpsxws f5, f5 ; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: mffprwz r4, f6 -; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f3 +; CHECK-P8-NEXT: mtvsrd v1, r4 +; CHECK-P8-NEXT: mffprwz r4, f0 +; CHECK-P8-NEXT: xxswapd vs8, vs6 +; CHECK-P8-NEXT: xscvdpsxws f6, f6 +; CHECK-P8-NEXT: mffprwz r3, f6 +; CHECK-P8-NEXT: mtvsrd v0, r3 +; 
CHECK-P8-NEXT: xscvdpsxws f8, f8 +; CHECK-P8-NEXT: xscvdpsxws f0, f12 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f5 ; CHECK-P8-NEXT: xxswapd vs11, vs9 ; CHECK-P8-NEXT: xscvdpsxws f9, f9 ; CHECK-P8-NEXT: mffprwz r3, f9 -; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: mffprwz r4, f8 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: xxswapd v2, vs12 -; CHECK-P8-NEXT: xscvdpsxws f12, f12 -; CHECK-P8-NEXT: mffprwz r3, f12 ; CHECK-P8-NEXT: mtvsrd v6, r3 -; CHECK-P8-NEXT: mffprwz r3, f5 -; CHECK-P8-NEXT: xscvdpsxws v2, v2 +; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: xscvdpsxws f11, f11 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f11 -; CHECK-P8-NEXT: xxswapd v3, vs2 -; CHECK-P8-NEXT: xscvdpsxws v3, v3 -; CHECK-P8-NEXT: mffprwz r4, f10 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xxswapd vs1, vs0 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mffprwz r3, f8 +; CHECK-P8-NEXT: xxswapd vs13, vs12 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: mffprwz r4, f7 +; CHECK-P8-NEXT: xxswapd v3, v2 ; CHECK-P8-NEXT: vmrghb v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mfvsrwz r3, v2 -; CHECK-P8-NEXT: mtvsrd v2, r4 -; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: mffprwz r3, f11 ; CHECK-P8-NEXT: vmrghb v5, v9, v5 +; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: mffprwz r4, f10 ; CHECK-P8-NEXT: vmrghb v0, v8, v0 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mfvsrwz r3, v3 -; CHECK-P8-NEXT: vmrglh v4, v5, v4 -; CHECK-P8-NEXT: mtvsrd v3, r4 -; CHECK-P8-NEXT: vmrghb v2, v2, v1 -; CHECK-P8-NEXT: vmrghb v1, v8, v6 -; CHECK-P8-NEXT: mtvsrd v6, r3 -; CHECK-P8-NEXT: mffprwz r3, f2 -; CHECK-P8-NEXT: vmrglh v2, v2, v0 -; CHECK-P8-NEXT: vmrghb v3, v3, v7 -; CHECK-P8-NEXT: mtvsrd v7, r3 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: vmrglh v3, v3, v1 -; CHECK-P8-NEXT: vmrghb v6, v6, v7 -; CHECK-P8-NEXT: mtvsrd v7, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 +; CHECK-P8-NEXT: mffprwz r3, f13 +; CHECK-P8-NEXT: vmrghb v1, v9, v1 +; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: vmrghb v6, v8, v6 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: vmrghb v7, v7, v8 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, v3 +; CHECK-P8-NEXT: vmrghb v7, v9, v7 +; CHECK-P8-NEXT: mtvsrd v9, r3 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: xscvdpsxws f0, v2 +; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mffprwz r3, f0 +; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: vmrghb v8, v8, v9 +; CHECK-P8-NEXT: vmrghb v2, v3, v2 +; CHECK-P8-NEXT: vmrglh v3, v5, v4 +; CHECK-P8-NEXT: vmrglh v4, v1, v0 ; CHECK-P8-NEXT: vmrglh v5, v7, v6 -; CHECK-P8-NEXT: xxmrglw vs1, v5, v3 +; CHECK-P8-NEXT: vmrglh v2, v2, v8 +; CHECK-P8-NEXT: xxmrglw vs0, v4, v3 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll index 274f1cef49aa95..62b0bf1fa6a9c1 100644 --- a/llvm/test/CodeGen/RISCV/add-before-shl.ll +++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll @@ -200,23 +200,23 @@ define i128 @add_wide_operand(i128 %a) nounwind { ; ; RV32C-LABEL: add_wide_operand: ; RV32C: # %bb.0: -; RV32C-NEXT: lw a6, 4(a1) +; RV32C-NEXT: c.lw a4, 4(a1) ; RV32C-NEXT: c.lw a3, 12(a1) -; RV32C-NEXT: c.lw a4, 0(a1) +; RV32C-NEXT: lw a6, 0(a1) ; RV32C-NEXT: c.lw a1, 8(a1) ; 
RV32C-NEXT: c.lui a5, 16 ; RV32C-NEXT: c.add a3, a5 ; RV32C-NEXT: c.slli a3, 3 ; RV32C-NEXT: srli a5, a1, 29 ; RV32C-NEXT: c.or a3, a5 -; RV32C-NEXT: srli a5, a4, 29 -; RV32C-NEXT: slli a2, a6, 3 -; RV32C-NEXT: c.or a2, a5 ; RV32C-NEXT: srli a5, a6, 29 +; RV32C-NEXT: slli a2, a4, 3 +; RV32C-NEXT: c.or a2, a5 +; RV32C-NEXT: c.srli a4, 29 ; RV32C-NEXT: c.slli a1, 3 -; RV32C-NEXT: c.or a1, a5 -; RV32C-NEXT: c.slli a4, 3 -; RV32C-NEXT: c.sw a4, 0(a0) +; RV32C-NEXT: c.or a1, a4 +; RV32C-NEXT: c.slli a6, 3 +; RV32C-NEXT: sw a6, 0(a0) ; RV32C-NEXT: c.sw a1, 8(a0) ; RV32C-NEXT: c.sw a2, 4(a0) ; RV32C-NEXT: c.sw a3, 12(a0) diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll index 8a0c4240d161bf..7a982a341ef4fa 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll @@ -486,7 +486,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32IZFBFMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IZFBFMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFBFMIN-NEXT: call __fixsfdi@plt -; RV32IZFBFMIN-NEXT: lui a4, 524288 +; RV32IZFBFMIN-NEXT: lui a3, 524288 ; RV32IZFBFMIN-NEXT: lui a2, 524288 ; RV32IZFBFMIN-NEXT: beqz s0, .LBB10_2 ; RV32IZFBFMIN-NEXT: # %bb.1: # %start @@ -494,19 +494,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32IZFBFMIN-NEXT: .LBB10_2: # %start ; RV32IZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IZFBFMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IZFBFMIN-NEXT: beqz a3, .LBB10_4 +; RV32IZFBFMIN-NEXT: flt.s a4, fa5, fs0 +; RV32IZFBFMIN-NEXT: beqz a4, .LBB10_4 ; RV32IZFBFMIN-NEXT: # %bb.3: -; RV32IZFBFMIN-NEXT: addi a2, a4, -1 +; RV32IZFBFMIN-NEXT: addi a2, a3, -1 ; RV32IZFBFMIN-NEXT: .LBB10_4: # %start ; RV32IZFBFMIN-NEXT: feq.s a1, fs0, fs0 -; RV32IZFBFMIN-NEXT: neg a4, a1 -; RV32IZFBFMIN-NEXT: and a1, a4, a2 -; RV32IZFBFMIN-NEXT: neg a2, a3 -; RV32IZFBFMIN-NEXT: neg a3, s0 -; RV32IZFBFMIN-NEXT: and a0, a3, a0 -; RV32IZFBFMIN-NEXT: or a0, a2, a0 +; RV32IZFBFMIN-NEXT: neg a3, a1 +; RV32IZFBFMIN-NEXT: and a1, a3, a2 +; RV32IZFBFMIN-NEXT: neg a2, a4 +; RV32IZFBFMIN-NEXT: neg a4, s0 ; RV32IZFBFMIN-NEXT: and a0, a4, a0 +; RV32IZFBFMIN-NEXT: or a0, a2, a0 +; RV32IZFBFMIN-NEXT: and a0, a3, a0 ; RV32IZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFBFMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -525,7 +525,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; R32IDZFBFMIN-NEXT: fle.s s0, fa5, fs0 ; R32IDZFBFMIN-NEXT: fmv.s fa0, fs0 ; R32IDZFBFMIN-NEXT: call __fixsfdi@plt -; R32IDZFBFMIN-NEXT: lui a4, 524288 +; R32IDZFBFMIN-NEXT: lui a3, 524288 ; R32IDZFBFMIN-NEXT: lui a2, 524288 ; R32IDZFBFMIN-NEXT: beqz s0, .LBB10_2 ; R32IDZFBFMIN-NEXT: # %bb.1: # %start @@ -533,19 +533,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; R32IDZFBFMIN-NEXT: .LBB10_2: # %start ; R32IDZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0) ; R32IDZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; R32IDZFBFMIN-NEXT: flt.s a3, fa5, fs0 -; R32IDZFBFMIN-NEXT: beqz a3, .LBB10_4 +; R32IDZFBFMIN-NEXT: flt.s a4, fa5, fs0 +; R32IDZFBFMIN-NEXT: beqz a4, .LBB10_4 ; R32IDZFBFMIN-NEXT: # %bb.3: -; R32IDZFBFMIN-NEXT: addi a2, a4, -1 +; R32IDZFBFMIN-NEXT: addi a2, a3, -1 ; R32IDZFBFMIN-NEXT: .LBB10_4: # %start ; R32IDZFBFMIN-NEXT: feq.s a1, fs0, fs0 -; R32IDZFBFMIN-NEXT: neg a4, a1 -; R32IDZFBFMIN-NEXT: and a1, a4, a2 -; R32IDZFBFMIN-NEXT: neg a2, a3 -; R32IDZFBFMIN-NEXT: neg a3, s0 -; R32IDZFBFMIN-NEXT: and a0, a3, a0 -; R32IDZFBFMIN-NEXT: or a0, a2, a0 +; 
R32IDZFBFMIN-NEXT: neg a3, a1 +; R32IDZFBFMIN-NEXT: and a1, a3, a2 +; R32IDZFBFMIN-NEXT: neg a2, a4 +; R32IDZFBFMIN-NEXT: neg a4, s0 ; R32IDZFBFMIN-NEXT: and a0, a4, a0 +; R32IDZFBFMIN-NEXT: or a0, a2, a0 +; R32IDZFBFMIN-NEXT: and a0, a3, a0 ; R32IDZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; R32IDZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; R32IDZFBFMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -566,7 +566,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: fle.s s0, fa5, fs0 ; RV32ID-NEXT: fmv.s fa0, fs0 ; RV32ID-NEXT: call __fixsfdi@plt -; RV32ID-NEXT: lui a4, 524288 +; RV32ID-NEXT: lui a3, 524288 ; RV32ID-NEXT: lui a2, 524288 ; RV32ID-NEXT: beqz s0, .LBB10_2 ; RV32ID-NEXT: # %bb.1: # %start @@ -574,19 +574,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: .LBB10_2: # %start ; RV32ID-NEXT: lui a1, %hi(.LCPI10_0) ; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32ID-NEXT: flt.s a3, fa5, fs0 -; RV32ID-NEXT: beqz a3, .LBB10_4 +; RV32ID-NEXT: flt.s a4, fa5, fs0 +; RV32ID-NEXT: beqz a4, .LBB10_4 ; RV32ID-NEXT: # %bb.3: -; RV32ID-NEXT: addi a2, a4, -1 +; RV32ID-NEXT: addi a2, a3, -1 ; RV32ID-NEXT: .LBB10_4: # %start ; RV32ID-NEXT: feq.s a1, fs0, fs0 -; RV32ID-NEXT: neg a4, a1 -; RV32ID-NEXT: and a1, a4, a2 -; RV32ID-NEXT: neg a2, a3 -; RV32ID-NEXT: neg a3, s0 -; RV32ID-NEXT: and a0, a3, a0 -; RV32ID-NEXT: or a0, a2, a0 +; RV32ID-NEXT: neg a3, a1 +; RV32ID-NEXT: and a1, a3, a2 +; RV32ID-NEXT: neg a2, a4 +; RV32ID-NEXT: neg a4, s0 ; RV32ID-NEXT: and a0, a4, a0 +; RV32ID-NEXT: or a0, a2, a0 +; RV32ID-NEXT: and a0, a3, a0 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32ID-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/bfloat-select-fcmp.ll b/llvm/test/CodeGen/RISCV/bfloat-select-fcmp.ll index 8335e5430f34ed..ee3f1d2a3f3c2e 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-select-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-select-fcmp.ll @@ -17,14 +17,14 @@ define bfloat @select_fcmp_false(bfloat %a, bfloat %b) nounwind { define bfloat @select_fcmp_oeq(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: select_fcmp_oeq: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: feq.s a0, fa5, fa4 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: feq.s a0, fa4, fa5 ; CHECK-NEXT: bnez a0, .LBB1_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fmv.s fa5, fa4 +; CHECK-NEXT: fmv.s fa4, fa5 ; CHECK-NEXT: .LBB1_2: -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: fcvt.bf16.s fa0, fa4 ; CHECK-NEXT: ret %1 = fcmp oeq bfloat %a, %b %2 = select i1 %1, bfloat %a, bfloat %b @@ -68,14 +68,14 @@ define bfloat @select_fcmp_oge(bfloat %a, bfloat %b) nounwind { define bfloat @select_fcmp_olt(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: select_fcmp_olt: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: flt.s a0, fa5, fa4 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: flt.s a0, fa4, fa5 ; CHECK-NEXT: bnez a0, .LBB4_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fmv.s fa5, fa4 +; CHECK-NEXT: fmv.s fa4, fa5 ; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: fcvt.bf16.s fa0, fa4 ; CHECK-NEXT: ret %1 = fcmp olt bfloat %a, %b %2 = select i1 %1, bfloat %a, bfloat %b @@ -85,14 +85,14 @@ define bfloat @select_fcmp_olt(bfloat %a, bfloat %b) nounwind { define bfloat @select_fcmp_ole(bfloat %a, bfloat 
%b) nounwind { ; CHECK-LABEL: select_fcmp_ole: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fle.s a0, fa5, fa4 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: fle.s a0, fa4, fa5 ; CHECK-NEXT: bnez a0, .LBB5_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fmv.s fa5, fa4 +; CHECK-NEXT: fmv.s fa4, fa5 ; CHECK-NEXT: .LBB5_2: -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: fcvt.bf16.s fa0, fa4 ; CHECK-NEXT: ret %1 = fcmp ole bfloat %a, %b %2 = select i1 %1, bfloat %a, bfloat %b @@ -102,16 +102,16 @@ define bfloat @select_fcmp_ole(bfloat %a, bfloat %b) nounwind { define bfloat @select_fcmp_one(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: select_fcmp_one: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: flt.s a0, fa5, fa4 -; CHECK-NEXT: flt.s a1, fa4, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: flt.s a0, fa4, fa5 +; CHECK-NEXT: flt.s a1, fa5, fa4 ; CHECK-NEXT: or a0, a1, a0 ; CHECK-NEXT: bnez a0, .LBB6_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fmv.s fa5, fa4 +; CHECK-NEXT: fmv.s fa4, fa5 ; CHECK-NEXT: .LBB6_2: -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: fcvt.bf16.s fa0, fa4 ; CHECK-NEXT: ret %1 = fcmp one bfloat %a, %b %2 = select i1 %1, bfloat %a, bfloat %b @@ -140,16 +140,16 @@ define bfloat @select_fcmp_ord(bfloat %a, bfloat %b) nounwind { define bfloat @select_fcmp_ueq(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: select_fcmp_ueq: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: flt.s a0, fa5, fa4 -; CHECK-NEXT: flt.s a1, fa4, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: flt.s a0, fa4, fa5 +; CHECK-NEXT: flt.s a1, fa5, fa4 ; CHECK-NEXT: or a0, a1, a0 ; CHECK-NEXT: beqz a0, .LBB8_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fmv.s fa5, fa4 +; CHECK-NEXT: fmv.s fa4, fa5 ; CHECK-NEXT: .LBB8_2: -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: fcvt.bf16.s fa0, fa4 ; CHECK-NEXT: ret %1 = fcmp ueq bfloat %a, %b %2 = select i1 %1, bfloat %a, bfloat %b @@ -159,14 +159,14 @@ define bfloat @select_fcmp_ueq(bfloat %a, bfloat %b) nounwind { define bfloat @select_fcmp_ugt(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: select_fcmp_ugt: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fle.s a0, fa5, fa4 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: fle.s a0, fa4, fa5 ; CHECK-NEXT: beqz a0, .LBB9_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fmv.s fa5, fa4 +; CHECK-NEXT: fmv.s fa4, fa5 ; CHECK-NEXT: .LBB9_2: -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: fcvt.bf16.s fa0, fa4 ; CHECK-NEXT: ret %1 = fcmp ugt bfloat %a, %b %2 = select i1 %1, bfloat %a, bfloat %b @@ -176,14 +176,14 @@ define bfloat @select_fcmp_ugt(bfloat %a, bfloat %b) nounwind { define bfloat @select_fcmp_uge(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: select_fcmp_uge: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: flt.s a0, fa5, fa4 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: flt.s a0, fa4, fa5 ; CHECK-NEXT: beqz a0, .LBB10_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fmv.s fa5, fa4 +; CHECK-NEXT: fmv.s fa4, fa5 ; CHECK-NEXT: .LBB10_2: -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: fcvt.bf16.s fa0, fa4 ; CHECK-NEXT: ret %1 = fcmp uge bfloat %a, %b 
%2 = select i1 %1, bfloat %a, bfloat %b @@ -227,14 +227,14 @@ define bfloat @select_fcmp_ule(bfloat %a, bfloat %b) nounwind { define bfloat @select_fcmp_une(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: select_fcmp_une: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: feq.s a0, fa5, fa4 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: feq.s a0, fa4, fa5 ; CHECK-NEXT: beqz a0, .LBB13_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fmv.s fa5, fa4 +; CHECK-NEXT: fmv.s fa4, fa5 ; CHECK-NEXT: .LBB13_2: -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: fcvt.bf16.s fa0, fa4 ; CHECK-NEXT: ret %1 = fcmp une bfloat %a, %b %2 = select i1 %1, bfloat %a, bfloat %b diff --git a/llvm/test/CodeGen/RISCV/branch-relaxation.ll b/llvm/test/CodeGen/RISCV/branch-relaxation.ll index 4f7736e318cae6..8f8994dbe9342b 100644 --- a/llvm/test/CodeGen/RISCV/branch-relaxation.ll +++ b/llvm/test/CodeGen/RISCV/branch-relaxation.ll @@ -1839,220 +1839,220 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t0, -4(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, -8(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t1, -8(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, -12(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t1, 6 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t1, -12(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, -16(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t2, -16(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, -20(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t2, 7 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t2, -20(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, -24(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t3, -24(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, -28(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s0, 8 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s0, -28(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s0, -32(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s1, -32(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, -36(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s1, 9 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s1, -36(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, -40(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s2, -40(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, -44(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a0, 10 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a2, 1 ; CHECK-RV32-NEXT: add a2, sp, a2 -; CHECK-RV32-NEXT: sw a1, -44(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a1, -48(a2) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a1, 11 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a3, 
1 ; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: sw a1, -48(a3) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a1, -52(a3) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a2, -52(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, -56(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a2, 12 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a2, -56(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, -60(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a3, -60(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, -64(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a3, 13 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a3, -64(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, -68(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a4, -68(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, -72(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a4, 14 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a4, -72(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, -76(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a5, -76(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, -80(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a5, 15 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a5, -80(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, -84(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a6, -84(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, -88(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a6, 16 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a6, -88(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, -92(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a7, -92(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, -96(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a7, 17 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a7, -96(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, -100(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t0, -100(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, -104(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s2, 18 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s2, -104(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, -108(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s3, -108(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, -112(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s3, 19 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s3, -112(a1) # 
4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, -116(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s4, -116(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, -120(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s4, 20 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s4, -120(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, -124(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s5, -124(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, -128(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s5, 21 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s5, -128(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, -132(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s6, -132(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, -136(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s6, 22 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s6, -136(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, -140(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s7, -140(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, -144(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s7, 23 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s7, -144(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, -148(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s8, -148(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, -152(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s8, 24 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s8, -152(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, -156(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s9, -156(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, -160(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s9, 25 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s9, -160(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, -164(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s10, -164(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, -168(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s10, 26 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s10, -168(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, -172(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s11, -172(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s11, -176(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s11, 27 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s11, -176(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: 
sw s11, -180(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t3, 28 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t3, -180(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, -184(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t4, -184(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, -188(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t4, 29 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t4, -188(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, -192(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t5, -192(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t5, -196(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t5, 30 ; CHECK-RV32-NEXT: #NO_APP @@ -2062,18 +2062,16 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a2, 1 ; CHECK-RV32-NEXT: add a2, sp, a2 -; CHECK-RV32-NEXT: sw s0, -208(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s0, -204(a2) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a2, 1 ; CHECK-RV32-NEXT: add a2, sp, a2 -; CHECK-RV32-NEXT: sw a1, -196(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a1, -200(a2) # 4-byte Folded Spill ; CHECK-RV32-NEXT: xor a1, a1, s0 ; CHECK-RV32-NEXT: lui a2, 1 ; CHECK-RV32-NEXT: add a2, sp, a2 -; CHECK-RV32-NEXT: sw t6, -200(a2) # 4-byte Folded Spill -; CHECK-RV32-NEXT: lui a2, 1 -; CHECK-RV32-NEXT: add a2, sp, a2 -; CHECK-RV32-NEXT: sw t5, -204(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t6, -4(a2) # 4-byte Folded Spill ; CHECK-RV32-NEXT: xor a2, t5, t6 +; CHECK-RV32-NEXT: mv t6, t5 ; CHECK-RV32-NEXT: or a1, a2, a1 ; CHECK-RV32-NEXT: beqz a1, .LBB5_1 ; CHECK-RV32-NEXT: # %bb.3: @@ -2088,238 +2086,236 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t0, -4(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, -8(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t1, -8(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t1, -12(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t0 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t1, -12(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t1, -16(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t2, -16(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, -20(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t1 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t2, -20(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, -24(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t3, -24(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, -28(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t2 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s0, -28(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, -32(a1) # 4-byte Folded 
Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s1, -32(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, -36(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s0 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s1, -36(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, -40(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s2, -40(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, -44(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s1 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw a1, -44(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a1, -48(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a0 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a1, -48(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a1, -52(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a2, -52(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, -56(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a1 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a2, -56(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, -60(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a3, -60(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, -64(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a2 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a3, -64(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, -68(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a4, -68(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, -72(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a3 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a4, -72(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, -76(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a5, -76(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, -80(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a4 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a5, -80(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, -84(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a6, -84(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, -88(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a5 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a6, -88(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, -92(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a7, -92(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a7, -96(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: 
#APP ; CHECK-RV32-NEXT: # reg use a6 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a7, -96(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a7, -100(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t0, -100(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, -104(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a7 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s2, -104(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, -108(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s3, -108(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, -112(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s2 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s3, -112(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, -116(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s4, -116(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, -120(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s3 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s4, -120(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, -124(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s5, -124(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, -128(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s4 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s5, -128(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, -132(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s6, -132(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, -136(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s5 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s6, -136(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, -140(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s7, -140(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, -144(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s6 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s7, -144(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, -148(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s8, -148(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, -152(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s7 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s8, -152(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, -156(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s9, -156(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, -160(a0) # 4-byte Folded Reload ; 
CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s8 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s9, -160(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, -164(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s10, -164(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, -168(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s9 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s10, -168(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, -172(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s11, -172(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s11, -176(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s10 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s11, -176(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s11, -180(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s11 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t3, -180(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, -184(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t4, -184(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, -188(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t3 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t4, -188(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, -192(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t5, -192(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t5, -196(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t4 ; CHECK-RV32-NEXT: #NO_APP +; CHECK-RV32-NEXT: mv t5, t6 ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t5, -204(a0) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lui a0, 1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t6, -196(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t6, -200(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t5 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s0, -208(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, -204(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t6, -200(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t6, -4(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t6 ; CHECK-RV32-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll index 09ecbbc7e8feb8..32f5a5d76c5926 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll @@ -50,16 +50,16 @@ define void @callee() nounwind { ; RV32I-NEXT: sw s9, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a6, %hi(var) -; RV32I-NEXT: lw a0, %lo(var)(a6) +; RV32I-NEXT: lui a7, %hi(var) +; 
RV32I-NEXT: lw a0, %lo(var)(a7) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+4)(a6) +; RV32I-NEXT: lw a0, %lo(var+4)(a7) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+8)(a6) +; RV32I-NEXT: lw a0, %lo(var+8)(a7) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+12)(a6) +; RV32I-NEXT: lw a0, %lo(var+12)(a7) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a6, %lo(var) +; RV32I-NEXT: addi a5, a7, %lo(var) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -84,7 +84,7 @@ define void @callee() nounwind { ; RV32I-NEXT: lw s10, 92(a5) ; RV32I-NEXT: lw s11, 96(a5) ; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a7, 104(a5) +; RV32I-NEXT: lw a6, 104(a5) ; RV32I-NEXT: lw a4, 108(a5) ; RV32I-NEXT: lw a0, 124(a5) ; RV32I-NEXT: lw a1, 120(a5) @@ -95,7 +95,7 @@ define void @callee() nounwind { ; RV32I-NEXT: sw a2, 116(a5) ; RV32I-NEXT: sw a3, 112(a5) ; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a7, 104(a5) +; RV32I-NEXT: sw a6, 104(a5) ; RV32I-NEXT: sw ra, 100(a5) ; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) @@ -121,13 +121,13 @@ define void @callee() nounwind { ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+12)(a6) +; RV32I-NEXT: sw a0, %lo(var+12)(a7) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+8)(a6) +; RV32I-NEXT: sw a0, %lo(var+8)(a7) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+4)(a6) +; RV32I-NEXT: sw a0, %lo(var+4)(a7) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var)(a6) +; RV32I-NEXT: sw a0, %lo(var)(a7) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -161,16 +161,16 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: sw s11, 28(sp) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: addi s0, sp, 80 -; RV32I-WITH-FP-NEXT: lui a6, %hi(var) -; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(a6) +; RV32I-WITH-FP-NEXT: lui a7, %hi(var) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(a7) ; RV32I-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) ; RV32I-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) ; RV32I-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) ; RV32I-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: addi a5, a6, %lo(var) +; RV32I-WITH-FP-NEXT: addi a5, a7, %lo(var) ; RV32I-WITH-FP-NEXT: lw a0, 16(a5) ; RV32I-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: lw a0, 20(a5) @@ -196,7 +196,7 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: lw s11, 92(a5) ; RV32I-WITH-FP-NEXT: lw ra, 96(a5) ; RV32I-WITH-FP-NEXT: lw t0, 100(a5) -; RV32I-WITH-FP-NEXT: lw a7, 104(a5) +; RV32I-WITH-FP-NEXT: lw a6, 104(a5) ; RV32I-WITH-FP-NEXT: lw a4, 108(a5) ; RV32I-WITH-FP-NEXT: lw a0, 124(a5) ; RV32I-WITH-FP-NEXT: lw a1, 120(a5) @@ -207,7 +207,7 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: sw a2, 116(a5) ; 
RV32I-WITH-FP-NEXT: sw a3, 112(a5) ; RV32I-WITH-FP-NEXT: sw a4, 108(a5) -; RV32I-WITH-FP-NEXT: sw a7, 104(a5) +; RV32I-WITH-FP-NEXT: sw a6, 104(a5) ; RV32I-WITH-FP-NEXT: sw t0, 100(a5) ; RV32I-WITH-FP-NEXT: sw ra, 96(a5) ; RV32I-WITH-FP-NEXT: sw s11, 92(a5) @@ -234,13 +234,13 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: sw a0, 16(a5) ; RV32I-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) ; RV32I-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) ; RV32I-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) ; RV32I-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(a6) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(a7) ; RV32I-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -260,16 +260,16 @@ define void @callee() nounwind { ; RV32IZCMP-LABEL: callee: ; RV32IZCMP: # %bb.0: ; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-NEXT: lui a6, %hi(var) -; RV32IZCMP-NEXT: lw a0, %lo(var)(a6) +; RV32IZCMP-NEXT: lui a7, %hi(var) +; RV32IZCMP-NEXT: lw a0, %lo(var)(a7) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+4)(a6) +; RV32IZCMP-NEXT: lw a0, %lo(var+4)(a7) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+8)(a6) +; RV32IZCMP-NEXT: lw a0, %lo(var+8)(a7) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+12)(a6) +; RV32IZCMP-NEXT: lw a0, %lo(var+12)(a7) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a6, %lo(var) +; RV32IZCMP-NEXT: addi a5, a7, %lo(var) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -294,7 +294,7 @@ define void @callee() nounwind { ; RV32IZCMP-NEXT: lw t1, 92(a5) ; RV32IZCMP-NEXT: lw t0, 96(a5) ; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a7, 104(a5) +; RV32IZCMP-NEXT: lw a6, 104(a5) ; RV32IZCMP-NEXT: lw a4, 108(a5) ; RV32IZCMP-NEXT: lw a0, 124(a5) ; RV32IZCMP-NEXT: lw a1, 120(a5) @@ -305,7 +305,7 @@ define void @callee() nounwind { ; RV32IZCMP-NEXT: sw a2, 116(a5) ; RV32IZCMP-NEXT: sw a3, 112(a5) ; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a7, 104(a5) +; RV32IZCMP-NEXT: sw a6, 104(a5) ; RV32IZCMP-NEXT: sw s0, 100(a5) ; RV32IZCMP-NEXT: sw t0, 96(a5) ; RV32IZCMP-NEXT: sw t1, 92(a5) @@ -331,13 +331,13 @@ define void @callee() nounwind { ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+12)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var+12)(a7) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+8)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var+8)(a7) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+4)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var+4)(a7) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var)(a7) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV32IZCMP-WITH-FP-LABEL: callee: @@ 
-357,16 +357,16 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: sw s11, 28(sp) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: addi s0, sp, 80 -; RV32IZCMP-WITH-FP-NEXT: lui a6, %hi(var) -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a6) +; RV32IZCMP-WITH-FP-NEXT: lui a7, %hi(var) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a7) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: addi a5, a6, %lo(var) +; RV32IZCMP-WITH-FP-NEXT: addi a5, a7, %lo(var) ; RV32IZCMP-WITH-FP-NEXT: lw a0, 16(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: lw a0, 20(a5) @@ -392,7 +392,7 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: lw s1, 92(a5) ; RV32IZCMP-WITH-FP-NEXT: lw t1, 96(a5) ; RV32IZCMP-WITH-FP-NEXT: lw t0, 100(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a7, 104(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a6, 104(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a4, 108(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a0, 124(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a1, 120(a5) @@ -403,7 +403,7 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: sw a2, 116(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a3, 112(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a4, 108(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a7, 104(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a6, 104(a5) ; RV32IZCMP-WITH-FP-NEXT: sw t0, 100(a5) ; RV32IZCMP-WITH-FP-NEXT: sw t1, 96(a5) ; RV32IZCMP-WITH-FP-NEXT: sw s1, 92(a5) @@ -430,13 +430,13 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: sw a0, 16(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a7) ; RV32IZCMP-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -469,16 +469,16 @@ define void @callee() nounwind { ; RV64I-NEXT: sd s9, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a6, %hi(var) -; RV64I-NEXT: lw a0, %lo(var)(a6) +; RV64I-NEXT: lui a7, %hi(var) +; RV64I-NEXT: lw a0, %lo(var)(a7) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+4)(a6) +; RV64I-NEXT: lw a0, %lo(var+4)(a7) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+8)(a6) +; RV64I-NEXT: lw a0, %lo(var+8)(a7) ; RV64I-NEXT: sd 
a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+12)(a6) +; RV64I-NEXT: lw a0, %lo(var+12)(a7) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a6, %lo(var) +; RV64I-NEXT: addi a5, a7, %lo(var) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -503,7 +503,7 @@ define void @callee() nounwind { ; RV64I-NEXT: lw s10, 92(a5) ; RV64I-NEXT: lw s11, 96(a5) ; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a7, 104(a5) +; RV64I-NEXT: lw a6, 104(a5) ; RV64I-NEXT: lw a4, 108(a5) ; RV64I-NEXT: lw a0, 124(a5) ; RV64I-NEXT: lw a1, 120(a5) @@ -514,7 +514,7 @@ define void @callee() nounwind { ; RV64I-NEXT: sw a2, 116(a5) ; RV64I-NEXT: sw a3, 112(a5) ; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a7, 104(a5) +; RV64I-NEXT: sw a6, 104(a5) ; RV64I-NEXT: sw ra, 100(a5) ; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) @@ -540,13 +540,13 @@ define void @callee() nounwind { ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+12)(a6) +; RV64I-NEXT: sw a0, %lo(var+12)(a7) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+8)(a6) +; RV64I-NEXT: sw a0, %lo(var+8)(a7) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+4)(a6) +; RV64I-NEXT: sw a0, %lo(var+4)(a7) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var)(a6) +; RV64I-NEXT: sw a0, %lo(var)(a7) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload @@ -580,16 +580,16 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: sd s11, 56(sp) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: addi s0, sp, 160 -; RV64I-WITH-FP-NEXT: lui a6, %hi(var) -; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(a6) +; RV64I-WITH-FP-NEXT: lui a7, %hi(var) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(a7) ; RV64I-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) ; RV64I-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) ; RV64I-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) ; RV64I-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: addi a5, a6, %lo(var) +; RV64I-WITH-FP-NEXT: addi a5, a7, %lo(var) ; RV64I-WITH-FP-NEXT: lw a0, 16(a5) ; RV64I-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: lw a0, 20(a5) @@ -615,7 +615,7 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: lw s11, 92(a5) ; RV64I-WITH-FP-NEXT: lw ra, 96(a5) ; RV64I-WITH-FP-NEXT: lw t0, 100(a5) -; RV64I-WITH-FP-NEXT: lw a7, 104(a5) +; RV64I-WITH-FP-NEXT: lw a6, 104(a5) ; RV64I-WITH-FP-NEXT: lw a4, 108(a5) ; RV64I-WITH-FP-NEXT: lw a0, 124(a5) ; RV64I-WITH-FP-NEXT: lw a1, 120(a5) @@ -626,7 +626,7 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: sw a2, 116(a5) ; RV64I-WITH-FP-NEXT: sw a3, 112(a5) ; RV64I-WITH-FP-NEXT: sw a4, 108(a5) -; RV64I-WITH-FP-NEXT: sw a7, 104(a5) +; RV64I-WITH-FP-NEXT: sw a6, 104(a5) ; RV64I-WITH-FP-NEXT: sw t0, 100(a5) ; RV64I-WITH-FP-NEXT: sw ra, 96(a5) ; RV64I-WITH-FP-NEXT: sw s11, 92(a5) @@ -653,13 +653,13 @@ define void 
@callee() nounwind { ; RV64I-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: sw a0, 16(a5) ; RV64I-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) ; RV64I-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) ; RV64I-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) ; RV64I-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(a6) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(a7) ; RV64I-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: ld s1, 136(sp) # 8-byte Folded Reload @@ -679,16 +679,16 @@ define void @callee() nounwind { ; RV64IZCMP-LABEL: callee: ; RV64IZCMP: # %bb.0: ; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-NEXT: lui a6, %hi(var) -; RV64IZCMP-NEXT: lw a0, %lo(var)(a6) +; RV64IZCMP-NEXT: lui a7, %hi(var) +; RV64IZCMP-NEXT: lw a0, %lo(var)(a7) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+4)(a6) +; RV64IZCMP-NEXT: lw a0, %lo(var+4)(a7) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+8)(a6) +; RV64IZCMP-NEXT: lw a0, %lo(var+8)(a7) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+12)(a6) +; RV64IZCMP-NEXT: lw a0, %lo(var+12)(a7) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a6, %lo(var) +; RV64IZCMP-NEXT: addi a5, a7, %lo(var) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -713,7 +713,7 @@ define void @callee() nounwind { ; RV64IZCMP-NEXT: lw t1, 92(a5) ; RV64IZCMP-NEXT: lw t0, 96(a5) ; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a7, 104(a5) +; RV64IZCMP-NEXT: lw a6, 104(a5) ; RV64IZCMP-NEXT: lw a4, 108(a5) ; RV64IZCMP-NEXT: lw a0, 124(a5) ; RV64IZCMP-NEXT: lw a1, 120(a5) @@ -724,7 +724,7 @@ define void @callee() nounwind { ; RV64IZCMP-NEXT: sw a2, 116(a5) ; RV64IZCMP-NEXT: sw a3, 112(a5) ; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a7, 104(a5) +; RV64IZCMP-NEXT: sw a6, 104(a5) ; RV64IZCMP-NEXT: sw s0, 100(a5) ; RV64IZCMP-NEXT: sw t0, 96(a5) ; RV64IZCMP-NEXT: sw t1, 92(a5) @@ -750,13 +750,13 @@ define void @callee() nounwind { ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+12)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var+12)(a7) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+8)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var+8)(a7) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+4)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var+4)(a7) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var)(a7) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV64IZCMP-WITH-FP-LABEL: callee: @@ -776,16 +776,16 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: sd s11, 56(sp) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: addi s0, sp, 160 -; RV64IZCMP-WITH-FP-NEXT: lui a6, %hi(var) -; 
RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a6) +; RV64IZCMP-WITH-FP-NEXT: lui a7, %hi(var) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a7) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: addi a5, a6, %lo(var) +; RV64IZCMP-WITH-FP-NEXT: addi a5, a7, %lo(var) ; RV64IZCMP-WITH-FP-NEXT: lw a0, 16(a5) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: lw a0, 20(a5) @@ -811,7 +811,7 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: lw s1, 92(a5) ; RV64IZCMP-WITH-FP-NEXT: lw t1, 96(a5) ; RV64IZCMP-WITH-FP-NEXT: lw t0, 100(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a7, 104(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a6, 104(a5) ; RV64IZCMP-WITH-FP-NEXT: lw a4, 108(a5) ; RV64IZCMP-WITH-FP-NEXT: lw a0, 124(a5) ; RV64IZCMP-WITH-FP-NEXT: lw a1, 120(a5) @@ -822,7 +822,7 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: sw a2, 116(a5) ; RV64IZCMP-WITH-FP-NEXT: sw a3, 112(a5) ; RV64IZCMP-WITH-FP-NEXT: sw a4, 108(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a7, 104(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a6, 104(a5) ; RV64IZCMP-WITH-FP-NEXT: sw t0, 100(a5) ; RV64IZCMP-WITH-FP-NEXT: sw t1, 96(a5) ; RV64IZCMP-WITH-FP-NEXT: sw s1, 92(a5) @@ -849,13 +849,13 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: sw a0, 16(a5) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a7) ; RV64IZCMP-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: ld s1, 136(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll index 39ac963051b5b0..b7a3cadc680975 100644 --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -758,7 +758,7 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi@plt -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB12_2 ; RV32IFD-NEXT: # %bb.1: # %start @@ -766,19 +766,19 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32IFD-NEXT: .LBB12_2: # %start ; RV32IFD-NEXT: lui a1, %hi(.LCPI12_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI12_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB12_4 +; RV32IFD-NEXT: flt.d a4, fa5, 
fs0 +; RV32IFD-NEXT: beqz a4, .LBB12_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB12_4: # %start ; RV32IFD-NEXT: feq.d a1, fs0, fs0 -; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 -; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: neg a3, a1 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: neg a2, a4 +; RV32IFD-NEXT: neg a4, s0 ; RV32IFD-NEXT: and a0, a4, a0 +; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: and a0, a3, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -857,19 +857,19 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: lui a3, 278016 ; RV32I-NEXT: addi a3, a3, -1 ; RV32I-NEXT: li a2, -1 ; RV32I-NEXT: call __gtdf2@plt -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a3, 802304 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: li a2, 0 ; RV32I-NEXT: call __gedf2@plt ; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: call __fixdfdi@plt ; RV32I-NEXT: mv s4, a0 @@ -879,13 +879,13 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32I-NEXT: # %bb.1: # %start ; RV32I-NEXT: lui s5, 524288 ; RV32I-NEXT: .LBB12_2: # %start -; RV32I-NEXT: blez s2, .LBB12_4 +; RV32I-NEXT: blez s1, .LBB12_4 ; RV32I-NEXT: # %bb.3: # %start ; RV32I-NEXT: addi s5, a0, -1 ; RV32I-NEXT: .LBB12_4: # %start -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: mv a1, s0 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s2 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: call __unorddf2@plt ; RV32I-NEXT: snez a0, a0 @@ -894,7 +894,7 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32I-NEXT: slti a2, s3, 0 ; RV32I-NEXT: addi a2, a2, -1 ; RV32I-NEXT: and a2, a2, s4 -; RV32I-NEXT: sgtz a3, s2 +; RV32I-NEXT: sgtz a3, s1 ; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: or a2, a3, a2 ; RV32I-NEXT: and a0, a0, a2 diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll index 1fd0d629e9a7a9..7d134b3d1cf6f0 100644 --- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll @@ -60,7 +60,7 @@ define i64 @test_floor_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi@plt -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB1_2 ; RV32IFD-NEXT: # %bb.1: @@ -68,19 +68,19 @@ define i64 @test_floor_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB1_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI1_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI1_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB1_4 +; RV32IFD-NEXT: flt.d a4, fa5, fs0 +; RV32IFD-NEXT: beqz a4, .LBB1_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB1_4: ; RV32IFD-NEXT: feq.d a1, fs0, fs0 -; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 -; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: neg a3, a1 +; 
RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: neg a2, a4 +; RV32IFD-NEXT: neg a4, s0 ; RV32IFD-NEXT: and a0, a4, a0 +; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: and a0, a3, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -101,44 +101,44 @@ define i64 @test_floor_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: addi sp, sp, -32 ; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s2, 20(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call floor@plt ; RV32IZFINXZDINX-NEXT: sw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: lw s2, 8(sp) -; RV32IZFINXZDINX-NEXT: lw s3, 12(sp) +; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) +; RV32IZFINXZDINX-NEXT: lw s1, 12(sp) ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI1_0) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI1_0+4)(a2) ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) -; RV32IZFINXZDINX-NEXT: fle.d s0, a2, s2 +; RV32IZFINXZDINX-NEXT: fle.d s2, a2, s0 ; RV32IZFINXZDINX-NEXT: call __fixdfdi@plt ; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: lui a2, 524288 -; RV32IZFINXZDINX-NEXT: beqz s0, .LBB1_2 +; RV32IZFINXZDINX-NEXT: beqz s2, .LBB1_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: ; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB1_2: ; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI1_1) ; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI1_1)(a1) ; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI1_1+4)(a1) -; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s2 +; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s0 ; RV32IZFINXZDINX-NEXT: beqz a3, .LBB1_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: ; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB1_4: -; RV32IZFINXZDINX-NEXT: feq.d a1, s2, s2 +; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0 ; RV32IZFINXZDINX-NEXT: neg a4, a1 ; RV32IZFINXZDINX-NEXT: and a1, a4, a2 -; RV32IZFINXZDINX-NEXT: neg a2, s0 +; RV32IZFINXZDINX-NEXT: neg a2, s2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: neg a2, a3 ; RV32IZFINXZDINX-NEXT: or a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s2, 20(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 32 ; RV32IZFINXZDINX-NEXT: ret ; @@ -326,7 +326,7 @@ define i64 @test_ceil_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi@plt -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB5_2 ; RV32IFD-NEXT: # %bb.1: @@ -334,19 +334,19 @@ define i64 @test_ceil_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB5_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI5_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI5_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB5_4 +; RV32IFD-NEXT: flt.d a4, fa5, fs0 +; RV32IFD-NEXT: beqz a4, .LBB5_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; 
RV32IFD-NEXT: .LBB5_4: ; RV32IFD-NEXT: feq.d a1, fs0, fs0 -; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 -; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: neg a3, a1 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: neg a2, a4 +; RV32IFD-NEXT: neg a4, s0 ; RV32IFD-NEXT: and a0, a4, a0 +; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: and a0, a3, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -367,44 +367,44 @@ define i64 @test_ceil_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: addi sp, sp, -32 ; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s2, 20(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call ceil@plt ; RV32IZFINXZDINX-NEXT: sw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: lw s2, 8(sp) -; RV32IZFINXZDINX-NEXT: lw s3, 12(sp) +; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) +; RV32IZFINXZDINX-NEXT: lw s1, 12(sp) ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI5_0) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI5_0+4)(a2) ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI5_0)(a2) -; RV32IZFINXZDINX-NEXT: fle.d s0, a2, s2 +; RV32IZFINXZDINX-NEXT: fle.d s2, a2, s0 ; RV32IZFINXZDINX-NEXT: call __fixdfdi@plt ; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: lui a2, 524288 -; RV32IZFINXZDINX-NEXT: beqz s0, .LBB5_2 +; RV32IZFINXZDINX-NEXT: beqz s2, .LBB5_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: ; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB5_2: ; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI5_1) ; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI5_1)(a1) ; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI5_1+4)(a1) -; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s2 +; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s0 ; RV32IZFINXZDINX-NEXT: beqz a3, .LBB5_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: ; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB5_4: -; RV32IZFINXZDINX-NEXT: feq.d a1, s2, s2 +; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0 ; RV32IZFINXZDINX-NEXT: neg a4, a1 ; RV32IZFINXZDINX-NEXT: and a1, a4, a2 -; RV32IZFINXZDINX-NEXT: neg a2, s0 +; RV32IZFINXZDINX-NEXT: neg a2, s2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: neg a2, a3 ; RV32IZFINXZDINX-NEXT: or a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s2, 20(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 32 ; RV32IZFINXZDINX-NEXT: ret ; @@ -592,7 +592,7 @@ define i64 @test_trunc_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi@plt -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB9_2 ; RV32IFD-NEXT: # %bb.1: @@ -600,19 +600,19 @@ define i64 @test_trunc_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB9_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI9_1) ; 
RV32IFD-NEXT: fld fa5, %lo(.LCPI9_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB9_4 +; RV32IFD-NEXT: flt.d a4, fa5, fs0 +; RV32IFD-NEXT: beqz a4, .LBB9_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB9_4: ; RV32IFD-NEXT: feq.d a1, fs0, fs0 -; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 -; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: neg a3, a1 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: neg a2, a4 +; RV32IFD-NEXT: neg a4, s0 ; RV32IFD-NEXT: and a0, a4, a0 +; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: and a0, a3, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -633,44 +633,44 @@ define i64 @test_trunc_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: addi sp, sp, -32 ; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s2, 20(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call trunc@plt ; RV32IZFINXZDINX-NEXT: sw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: lw s2, 8(sp) -; RV32IZFINXZDINX-NEXT: lw s3, 12(sp) +; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) +; RV32IZFINXZDINX-NEXT: lw s1, 12(sp) ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI9_0) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI9_0+4)(a2) ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI9_0)(a2) -; RV32IZFINXZDINX-NEXT: fle.d s0, a2, s2 +; RV32IZFINXZDINX-NEXT: fle.d s2, a2, s0 ; RV32IZFINXZDINX-NEXT: call __fixdfdi@plt ; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: lui a2, 524288 -; RV32IZFINXZDINX-NEXT: beqz s0, .LBB9_2 +; RV32IZFINXZDINX-NEXT: beqz s2, .LBB9_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: ; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB9_2: ; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI9_1) ; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI9_1)(a1) ; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI9_1+4)(a1) -; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s2 +; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s0 ; RV32IZFINXZDINX-NEXT: beqz a3, .LBB9_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: ; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB9_4: -; RV32IZFINXZDINX-NEXT: feq.d a1, s2, s2 +; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0 ; RV32IZFINXZDINX-NEXT: neg a4, a1 ; RV32IZFINXZDINX-NEXT: and a1, a4, a2 -; RV32IZFINXZDINX-NEXT: neg a2, s0 +; RV32IZFINXZDINX-NEXT: neg a2, s2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: neg a2, a3 ; RV32IZFINXZDINX-NEXT: or a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s2, 20(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 32 ; RV32IZFINXZDINX-NEXT: ret ; @@ -858,7 +858,7 @@ define i64 @test_round_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi@plt -; RV32IFD-NEXT: lui a4, 
524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB13_2 ; RV32IFD-NEXT: # %bb.1: @@ -866,19 +866,19 @@ define i64 @test_round_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB13_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI13_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI13_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB13_4 +; RV32IFD-NEXT: flt.d a4, fa5, fs0 +; RV32IFD-NEXT: beqz a4, .LBB13_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB13_4: ; RV32IFD-NEXT: feq.d a1, fs0, fs0 -; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 -; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: neg a3, a1 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: neg a2, a4 +; RV32IFD-NEXT: neg a4, s0 ; RV32IFD-NEXT: and a0, a4, a0 +; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: and a0, a3, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -899,44 +899,44 @@ define i64 @test_round_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: addi sp, sp, -32 ; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s2, 20(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call round@plt ; RV32IZFINXZDINX-NEXT: sw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: lw s2, 8(sp) -; RV32IZFINXZDINX-NEXT: lw s3, 12(sp) +; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) +; RV32IZFINXZDINX-NEXT: lw s1, 12(sp) ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI13_0) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI13_0+4)(a2) ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI13_0)(a2) -; RV32IZFINXZDINX-NEXT: fle.d s0, a2, s2 +; RV32IZFINXZDINX-NEXT: fle.d s2, a2, s0 ; RV32IZFINXZDINX-NEXT: call __fixdfdi@plt ; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: lui a2, 524288 -; RV32IZFINXZDINX-NEXT: beqz s0, .LBB13_2 +; RV32IZFINXZDINX-NEXT: beqz s2, .LBB13_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: ; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB13_2: ; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI13_1) ; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI13_1)(a1) ; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI13_1+4)(a1) -; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s2 +; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s0 ; RV32IZFINXZDINX-NEXT: beqz a3, .LBB13_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: ; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB13_4: -; RV32IZFINXZDINX-NEXT: feq.d a1, s2, s2 +; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0 ; RV32IZFINXZDINX-NEXT: neg a4, a1 ; RV32IZFINXZDINX-NEXT: and a1, a4, a2 -; RV32IZFINXZDINX-NEXT: neg a2, s0 +; RV32IZFINXZDINX-NEXT: neg a2, s2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: neg a2, a3 ; RV32IZFINXZDINX-NEXT: or a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s2, 20(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 16(sp) # 
4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 32 ; RV32IZFINXZDINX-NEXT: ret ; @@ -1124,7 +1124,7 @@ define i64 @test_roundeven_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi@plt -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB17_2 ; RV32IFD-NEXT: # %bb.1: @@ -1132,19 +1132,19 @@ define i64 @test_roundeven_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB17_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI17_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI17_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB17_4 +; RV32IFD-NEXT: flt.d a4, fa5, fs0 +; RV32IFD-NEXT: beqz a4, .LBB17_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB17_4: ; RV32IFD-NEXT: feq.d a1, fs0, fs0 -; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 -; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: neg a3, a1 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: neg a2, a4 +; RV32IFD-NEXT: neg a4, s0 ; RV32IFD-NEXT: and a0, a4, a0 +; RV32IFD-NEXT: or a0, a2, a0 +; RV32IFD-NEXT: and a0, a3, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -1165,44 +1165,44 @@ define i64 @test_roundeven_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: addi sp, sp, -32 ; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s2, 20(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call roundeven@plt ; RV32IZFINXZDINX-NEXT: sw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: lw s2, 8(sp) -; RV32IZFINXZDINX-NEXT: lw s3, 12(sp) +; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) +; RV32IZFINXZDINX-NEXT: lw s1, 12(sp) ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI17_0) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI17_0+4)(a2) ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI17_0)(a2) -; RV32IZFINXZDINX-NEXT: fle.d s0, a2, s2 +; RV32IZFINXZDINX-NEXT: fle.d s2, a2, s0 ; RV32IZFINXZDINX-NEXT: call __fixdfdi@plt ; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: lui a2, 524288 -; RV32IZFINXZDINX-NEXT: beqz s0, .LBB17_2 +; RV32IZFINXZDINX-NEXT: beqz s2, .LBB17_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: ; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB17_2: ; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI17_1) ; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI17_1)(a1) ; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI17_1+4)(a1) -; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s2 +; RV32IZFINXZDINX-NEXT: flt.d a3, a6, s0 ; RV32IZFINXZDINX-NEXT: beqz a3, .LBB17_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: ; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB17_4: -; RV32IZFINXZDINX-NEXT: feq.d a1, s2, s2 +; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0 ; RV32IZFINXZDINX-NEXT: neg a4, a1 ; RV32IZFINXZDINX-NEXT: and a1, a4, a2 -; RV32IZFINXZDINX-NEXT: neg a2, s0 +; RV32IZFINXZDINX-NEXT: neg a2, s2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: neg a2, a3 ; RV32IZFINXZDINX-NEXT: or a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 
4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s2, 20(sp) # 4-byte Folded Reload -; RV32IZFINXZDINX-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 32 ; RV32IZFINXZDINX-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll index 83a4f63add337f..075678a6655b5e 100644 --- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -24,31 +24,31 @@ define void @_Z3foov() { ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_49) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_49) ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_48) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_48) -; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46) -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_45) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45) -; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v12, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v14, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vs2r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll index 235979b122215a..0468b961893aea 100644 --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -622,7 +622,7 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32IF-NEXT: fmv.w.x fa5, a0 ; RV32IF-NEXT: fle.s s0, fa5, fa0 ; RV32IF-NEXT: call __fixsfdi@plt -; RV32IF-NEXT: lui a4, 524288 +; RV32IF-NEXT: lui a3, 524288 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: beqz s0, .LBB12_2 ; RV32IF-NEXT: # %bb.1: # %start @@ -630,19 +630,19 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32IF-NEXT: .LBB12_2: # %start ; RV32IF-NEXT: lui a1, %hi(.LCPI12_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI12_0)(a1) -; RV32IF-NEXT: flt.s a3, fa5, fs0 -; RV32IF-NEXT: beqz a3, .LBB12_4 +; RV32IF-NEXT: flt.s a4, fa5, fs0 +; RV32IF-NEXT: beqz a4, .LBB12_4 ; RV32IF-NEXT: # %bb.3: -; RV32IF-NEXT: addi a2, a4, -1 +; RV32IF-NEXT: addi a2, a3, -1 ; RV32IF-NEXT: .LBB12_4: # %start ; RV32IF-NEXT: feq.s a1, fs0, fs0 -; RV32IF-NEXT: neg a4, a1 -; RV32IF-NEXT: and a1, a4, a2 -; RV32IF-NEXT: neg a2, a3 -; RV32IF-NEXT: neg a3, s0 -; RV32IF-NEXT: and a0, a3, a0 -; RV32IF-NEXT: or a0, 
a2, a0 +; RV32IF-NEXT: neg a3, a1 +; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: neg a2, a4 +; RV32IF-NEXT: neg a4, s0 ; RV32IF-NEXT: and a0, a4, a0 +; RV32IF-NEXT: or a0, a2, a0 +; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -715,35 +715,35 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a1, 913408 ; RV32I-NEXT: call __gesf2@plt -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __fixsfdi@plt ; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: mv s3, a1 ; RV32I-NEXT: lui s5, 524288 -; RV32I-NEXT: bgez s1, .LBB12_2 +; RV32I-NEXT: bgez s0, .LBB12_2 ; RV32I-NEXT: # %bb.1: # %start ; RV32I-NEXT: lui s3, 524288 ; RV32I-NEXT: .LBB12_2: # %start ; RV32I-NEXT: lui a1, 389120 ; RV32I-NEXT: addi a1, a1, -1 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __gtsf2@plt ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: blez a0, .LBB12_4 ; RV32I-NEXT: # %bb.3: # %start ; RV32I-NEXT: addi s3, s5, -1 ; RV32I-NEXT: .LBB12_4: # %start -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call __unordsf2@plt ; RV32I-NEXT: snez a0, a0 ; RV32I-NEXT: addi a0, a0, -1 ; RV32I-NEXT: and a1, a0, s3 -; RV32I-NEXT: slti a2, s1, 0 +; RV32I-NEXT: slti a2, s0, 0 ; RV32I-NEXT: addi a2, a2, -1 ; RV32I-NEXT: and a2, a2, s2 ; RV32I-NEXT: sgtz a3, s4 diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll index 61337216c7fb5b..cc6d2aa8f7ceb8 100644 --- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll @@ -115,23 +115,23 @@ define i64 @test_floor_si64(float %x) nounwind { ; RV32IZFINX-NEXT: lui a2, %hi(.LCPI1_0) ; RV32IZFINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 -; RV32IZFINX-NEXT: flt.s a4, a2, s0 -; RV32IZFINX-NEXT: neg a2, a4 +; RV32IZFINX-NEXT: flt.s a3, a2, s0 +; RV32IZFINX-NEXT: neg a2, a3 ; RV32IZFINX-NEXT: or a0, a2, a0 ; RV32IZFINX-NEXT: feq.s a2, s0, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: lui a5, 524288 -; RV32IZFINX-NEXT: lui a3, 524288 +; RV32IZFINX-NEXT: lui a4, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB1_4 ; RV32IZFINX-NEXT: # %bb.3: -; RV32IZFINX-NEXT: mv a3, a1 +; RV32IZFINX-NEXT: mv a4, a1 ; RV32IZFINX-NEXT: .LBB1_4: ; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: beqz a4, .LBB1_6 +; RV32IZFINX-NEXT: beqz a3, .LBB1_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a3, a5, -1 +; RV32IZFINX-NEXT: addi a4, a5, -1 ; RV32IZFINX-NEXT: .LBB1_6: -; RV32IZFINX-NEXT: and a1, a2, a3 +; RV32IZFINX-NEXT: and a1, a2, a4 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -375,23 +375,23 @@ define i64 @test_ceil_si64(float %x) nounwind { ; RV32IZFINX-NEXT: lui a2, %hi(.LCPI5_0) ; RV32IZFINX-NEXT: lw a2, %lo(.LCPI5_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 -; RV32IZFINX-NEXT: flt.s a4, a2, s0 -; RV32IZFINX-NEXT: neg a2, a4 +; RV32IZFINX-NEXT: flt.s a3, a2, s0 +; RV32IZFINX-NEXT: neg a2, a3 ; RV32IZFINX-NEXT: or a0, a2, a0 ; RV32IZFINX-NEXT: feq.s a2, s0, s0 ; RV32IZFINX-NEXT: neg a2, 
a2 ; RV32IZFINX-NEXT: lui a5, 524288 -; RV32IZFINX-NEXT: lui a3, 524288 +; RV32IZFINX-NEXT: lui a4, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB5_4 ; RV32IZFINX-NEXT: # %bb.3: -; RV32IZFINX-NEXT: mv a3, a1 +; RV32IZFINX-NEXT: mv a4, a1 ; RV32IZFINX-NEXT: .LBB5_4: ; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: beqz a4, .LBB5_6 +; RV32IZFINX-NEXT: beqz a3, .LBB5_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a3, a5, -1 +; RV32IZFINX-NEXT: addi a4, a5, -1 ; RV32IZFINX-NEXT: .LBB5_6: -; RV32IZFINX-NEXT: and a1, a2, a3 +; RV32IZFINX-NEXT: and a1, a2, a4 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -635,23 +635,23 @@ define i64 @test_trunc_si64(float %x) nounwind { ; RV32IZFINX-NEXT: lui a2, %hi(.LCPI9_0) ; RV32IZFINX-NEXT: lw a2, %lo(.LCPI9_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 -; RV32IZFINX-NEXT: flt.s a4, a2, s0 -; RV32IZFINX-NEXT: neg a2, a4 +; RV32IZFINX-NEXT: flt.s a3, a2, s0 +; RV32IZFINX-NEXT: neg a2, a3 ; RV32IZFINX-NEXT: or a0, a2, a0 ; RV32IZFINX-NEXT: feq.s a2, s0, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: lui a5, 524288 -; RV32IZFINX-NEXT: lui a3, 524288 +; RV32IZFINX-NEXT: lui a4, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB9_4 ; RV32IZFINX-NEXT: # %bb.3: -; RV32IZFINX-NEXT: mv a3, a1 +; RV32IZFINX-NEXT: mv a4, a1 ; RV32IZFINX-NEXT: .LBB9_4: ; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: beqz a4, .LBB9_6 +; RV32IZFINX-NEXT: beqz a3, .LBB9_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a3, a5, -1 +; RV32IZFINX-NEXT: addi a4, a5, -1 ; RV32IZFINX-NEXT: .LBB9_6: -; RV32IZFINX-NEXT: and a1, a2, a3 +; RV32IZFINX-NEXT: and a1, a2, a4 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -895,23 +895,23 @@ define i64 @test_round_si64(float %x) nounwind { ; RV32IZFINX-NEXT: lui a2, %hi(.LCPI13_0) ; RV32IZFINX-NEXT: lw a2, %lo(.LCPI13_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 -; RV32IZFINX-NEXT: flt.s a4, a2, s0 -; RV32IZFINX-NEXT: neg a2, a4 +; RV32IZFINX-NEXT: flt.s a3, a2, s0 +; RV32IZFINX-NEXT: neg a2, a3 ; RV32IZFINX-NEXT: or a0, a2, a0 ; RV32IZFINX-NEXT: feq.s a2, s0, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: lui a5, 524288 -; RV32IZFINX-NEXT: lui a3, 524288 +; RV32IZFINX-NEXT: lui a4, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB13_4 ; RV32IZFINX-NEXT: # %bb.3: -; RV32IZFINX-NEXT: mv a3, a1 +; RV32IZFINX-NEXT: mv a4, a1 ; RV32IZFINX-NEXT: .LBB13_4: ; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: beqz a4, .LBB13_6 +; RV32IZFINX-NEXT: beqz a3, .LBB13_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a3, a5, -1 +; RV32IZFINX-NEXT: addi a4, a5, -1 ; RV32IZFINX-NEXT: .LBB13_6: -; RV32IZFINX-NEXT: and a1, a2, a3 +; RV32IZFINX-NEXT: and a1, a2, a4 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1155,23 +1155,23 @@ define i64 @test_roundeven_si64(float %x) nounwind { ; RV32IZFINX-NEXT: lui a2, %hi(.LCPI17_0) ; RV32IZFINX-NEXT: lw a2, %lo(.LCPI17_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 -; RV32IZFINX-NEXT: flt.s a4, a2, s0 -; RV32IZFINX-NEXT: neg a2, a4 +; RV32IZFINX-NEXT: flt.s a3, a2, s0 +; RV32IZFINX-NEXT: neg a2, a3 ; RV32IZFINX-NEXT: or a0, a2, a0 ; RV32IZFINX-NEXT: feq.s a2, s0, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: lui a5, 524288 -; RV32IZFINX-NEXT: lui a3, 524288 +; 
RV32IZFINX-NEXT: lui a4, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB17_4 ; RV32IZFINX-NEXT: # %bb.3: -; RV32IZFINX-NEXT: mv a3, a1 +; RV32IZFINX-NEXT: mv a4, a1 ; RV32IZFINX-NEXT: .LBB17_4: ; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: beqz a4, .LBB17_6 +; RV32IZFINX-NEXT: beqz a3, .LBB17_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a3, a5, -1 +; RV32IZFINX-NEXT: addi a4, a5, -1 ; RV32IZFINX-NEXT: .LBB17_6: -; RV32IZFINX-NEXT: and a1, a2, a3 +; RV32IZFINX-NEXT: and a1, a2, a4 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/fmax-fmin.ll b/llvm/test/CodeGen/RISCV/fmax-fmin.ll index aac5e5efc2cf53..c64b844023ca56 100644 --- a/llvm/test/CodeGen/RISCV/fmax-fmin.ll +++ b/llvm/test/CodeGen/RISCV/fmax-fmin.ll @@ -31,14 +31,14 @@ define float @maxnum_f32_fast(float %x, float %y) nounwind { ; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; R32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; R32-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; R32-NEXT: mv s1, a1 -; R32-NEXT: mv s0, a0 +; R32-NEXT: mv s0, a1 +; R32-NEXT: mv s1, a0 ; R32-NEXT: call __gtsf2@plt ; R32-NEXT: bgtz a0, .LBB1_2 ; R32-NEXT: # %bb.1: -; R32-NEXT: mv s0, s1 +; R32-NEXT: mv s1, s0 ; R32-NEXT: .LBB1_2: -; R32-NEXT: mv a0, s0 +; R32-NEXT: mv a0, s1 ; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; R32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; R32-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -51,14 +51,14 @@ define float @maxnum_f32_fast(float %x, float %y) nounwind { ; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; R64-NEXT: mv s1, a1 -; R64-NEXT: mv s0, a0 +; R64-NEXT: mv s0, a1 +; R64-NEXT: mv s1, a0 ; R64-NEXT: call __gtsf2@plt ; R64-NEXT: bgtz a0, .LBB1_2 ; R64-NEXT: # %bb.1: -; R64-NEXT: mv s0, s1 +; R64-NEXT: mv s1, s0 ; R64-NEXT: .LBB1_2: -; R64-NEXT: mv a0, s0 +; R64-NEXT: mv a0, s1 ; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -99,9 +99,9 @@ define double @maxnum_f64_nnan(double %x, double %y) nounwind { ; R32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; R32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; R32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; R32-NEXT: mv s1, a3 +; R32-NEXT: mv s0, a3 ; R32-NEXT: mv s2, a2 -; R32-NEXT: mv s0, a1 +; R32-NEXT: mv s1, a1 ; R32-NEXT: mv s3, a0 ; R32-NEXT: call __gtdf2@plt ; R32-NEXT: mv a1, a0 @@ -110,16 +110,16 @@ define double @maxnum_f64_nnan(double %x, double %y) nounwind { ; R32-NEXT: # %bb.1: ; R32-NEXT: mv s3, s2 ; R32-NEXT: .LBB3_2: -; R32-NEXT: mv a1, s0 +; R32-NEXT: mv a1, s1 ; R32-NEXT: mv a2, s2 -; R32-NEXT: mv a3, s1 +; R32-NEXT: mv a3, s0 ; R32-NEXT: call __gtdf2@plt ; R32-NEXT: bgtz a0, .LBB3_4 ; R32-NEXT: # %bb.3: -; R32-NEXT: mv s0, s1 +; R32-NEXT: mv s1, s0 ; R32-NEXT: .LBB3_4: ; R32-NEXT: mv a0, s3 -; R32-NEXT: mv a1, s0 +; R32-NEXT: mv a1, s1 ; R32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; R32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; R32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -134,14 +134,14 @@ define double @maxnum_f64_nnan(double %x, double %y) nounwind { ; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; R64-NEXT: mv s1, a1 -; R64-NEXT: mv s0, a0 +; R64-NEXT: mv s0, a1 +; R64-NEXT: mv s1, a0 ; 
R64-NEXT: call __gtdf2@plt ; R64-NEXT: bgtz a0, .LBB3_2 ; R64-NEXT: # %bb.1: -; R64-NEXT: mv s0, s1 +; R64-NEXT: mv s1, s0 ; R64-NEXT: .LBB3_2: -; R64-NEXT: mv a0, s0 +; R64-NEXT: mv a0, s1 ; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -180,14 +180,14 @@ define float @minnum_f32_nnan(float %x, float %y) nounwind { ; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; R32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; R32-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; R32-NEXT: mv s1, a1 -; R32-NEXT: mv s0, a0 +; R32-NEXT: mv s0, a1 +; R32-NEXT: mv s1, a0 ; R32-NEXT: call __ltsf2@plt ; R32-NEXT: bltz a0, .LBB5_2 ; R32-NEXT: # %bb.1: -; R32-NEXT: mv s0, s1 +; R32-NEXT: mv s1, s0 ; R32-NEXT: .LBB5_2: -; R32-NEXT: mv a0, s0 +; R32-NEXT: mv a0, s1 ; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; R32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; R32-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -200,14 +200,14 @@ define float @minnum_f32_nnan(float %x, float %y) nounwind { ; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; R64-NEXT: mv s1, a1 -; R64-NEXT: mv s0, a0 +; R64-NEXT: mv s0, a1 +; R64-NEXT: mv s1, a0 ; R64-NEXT: call __ltsf2@plt ; R64-NEXT: bltz a0, .LBB5_2 ; R64-NEXT: # %bb.1: -; R64-NEXT: mv s0, s1 +; R64-NEXT: mv s1, s0 ; R64-NEXT: .LBB5_2: -; R64-NEXT: mv a0, s0 +; R64-NEXT: mv a0, s1 ; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -248,9 +248,9 @@ define double @minnum_f64_fast(double %x, double %y) nounwind { ; R32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; R32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; R32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; R32-NEXT: mv s1, a3 +; R32-NEXT: mv s0, a3 ; R32-NEXT: mv s2, a2 -; R32-NEXT: mv s0, a1 +; R32-NEXT: mv s1, a1 ; R32-NEXT: mv s3, a0 ; R32-NEXT: call __ltdf2@plt ; R32-NEXT: mv a1, a0 @@ -259,16 +259,16 @@ define double @minnum_f64_fast(double %x, double %y) nounwind { ; R32-NEXT: # %bb.1: ; R32-NEXT: mv s3, s2 ; R32-NEXT: .LBB7_2: -; R32-NEXT: mv a1, s0 +; R32-NEXT: mv a1, s1 ; R32-NEXT: mv a2, s2 -; R32-NEXT: mv a3, s1 +; R32-NEXT: mv a3, s0 ; R32-NEXT: call __ltdf2@plt ; R32-NEXT: bltz a0, .LBB7_4 ; R32-NEXT: # %bb.3: -; R32-NEXT: mv s0, s1 +; R32-NEXT: mv s1, s0 ; R32-NEXT: .LBB7_4: ; R32-NEXT: mv a0, s3 -; R32-NEXT: mv a1, s0 +; R32-NEXT: mv a1, s1 ; R32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; R32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; R32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -283,14 +283,14 @@ define double @minnum_f64_fast(double %x, double %y) nounwind { ; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; R64-NEXT: mv s1, a1 -; R64-NEXT: mv s0, a0 +; R64-NEXT: mv s0, a1 +; R64-NEXT: mv s1, a0 ; R64-NEXT: call __ltdf2@plt ; R64-NEXT: bltz a0, .LBB7_2 ; R64-NEXT: # %bb.1: -; R64-NEXT: mv s0, s1 +; R64-NEXT: mv s1, s0 ; R64-NEXT: .LBB7_2: -; R64-NEXT: mv a0, s0 +; R64-NEXT: mv a0, s1 ; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index b091b0613c0f30..b38c6ffde94388 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ 
b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -1069,27 +1069,27 @@ define i64 @stest_f64i64(double %x) { ; RV32IF-NEXT: mv a1, a5 ; RV32IF-NEXT: .LBB18_6: # %entry ; RV32IF-NEXT: or a4, t0, a4 -; RV32IF-NEXT: and a5, a7, a0 -; RV32IF-NEXT: and a2, a7, a2 +; RV32IF-NEXT: and a0, a7, a0 +; RV32IF-NEXT: and a5, a7, a2 ; RV32IF-NEXT: beq a1, a3, .LBB18_8 ; RV32IF-NEXT: # %bb.7: # %entry -; RV32IF-NEXT: sltu a0, a3, a1 +; RV32IF-NEXT: sltu a2, a3, a1 ; RV32IF-NEXT: j .LBB18_9 ; RV32IF-NEXT: .LBB18_8: -; RV32IF-NEXT: snez a0, a4 +; RV32IF-NEXT: snez a2, a4 ; RV32IF-NEXT: .LBB18_9: # %entry -; RV32IF-NEXT: and a2, a2, a5 +; RV32IF-NEXT: and a5, a5, a0 ; RV32IF-NEXT: li a3, -1 -; RV32IF-NEXT: beq a2, a3, .LBB18_11 +; RV32IF-NEXT: beq a5, a3, .LBB18_11 ; RV32IF-NEXT: # %bb.10: # %entry -; RV32IF-NEXT: slti a0, a5, 0 -; RV32IF-NEXT: xori a0, a0, 1 +; RV32IF-NEXT: slti a0, a0, 0 +; RV32IF-NEXT: xori a2, a0, 1 ; RV32IF-NEXT: .LBB18_11: # %entry -; RV32IF-NEXT: bnez a0, .LBB18_13 +; RV32IF-NEXT: bnez a2, .LBB18_13 ; RV32IF-NEXT: # %bb.12: # %entry ; RV32IF-NEXT: lui a1, 524288 ; RV32IF-NEXT: .LBB18_13: # %entry -; RV32IF-NEXT: neg a0, a0 +; RV32IF-NEXT: neg a0, a2 ; RV32IF-NEXT: and a0, a0, a4 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 @@ -1106,21 +1106,21 @@ define i64 @stest_f64i64(double %x) { ; RV64IF-NEXT: srli a3, a2, 1 ; RV64IF-NEXT: beqz a1, .LBB18_2 ; RV64IF-NEXT: # %bb.1: # %entry -; RV64IF-NEXT: slti a4, a1, 0 +; RV64IF-NEXT: slti a5, a1, 0 ; RV64IF-NEXT: j .LBB18_3 ; RV64IF-NEXT: .LBB18_2: -; RV64IF-NEXT: sltu a4, a0, a3 +; RV64IF-NEXT: sltu a5, a0, a3 ; RV64IF-NEXT: .LBB18_3: # %entry -; RV64IF-NEXT: neg a5, a4 -; RV64IF-NEXT: and a5, a5, a1 -; RV64IF-NEXT: bnez a4, .LBB18_5 +; RV64IF-NEXT: neg a4, a5 +; RV64IF-NEXT: and a4, a4, a1 +; RV64IF-NEXT: bnez a5, .LBB18_5 ; RV64IF-NEXT: # %bb.4: # %entry ; RV64IF-NEXT: mv a0, a3 ; RV64IF-NEXT: .LBB18_5: # %entry ; RV64IF-NEXT: slli a1, a2, 63 -; RV64IF-NEXT: beq a5, a2, .LBB18_7 +; RV64IF-NEXT: beq a4, a2, .LBB18_7 ; RV64IF-NEXT: # %bb.6: # %entry -; RV64IF-NEXT: slti a2, a5, 0 +; RV64IF-NEXT: slti a2, a4, 0 ; RV64IF-NEXT: xori a2, a2, 1 ; RV64IF-NEXT: beqz a2, .LBB18_8 ; RV64IF-NEXT: j .LBB18_9 @@ -1168,27 +1168,27 @@ define i64 @stest_f64i64(double %x) { ; RV32IFD-NEXT: mv a1, a5 ; RV32IFD-NEXT: .LBB18_6: # %entry ; RV32IFD-NEXT: or a4, t0, a4 -; RV32IFD-NEXT: and a5, a7, a0 -; RV32IFD-NEXT: and a2, a7, a2 +; RV32IFD-NEXT: and a0, a7, a0 +; RV32IFD-NEXT: and a5, a7, a2 ; RV32IFD-NEXT: beq a1, a3, .LBB18_8 ; RV32IFD-NEXT: # %bb.7: # %entry -; RV32IFD-NEXT: sltu a0, a3, a1 +; RV32IFD-NEXT: sltu a2, a3, a1 ; RV32IFD-NEXT: j .LBB18_9 ; RV32IFD-NEXT: .LBB18_8: -; RV32IFD-NEXT: snez a0, a4 +; RV32IFD-NEXT: snez a2, a4 ; RV32IFD-NEXT: .LBB18_9: # %entry -; RV32IFD-NEXT: and a2, a2, a5 +; RV32IFD-NEXT: and a5, a5, a0 ; RV32IFD-NEXT: li a3, -1 -; RV32IFD-NEXT: beq a2, a3, .LBB18_11 +; RV32IFD-NEXT: beq a5, a3, .LBB18_11 ; RV32IFD-NEXT: # %bb.10: # %entry -; RV32IFD-NEXT: slti a0, a5, 0 -; RV32IFD-NEXT: xori a0, a0, 1 +; RV32IFD-NEXT: slti a0, a0, 0 +; RV32IFD-NEXT: xori a2, a0, 1 ; RV32IFD-NEXT: .LBB18_11: # %entry -; RV32IFD-NEXT: bnez a0, .LBB18_13 +; RV32IFD-NEXT: bnez a2, .LBB18_13 ; RV32IFD-NEXT: # %bb.12: # %entry ; RV32IFD-NEXT: lui a1, 524288 ; RV32IFD-NEXT: .LBB18_13: # %entry -; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: neg a0, a2 ; RV32IFD-NEXT: and a0, a0, a4 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 @@ -1319,26 +1319,26 @@ define i64 @ustest_f64i64(double %x) 
{ ; RV32IF-NEXT: li a0, 1 ; RV32IF-NEXT: .LBB20_5: # %entry ; RV32IF-NEXT: lw a3, 8(sp) -; RV32IF-NEXT: lw a4, 12(sp) -; RV32IF-NEXT: and a5, a2, a1 -; RV32IF-NEXT: beqz a5, .LBB20_7 +; RV32IF-NEXT: lw a5, 12(sp) +; RV32IF-NEXT: and a6, a2, a1 +; RV32IF-NEXT: beqz a6, .LBB20_7 ; RV32IF-NEXT: # %bb.6: # %entry -; RV32IF-NEXT: sgtz a1, a5 +; RV32IF-NEXT: sgtz a4, a6 ; RV32IF-NEXT: j .LBB20_8 ; RV32IF-NEXT: .LBB20_7: -; RV32IF-NEXT: snez a1, a0 +; RV32IF-NEXT: snez a4, a0 ; RV32IF-NEXT: .LBB20_8: # %entry -; RV32IF-NEXT: and a4, a2, a4 -; RV32IF-NEXT: or a0, a0, a5 +; RV32IF-NEXT: and a1, a2, a5 +; RV32IF-NEXT: or a0, a0, a6 ; RV32IF-NEXT: and a2, a2, a3 ; RV32IF-NEXT: bnez a0, .LBB20_10 ; RV32IF-NEXT: # %bb.9: -; RV32IF-NEXT: or a0, a2, a4 -; RV32IF-NEXT: snez a1, a0 +; RV32IF-NEXT: or a0, a2, a1 +; RV32IF-NEXT: snez a4, a0 ; RV32IF-NEXT: .LBB20_10: # %entry -; RV32IF-NEXT: neg a1, a1 -; RV32IF-NEXT: and a0, a1, a2 -; RV32IF-NEXT: and a1, a1, a4 +; RV32IF-NEXT: neg a3, a4 +; RV32IF-NEXT: and a0, a3, a2 +; RV32IF-NEXT: and a1, a3, a1 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 ; RV32IF-NEXT: ret @@ -1398,26 +1398,26 @@ define i64 @ustest_f64i64(double %x) { ; RV32IFD-NEXT: li a0, 1 ; RV32IFD-NEXT: .LBB20_5: # %entry ; RV32IFD-NEXT: lw a3, 8(sp) -; RV32IFD-NEXT: lw a4, 12(sp) -; RV32IFD-NEXT: and a5, a2, a1 -; RV32IFD-NEXT: beqz a5, .LBB20_7 +; RV32IFD-NEXT: lw a5, 12(sp) +; RV32IFD-NEXT: and a6, a2, a1 +; RV32IFD-NEXT: beqz a6, .LBB20_7 ; RV32IFD-NEXT: # %bb.6: # %entry -; RV32IFD-NEXT: sgtz a1, a5 +; RV32IFD-NEXT: sgtz a4, a6 ; RV32IFD-NEXT: j .LBB20_8 ; RV32IFD-NEXT: .LBB20_7: -; RV32IFD-NEXT: snez a1, a0 +; RV32IFD-NEXT: snez a4, a0 ; RV32IFD-NEXT: .LBB20_8: # %entry -; RV32IFD-NEXT: and a4, a2, a4 -; RV32IFD-NEXT: or a0, a0, a5 +; RV32IFD-NEXT: and a1, a2, a5 +; RV32IFD-NEXT: or a0, a0, a6 ; RV32IFD-NEXT: and a2, a2, a3 ; RV32IFD-NEXT: bnez a0, .LBB20_10 ; RV32IFD-NEXT: # %bb.9: -; RV32IFD-NEXT: or a0, a2, a4 -; RV32IFD-NEXT: snez a1, a0 +; RV32IFD-NEXT: or a0, a2, a1 +; RV32IFD-NEXT: snez a4, a0 ; RV32IFD-NEXT: .LBB20_10: # %entry -; RV32IFD-NEXT: neg a1, a1 -; RV32IFD-NEXT: and a0, a1, a2 -; RV32IFD-NEXT: and a1, a1, a4 +; RV32IFD-NEXT: neg a3, a4 +; RV32IFD-NEXT: and a0, a3, a2 +; RV32IFD-NEXT: and a1, a3, a1 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 ; RV32IFD-NEXT: ret @@ -1466,27 +1466,27 @@ define i64 @stest_f32i64(float %x) { ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB21_6: # %entry ; RV32-NEXT: or a4, t0, a4 -; RV32-NEXT: and a5, a7, a0 -; RV32-NEXT: and a2, a7, a2 +; RV32-NEXT: and a0, a7, a0 +; RV32-NEXT: and a5, a7, a2 ; RV32-NEXT: beq a1, a3, .LBB21_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: sltu a2, a3, a1 ; RV32-NEXT: j .LBB21_9 ; RV32-NEXT: .LBB21_8: -; RV32-NEXT: snez a0, a4 +; RV32-NEXT: snez a2, a4 ; RV32-NEXT: .LBB21_9: # %entry -; RV32-NEXT: and a2, a2, a5 +; RV32-NEXT: and a5, a5, a0 ; RV32-NEXT: li a3, -1 -; RV32-NEXT: beq a2, a3, .LBB21_11 +; RV32-NEXT: beq a5, a3, .LBB21_11 ; RV32-NEXT: # %bb.10: # %entry -; RV32-NEXT: slti a0, a5, 0 -; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: xori a2, a0, 1 ; RV32-NEXT: .LBB21_11: # %entry -; RV32-NEXT: bnez a0, .LBB21_13 +; RV32-NEXT: bnez a2, .LBB21_13 ; RV32-NEXT: # %bb.12: # %entry ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB21_13: # %entry -; RV32-NEXT: neg a0, a0 +; RV32-NEXT: neg a0, a2 ; RV32-NEXT: and a0, a0, a4 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, 
sp, 32 @@ -1587,26 +1587,26 @@ define i64 @ustest_f32i64(float %x) { ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB23_5: # %entry ; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: lw a4, 12(sp) -; RV32-NEXT: and a5, a2, a1 -; RV32-NEXT: beqz a5, .LBB23_7 +; RV32-NEXT: lw a5, 12(sp) +; RV32-NEXT: and a6, a2, a1 +; RV32-NEXT: beqz a6, .LBB23_7 ; RV32-NEXT: # %bb.6: # %entry -; RV32-NEXT: sgtz a1, a5 +; RV32-NEXT: sgtz a4, a6 ; RV32-NEXT: j .LBB23_8 ; RV32-NEXT: .LBB23_7: -; RV32-NEXT: snez a1, a0 +; RV32-NEXT: snez a4, a0 ; RV32-NEXT: .LBB23_8: # %entry -; RV32-NEXT: and a4, a2, a4 -; RV32-NEXT: or a0, a0, a5 +; RV32-NEXT: and a1, a2, a5 +; RV32-NEXT: or a0, a0, a6 ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: bnez a0, .LBB23_10 ; RV32-NEXT: # %bb.9: -; RV32-NEXT: or a0, a2, a4 -; RV32-NEXT: snez a1, a0 +; RV32-NEXT: or a0, a2, a1 +; RV32-NEXT: snez a4, a0 ; RV32-NEXT: .LBB23_10: # %entry -; RV32-NEXT: neg a1, a1 -; RV32-NEXT: and a0, a1, a2 -; RV32-NEXT: and a1, a1, a4 +; RV32-NEXT: neg a3, a4 +; RV32-NEXT: and a0, a3, a2 +; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -1683,27 +1683,27 @@ define i64 @stest_f16i64(half %x) { ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB24_6: # %entry ; RV32-NEXT: or a4, t0, a4 -; RV32-NEXT: and a5, a7, a0 -; RV32-NEXT: and a2, a7, a2 +; RV32-NEXT: and a0, a7, a0 +; RV32-NEXT: and a5, a7, a2 ; RV32-NEXT: beq a1, a3, .LBB24_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: sltu a2, a3, a1 ; RV32-NEXT: j .LBB24_9 ; RV32-NEXT: .LBB24_8: -; RV32-NEXT: snez a0, a4 +; RV32-NEXT: snez a2, a4 ; RV32-NEXT: .LBB24_9: # %entry -; RV32-NEXT: and a2, a2, a5 +; RV32-NEXT: and a5, a5, a0 ; RV32-NEXT: li a3, -1 -; RV32-NEXT: beq a2, a3, .LBB24_11 +; RV32-NEXT: beq a5, a3, .LBB24_11 ; RV32-NEXT: # %bb.10: # %entry -; RV32-NEXT: slti a0, a5, 0 -; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: xori a2, a0, 1 ; RV32-NEXT: .LBB24_11: # %entry -; RV32-NEXT: bnez a0, .LBB24_13 +; RV32-NEXT: bnez a2, .LBB24_13 ; RV32-NEXT: # %bb.12: # %entry ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB24_13: # %entry -; RV32-NEXT: neg a0, a0 +; RV32-NEXT: neg a0, a2 ; RV32-NEXT: and a0, a0, a4 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 @@ -1721,21 +1721,21 @@ define i64 @stest_f16i64(half %x) { ; RV64-NEXT: srli a3, a2, 1 ; RV64-NEXT: beqz a1, .LBB24_2 ; RV64-NEXT: # %bb.1: # %entry -; RV64-NEXT: slti a4, a1, 0 +; RV64-NEXT: slti a5, a1, 0 ; RV64-NEXT: j .LBB24_3 ; RV64-NEXT: .LBB24_2: -; RV64-NEXT: sltu a4, a0, a3 +; RV64-NEXT: sltu a5, a0, a3 ; RV64-NEXT: .LBB24_3: # %entry -; RV64-NEXT: neg a5, a4 -; RV64-NEXT: and a5, a5, a1 -; RV64-NEXT: bnez a4, .LBB24_5 +; RV64-NEXT: neg a4, a5 +; RV64-NEXT: and a4, a4, a1 +; RV64-NEXT: bnez a5, .LBB24_5 ; RV64-NEXT: # %bb.4: # %entry ; RV64-NEXT: mv a0, a3 ; RV64-NEXT: .LBB24_5: # %entry ; RV64-NEXT: slli a1, a2, 63 -; RV64-NEXT: beq a5, a2, .LBB24_7 +; RV64-NEXT: beq a4, a2, .LBB24_7 ; RV64-NEXT: # %bb.6: # %entry -; RV64-NEXT: slti a2, a5, 0 +; RV64-NEXT: slti a2, a4, 0 ; RV64-NEXT: xori a2, a2, 1 ; RV64-NEXT: beqz a2, .LBB24_8 ; RV64-NEXT: j .LBB24_9 @@ -1838,26 +1838,26 @@ define i64 @ustest_f16i64(half %x) { ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB26_5: # %entry ; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: lw a4, 12(sp) -; RV32-NEXT: and a5, a2, a1 -; RV32-NEXT: beqz a5, .LBB26_7 +; RV32-NEXT: lw a5, 12(sp) +; RV32-NEXT: and a6, a2, a1 +; RV32-NEXT: beqz a6, .LBB26_7 ; RV32-NEXT: # %bb.6: # %entry -; RV32-NEXT: sgtz a1, 
a5 +; RV32-NEXT: sgtz a4, a6 ; RV32-NEXT: j .LBB26_8 ; RV32-NEXT: .LBB26_7: -; RV32-NEXT: snez a1, a0 +; RV32-NEXT: snez a4, a0 ; RV32-NEXT: .LBB26_8: # %entry -; RV32-NEXT: and a4, a2, a4 -; RV32-NEXT: or a0, a0, a5 +; RV32-NEXT: and a1, a2, a5 +; RV32-NEXT: or a0, a0, a6 ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: bnez a0, .LBB26_10 ; RV32-NEXT: # %bb.9: -; RV32-NEXT: or a0, a2, a4 -; RV32-NEXT: snez a1, a0 +; RV32-NEXT: or a0, a2, a1 +; RV32-NEXT: snez a4, a0 ; RV32-NEXT: .LBB26_10: # %entry -; RV32-NEXT: neg a1, a1 -; RV32-NEXT: and a0, a1, a2 -; RV32-NEXT: and a1, a1, a4 +; RV32-NEXT: neg a3, a4 +; RV32-NEXT: and a0, a3, a2 +; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -2918,27 +2918,27 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IF-NEXT: mv a1, a5 ; RV32IF-NEXT: .LBB45_6: # %entry ; RV32IF-NEXT: or a4, t0, a4 -; RV32IF-NEXT: and a5, a7, a0 -; RV32IF-NEXT: and a2, a7, a2 +; RV32IF-NEXT: and a0, a7, a0 +; RV32IF-NEXT: and a5, a7, a2 ; RV32IF-NEXT: beq a1, a3, .LBB45_8 ; RV32IF-NEXT: # %bb.7: # %entry -; RV32IF-NEXT: sltu a0, a3, a1 +; RV32IF-NEXT: sltu a2, a3, a1 ; RV32IF-NEXT: j .LBB45_9 ; RV32IF-NEXT: .LBB45_8: -; RV32IF-NEXT: snez a0, a4 +; RV32IF-NEXT: snez a2, a4 ; RV32IF-NEXT: .LBB45_9: # %entry -; RV32IF-NEXT: and a2, a2, a5 +; RV32IF-NEXT: and a5, a5, a0 ; RV32IF-NEXT: li a3, -1 -; RV32IF-NEXT: beq a2, a3, .LBB45_11 +; RV32IF-NEXT: beq a5, a3, .LBB45_11 ; RV32IF-NEXT: # %bb.10: # %entry -; RV32IF-NEXT: slti a0, a5, 0 -; RV32IF-NEXT: xori a0, a0, 1 +; RV32IF-NEXT: slti a0, a0, 0 +; RV32IF-NEXT: xori a2, a0, 1 ; RV32IF-NEXT: .LBB45_11: # %entry -; RV32IF-NEXT: bnez a0, .LBB45_13 +; RV32IF-NEXT: bnez a2, .LBB45_13 ; RV32IF-NEXT: # %bb.12: # %entry ; RV32IF-NEXT: lui a1, 524288 ; RV32IF-NEXT: .LBB45_13: # %entry -; RV32IF-NEXT: neg a0, a0 +; RV32IF-NEXT: neg a0, a2 ; RV32IF-NEXT: and a0, a0, a4 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 @@ -2955,21 +2955,21 @@ define i64 @stest_f64i64_mm(double %x) { ; RV64IF-NEXT: srli a3, a2, 1 ; RV64IF-NEXT: beqz a1, .LBB45_2 ; RV64IF-NEXT: # %bb.1: # %entry -; RV64IF-NEXT: slti a4, a1, 0 +; RV64IF-NEXT: slti a5, a1, 0 ; RV64IF-NEXT: j .LBB45_3 ; RV64IF-NEXT: .LBB45_2: -; RV64IF-NEXT: sltu a4, a0, a3 +; RV64IF-NEXT: sltu a5, a0, a3 ; RV64IF-NEXT: .LBB45_3: # %entry -; RV64IF-NEXT: neg a5, a4 -; RV64IF-NEXT: and a5, a5, a1 -; RV64IF-NEXT: bnez a4, .LBB45_5 +; RV64IF-NEXT: neg a4, a5 +; RV64IF-NEXT: and a4, a4, a1 +; RV64IF-NEXT: bnez a5, .LBB45_5 ; RV64IF-NEXT: # %bb.4: # %entry ; RV64IF-NEXT: mv a0, a3 ; RV64IF-NEXT: .LBB45_5: # %entry ; RV64IF-NEXT: slli a1, a2, 63 -; RV64IF-NEXT: beq a5, a2, .LBB45_7 +; RV64IF-NEXT: beq a4, a2, .LBB45_7 ; RV64IF-NEXT: # %bb.6: # %entry -; RV64IF-NEXT: slti a2, a5, 0 +; RV64IF-NEXT: slti a2, a4, 0 ; RV64IF-NEXT: xori a2, a2, 1 ; RV64IF-NEXT: beqz a2, .LBB45_8 ; RV64IF-NEXT: j .LBB45_9 @@ -3017,27 +3017,27 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IFD-NEXT: mv a1, a5 ; RV32IFD-NEXT: .LBB45_6: # %entry ; RV32IFD-NEXT: or a4, t0, a4 -; RV32IFD-NEXT: and a5, a7, a0 -; RV32IFD-NEXT: and a2, a7, a2 +; RV32IFD-NEXT: and a0, a7, a0 +; RV32IFD-NEXT: and a5, a7, a2 ; RV32IFD-NEXT: beq a1, a3, .LBB45_8 ; RV32IFD-NEXT: # %bb.7: # %entry -; RV32IFD-NEXT: sltu a0, a3, a1 +; RV32IFD-NEXT: sltu a2, a3, a1 ; RV32IFD-NEXT: j .LBB45_9 ; RV32IFD-NEXT: .LBB45_8: -; RV32IFD-NEXT: snez a0, a4 +; RV32IFD-NEXT: snez a2, a4 ; RV32IFD-NEXT: .LBB45_9: # %entry -; RV32IFD-NEXT: and a2, a2, a5 +; RV32IFD-NEXT: 
and a5, a5, a0 ; RV32IFD-NEXT: li a3, -1 -; RV32IFD-NEXT: beq a2, a3, .LBB45_11 +; RV32IFD-NEXT: beq a5, a3, .LBB45_11 ; RV32IFD-NEXT: # %bb.10: # %entry -; RV32IFD-NEXT: slti a0, a5, 0 -; RV32IFD-NEXT: xori a0, a0, 1 +; RV32IFD-NEXT: slti a0, a0, 0 +; RV32IFD-NEXT: xori a2, a0, 1 ; RV32IFD-NEXT: .LBB45_11: # %entry -; RV32IFD-NEXT: bnez a0, .LBB45_13 +; RV32IFD-NEXT: bnez a2, .LBB45_13 ; RV32IFD-NEXT: # %bb.12: # %entry ; RV32IFD-NEXT: lui a1, 524288 ; RV32IFD-NEXT: .LBB45_13: # %entry -; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: neg a0, a2 ; RV32IFD-NEXT: and a0, a0, a4 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 @@ -3146,29 +3146,29 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti@plt ; RV32IF-NEXT: lw a0, 8(sp) -; RV32IF-NEXT: lw a1, 12(sp) -; RV32IF-NEXT: lw a2, 20(sp) +; RV32IF-NEXT: lw a2, 12(sp) +; RV32IF-NEXT: lw a1, 20(sp) ; RV32IF-NEXT: lw a3, 16(sp) -; RV32IF-NEXT: beqz a2, .LBB47_2 +; RV32IF-NEXT: beqz a1, .LBB47_2 ; RV32IF-NEXT: # %bb.1: # %entry -; RV32IF-NEXT: slti a4, a2, 0 +; RV32IF-NEXT: slti a4, a1, 0 ; RV32IF-NEXT: j .LBB47_3 ; RV32IF-NEXT: .LBB47_2: ; RV32IF-NEXT: seqz a4, a3 ; RV32IF-NEXT: .LBB47_3: # %entry ; RV32IF-NEXT: xori a3, a3, 1 -; RV32IF-NEXT: or a3, a3, a2 +; RV32IF-NEXT: or a3, a3, a1 ; RV32IF-NEXT: seqz a3, a3 ; RV32IF-NEXT: addi a3, a3, -1 ; RV32IF-NEXT: and a3, a3, a4 ; RV32IF-NEXT: neg a3, a3 -; RV32IF-NEXT: and a1, a3, a1 -; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: and a2, a3, a2 -; RV32IF-NEXT: slti a2, a2, 0 -; RV32IF-NEXT: addi a2, a2, -1 -; RV32IF-NEXT: and a0, a2, a0 -; RV32IF-NEXT: and a1, a2, a1 +; RV32IF-NEXT: and a0, a3, a0 +; RV32IF-NEXT: and a1, a3, a1 +; RV32IF-NEXT: slti a1, a1, 0 +; RV32IF-NEXT: addi a1, a1, -1 +; RV32IF-NEXT: and a0, a1, a0 +; RV32IF-NEXT: and a1, a1, a2 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 ; RV32IF-NEXT: ret @@ -3204,29 +3204,29 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti@plt ; RV32IFD-NEXT: lw a0, 8(sp) -; RV32IFD-NEXT: lw a1, 12(sp) -; RV32IFD-NEXT: lw a2, 20(sp) +; RV32IFD-NEXT: lw a2, 12(sp) +; RV32IFD-NEXT: lw a1, 20(sp) ; RV32IFD-NEXT: lw a3, 16(sp) -; RV32IFD-NEXT: beqz a2, .LBB47_2 +; RV32IFD-NEXT: beqz a1, .LBB47_2 ; RV32IFD-NEXT: # %bb.1: # %entry -; RV32IFD-NEXT: slti a4, a2, 0 +; RV32IFD-NEXT: slti a4, a1, 0 ; RV32IFD-NEXT: j .LBB47_3 ; RV32IFD-NEXT: .LBB47_2: ; RV32IFD-NEXT: seqz a4, a3 ; RV32IFD-NEXT: .LBB47_3: # %entry ; RV32IFD-NEXT: xori a3, a3, 1 -; RV32IFD-NEXT: or a3, a3, a2 +; RV32IFD-NEXT: or a3, a3, a1 ; RV32IFD-NEXT: seqz a3, a3 ; RV32IFD-NEXT: addi a3, a3, -1 ; RV32IFD-NEXT: and a3, a3, a4 ; RV32IFD-NEXT: neg a3, a3 -; RV32IFD-NEXT: and a1, a3, a1 -; RV32IFD-NEXT: and a0, a3, a0 ; RV32IFD-NEXT: and a2, a3, a2 -; RV32IFD-NEXT: slti a2, a2, 0 -; RV32IFD-NEXT: addi a2, a2, -1 -; RV32IFD-NEXT: and a0, a2, a0 -; RV32IFD-NEXT: and a1, a2, a1 +; RV32IFD-NEXT: and a0, a3, a0 +; RV32IFD-NEXT: and a1, a3, a1 +; RV32IFD-NEXT: slti a1, a1, 0 +; RV32IFD-NEXT: addi a1, a1, -1 +; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: and a1, a1, a2 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 ; RV32IFD-NEXT: ret @@ -3273,27 +3273,27 @@ define i64 @stest_f32i64_mm(float %x) { ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB48_6: # %entry ; RV32-NEXT: or a4, t0, a4 -; RV32-NEXT: and a5, a7, a0 -; RV32-NEXT: and a2, a7, a2 +; RV32-NEXT: and a0, a7, a0 +; RV32-NEXT: and a5, a7, a2 ; 
RV32-NEXT: beq a1, a3, .LBB48_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: sltu a2, a3, a1 ; RV32-NEXT: j .LBB48_9 ; RV32-NEXT: .LBB48_8: -; RV32-NEXT: snez a0, a4 +; RV32-NEXT: snez a2, a4 ; RV32-NEXT: .LBB48_9: # %entry -; RV32-NEXT: and a2, a2, a5 +; RV32-NEXT: and a5, a5, a0 ; RV32-NEXT: li a3, -1 -; RV32-NEXT: beq a2, a3, .LBB48_11 +; RV32-NEXT: beq a5, a3, .LBB48_11 ; RV32-NEXT: # %bb.10: # %entry -; RV32-NEXT: slti a0, a5, 0 -; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: xori a2, a0, 1 ; RV32-NEXT: .LBB48_11: # %entry -; RV32-NEXT: bnez a0, .LBB48_13 +; RV32-NEXT: bnez a2, .LBB48_13 ; RV32-NEXT: # %bb.12: # %entry ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB48_13: # %entry -; RV32-NEXT: neg a0, a0 +; RV32-NEXT: neg a0, a2 ; RV32-NEXT: and a0, a0, a4 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 @@ -3372,29 +3372,29 @@ define i64 @ustest_f32i64_mm(float %x) { ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti@plt ; RV32-NEXT: lw a0, 8(sp) -; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a2, 20(sp) +; RV32-NEXT: lw a2, 12(sp) +; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a2, .LBB50_2 +; RV32-NEXT: beqz a1, .LBB50_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a2, 0 +; RV32-NEXT: slti a4, a1, 0 ; RV32-NEXT: j .LBB50_3 ; RV32-NEXT: .LBB50_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB50_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a2 +; RV32-NEXT: or a3, a3, a1 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 -; RV32-NEXT: and a1, a3, a1 -; RV32-NEXT: and a0, a3, a0 ; RV32-NEXT: and a2, a3, a2 -; RV32-NEXT: slti a2, a2, 0 -; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: slti a1, a1, 0 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -3464,27 +3464,27 @@ define i64 @stest_f16i64_mm(half %x) { ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB51_6: # %entry ; RV32-NEXT: or a4, t0, a4 -; RV32-NEXT: and a5, a7, a0 -; RV32-NEXT: and a2, a7, a2 +; RV32-NEXT: and a0, a7, a0 +; RV32-NEXT: and a5, a7, a2 ; RV32-NEXT: beq a1, a3, .LBB51_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: sltu a2, a3, a1 ; RV32-NEXT: j .LBB51_9 ; RV32-NEXT: .LBB51_8: -; RV32-NEXT: snez a0, a4 +; RV32-NEXT: snez a2, a4 ; RV32-NEXT: .LBB51_9: # %entry -; RV32-NEXT: and a2, a2, a5 +; RV32-NEXT: and a5, a5, a0 ; RV32-NEXT: li a3, -1 -; RV32-NEXT: beq a2, a3, .LBB51_11 +; RV32-NEXT: beq a5, a3, .LBB51_11 ; RV32-NEXT: # %bb.10: # %entry -; RV32-NEXT: slti a0, a5, 0 -; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: xori a2, a0, 1 ; RV32-NEXT: .LBB51_11: # %entry -; RV32-NEXT: bnez a0, .LBB51_13 +; RV32-NEXT: bnez a2, .LBB51_13 ; RV32-NEXT: # %bb.12: # %entry ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB51_13: # %entry -; RV32-NEXT: neg a0, a0 +; RV32-NEXT: neg a0, a2 ; RV32-NEXT: and a0, a0, a4 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 @@ -3502,21 +3502,21 @@ define i64 @stest_f16i64_mm(half %x) { ; RV64-NEXT: srli a3, a2, 1 ; RV64-NEXT: beqz a1, .LBB51_2 ; RV64-NEXT: # %bb.1: # %entry -; RV64-NEXT: slti a4, a1, 0 +; RV64-NEXT: slti a5, a1, 0 ; RV64-NEXT: j .LBB51_3 ; RV64-NEXT: .LBB51_2: -; RV64-NEXT: 
sltu a4, a0, a3 +; RV64-NEXT: sltu a5, a0, a3 ; RV64-NEXT: .LBB51_3: # %entry -; RV64-NEXT: neg a5, a4 -; RV64-NEXT: and a5, a5, a1 -; RV64-NEXT: bnez a4, .LBB51_5 +; RV64-NEXT: neg a4, a5 +; RV64-NEXT: and a4, a4, a1 +; RV64-NEXT: bnez a5, .LBB51_5 ; RV64-NEXT: # %bb.4: # %entry ; RV64-NEXT: mv a0, a3 ; RV64-NEXT: .LBB51_5: # %entry ; RV64-NEXT: slli a1, a2, 63 -; RV64-NEXT: beq a5, a2, .LBB51_7 +; RV64-NEXT: beq a4, a2, .LBB51_7 ; RV64-NEXT: # %bb.6: # %entry -; RV64-NEXT: slti a2, a5, 0 +; RV64-NEXT: slti a2, a4, 0 ; RV64-NEXT: xori a2, a2, 1 ; RV64-NEXT: beqz a2, .LBB51_8 ; RV64-NEXT: j .LBB51_9 @@ -3597,29 +3597,29 @@ define i64 @ustest_f16i64_mm(half %x) { ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti@plt ; RV32-NEXT: lw a0, 8(sp) -; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a2, 20(sp) +; RV32-NEXT: lw a2, 12(sp) +; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a2, .LBB53_2 +; RV32-NEXT: beqz a1, .LBB53_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a2, 0 +; RV32-NEXT: slti a4, a1, 0 ; RV32-NEXT: j .LBB53_3 ; RV32-NEXT: .LBB53_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB53_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a2 +; RV32-NEXT: or a3, a3, a1 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 -; RV32-NEXT: and a1, a3, a1 -; RV32-NEXT: and a0, a3, a0 ; RV32-NEXT: and a2, a3, a2 -; RV32-NEXT: slti a2, a2, 0 -; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: slti a1, a1, 0 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll index 2d3f40e15fe432..2dae52dc7922c9 100644 --- a/llvm/test/CodeGen/RISCV/half-convert.ll +++ b/llvm/test/CodeGen/RISCV/half-convert.ll @@ -2155,7 +2155,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZFH-NEXT: fle.s s0, fa5, fs0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixsfdi@plt -; RV32IZFH-NEXT: lui a4, 524288 +; RV32IZFH-NEXT: lui a3, 524288 ; RV32IZFH-NEXT: lui a2, 524288 ; RV32IZFH-NEXT: beqz s0, .LBB10_2 ; RV32IZFH-NEXT: # %bb.1: # %start @@ -2163,19 +2163,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZFH-NEXT: .LBB10_2: # %start ; RV32IZFH-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IZFH-NEXT: beqz a3, .LBB10_4 +; RV32IZFH-NEXT: flt.s a4, fa5, fs0 +; RV32IZFH-NEXT: beqz a4, .LBB10_4 ; RV32IZFH-NEXT: # %bb.3: -; RV32IZFH-NEXT: addi a2, a4, -1 +; RV32IZFH-NEXT: addi a2, a3, -1 ; RV32IZFH-NEXT: .LBB10_4: # %start ; RV32IZFH-NEXT: feq.s a1, fs0, fs0 -; RV32IZFH-NEXT: neg a4, a1 -; RV32IZFH-NEXT: and a1, a4, a2 -; RV32IZFH-NEXT: neg a2, a3 -; RV32IZFH-NEXT: neg a3, s0 -; RV32IZFH-NEXT: and a0, a3, a0 -; RV32IZFH-NEXT: or a0, a2, a0 +; RV32IZFH-NEXT: neg a3, a1 +; RV32IZFH-NEXT: and a1, a3, a2 +; RV32IZFH-NEXT: neg a2, a4 +; RV32IZFH-NEXT: neg a4, s0 ; RV32IZFH-NEXT: and a0, a4, a0 +; RV32IZFH-NEXT: or a0, a2, a0 +; RV32IZFH-NEXT: and a0, a3, a0 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -2203,7 +2203,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IDZFH-NEXT: fle.s s0, fa5, 
fs0 ; RV32IDZFH-NEXT: fmv.s fa0, fs0 ; RV32IDZFH-NEXT: call __fixsfdi@plt -; RV32IDZFH-NEXT: lui a4, 524288 +; RV32IDZFH-NEXT: lui a3, 524288 ; RV32IDZFH-NEXT: lui a2, 524288 ; RV32IDZFH-NEXT: beqz s0, .LBB10_2 ; RV32IDZFH-NEXT: # %bb.1: # %start @@ -2211,19 +2211,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IDZFH-NEXT: .LBB10_2: # %start ; RV32IDZFH-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IDZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IDZFH-NEXT: beqz a3, .LBB10_4 +; RV32IDZFH-NEXT: flt.s a4, fa5, fs0 +; RV32IDZFH-NEXT: beqz a4, .LBB10_4 ; RV32IDZFH-NEXT: # %bb.3: -; RV32IDZFH-NEXT: addi a2, a4, -1 +; RV32IDZFH-NEXT: addi a2, a3, -1 ; RV32IDZFH-NEXT: .LBB10_4: # %start ; RV32IDZFH-NEXT: feq.s a1, fs0, fs0 -; RV32IDZFH-NEXT: neg a4, a1 -; RV32IDZFH-NEXT: and a1, a4, a2 -; RV32IDZFH-NEXT: neg a2, a3 -; RV32IDZFH-NEXT: neg a3, s0 -; RV32IDZFH-NEXT: and a0, a3, a0 -; RV32IDZFH-NEXT: or a0, a2, a0 +; RV32IDZFH-NEXT: neg a3, a1 +; RV32IDZFH-NEXT: and a1, a3, a2 +; RV32IDZFH-NEXT: neg a2, a4 +; RV32IDZFH-NEXT: neg a4, s0 ; RV32IDZFH-NEXT: and a0, a4, a0 +; RV32IDZFH-NEXT: or a0, a2, a0 +; RV32IDZFH-NEXT: and a0, a3, a0 ; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IDZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IDZFH-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -2458,7 +2458,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: fsw fa4, 4(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: fle.s s0, fa5, fa4 ; RV32ID-ILP32-NEXT: call __fixsfdi@plt -; RV32ID-ILP32-NEXT: lui a4, 524288 +; RV32ID-ILP32-NEXT: lui a3, 524288 ; RV32ID-ILP32-NEXT: lui a2, 524288 ; RV32ID-ILP32-NEXT: beqz s0, .LBB10_2 ; RV32ID-ILP32-NEXT: # %bb.1: # %start @@ -2467,20 +2467,20 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI10_0) ; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI10_0)(a1) ; RV32ID-ILP32-NEXT: flw fa4, 4(sp) # 4-byte Folded Reload -; RV32ID-ILP32-NEXT: flt.s a3, fa5, fa4 +; RV32ID-ILP32-NEXT: flt.s a4, fa5, fa4 ; RV32ID-ILP32-NEXT: fmv.s fa5, fa4 -; RV32ID-ILP32-NEXT: beqz a3, .LBB10_4 +; RV32ID-ILP32-NEXT: beqz a4, .LBB10_4 ; RV32ID-ILP32-NEXT: # %bb.3: -; RV32ID-ILP32-NEXT: addi a2, a4, -1 +; RV32ID-ILP32-NEXT: addi a2, a3, -1 ; RV32ID-ILP32-NEXT: .LBB10_4: # %start ; RV32ID-ILP32-NEXT: feq.s a1, fa5, fa5 -; RV32ID-ILP32-NEXT: neg a4, a1 -; RV32ID-ILP32-NEXT: and a1, a4, a2 -; RV32ID-ILP32-NEXT: neg a2, a3 -; RV32ID-ILP32-NEXT: neg a3, s0 -; RV32ID-ILP32-NEXT: and a0, a3, a0 -; RV32ID-ILP32-NEXT: or a0, a2, a0 +; RV32ID-ILP32-NEXT: neg a3, a1 +; RV32ID-ILP32-NEXT: and a1, a3, a2 +; RV32ID-ILP32-NEXT: neg a2, a4 +; RV32ID-ILP32-NEXT: neg a4, s0 ; RV32ID-ILP32-NEXT: and a0, a4, a0 +; RV32ID-ILP32-NEXT: or a0, a2, a0 +; RV32ID-ILP32-NEXT: and a0, a3, a0 ; RV32ID-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: addi sp, sp, 16 @@ -2513,7 +2513,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-NEXT: fmv.w.x fa5, a0 ; RV32ID-NEXT: fle.s s0, fa5, fa0 ; RV32ID-NEXT: call __fixsfdi@plt -; RV32ID-NEXT: lui a4, 524288 +; RV32ID-NEXT: lui a3, 524288 ; RV32ID-NEXT: lui a2, 524288 ; RV32ID-NEXT: beqz s0, .LBB10_2 ; RV32ID-NEXT: # %bb.1: # %start @@ -2521,19 +2521,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-NEXT: .LBB10_2: # %start ; RV32ID-NEXT: lui a1, %hi(.LCPI10_0) ; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32ID-NEXT: flt.s a3, fa5, fs0 -; RV32ID-NEXT: beqz a3, .LBB10_4 +; RV32ID-NEXT: flt.s a4, fa5, 
fs0 +; RV32ID-NEXT: beqz a4, .LBB10_4 ; RV32ID-NEXT: # %bb.3: -; RV32ID-NEXT: addi a2, a4, -1 +; RV32ID-NEXT: addi a2, a3, -1 ; RV32ID-NEXT: .LBB10_4: # %start ; RV32ID-NEXT: feq.s a1, fs0, fs0 -; RV32ID-NEXT: neg a4, a1 -; RV32ID-NEXT: and a1, a4, a2 -; RV32ID-NEXT: neg a2, a3 -; RV32ID-NEXT: neg a3, s0 -; RV32ID-NEXT: and a0, a3, a0 -; RV32ID-NEXT: or a0, a2, a0 +; RV32ID-NEXT: neg a3, a1 +; RV32ID-NEXT: and a1, a3, a2 +; RV32ID-NEXT: neg a2, a4 +; RV32ID-NEXT: neg a4, s0 ; RV32ID-NEXT: and a0, a4, a0 +; RV32ID-NEXT: or a0, a2, a0 +; RV32ID-NEXT: and a0, a3, a0 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32ID-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -2566,7 +2566,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IFZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IFZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IFZFHMIN-NEXT: call __fixsfdi@plt -; RV32IFZFHMIN-NEXT: lui a4, 524288 +; RV32IFZFHMIN-NEXT: lui a3, 524288 ; RV32IFZFHMIN-NEXT: lui a2, 524288 ; RV32IFZFHMIN-NEXT: beqz s0, .LBB10_2 ; RV32IFZFHMIN-NEXT: # %bb.1: # %start @@ -2574,19 +2574,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IFZFHMIN-NEXT: .LBB10_2: # %start ; RV32IFZFHMIN-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IFZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IFZFHMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IFZFHMIN-NEXT: beqz a3, .LBB10_4 +; RV32IFZFHMIN-NEXT: flt.s a4, fa5, fs0 +; RV32IFZFHMIN-NEXT: beqz a4, .LBB10_4 ; RV32IFZFHMIN-NEXT: # %bb.3: -; RV32IFZFHMIN-NEXT: addi a2, a4, -1 +; RV32IFZFHMIN-NEXT: addi a2, a3, -1 ; RV32IFZFHMIN-NEXT: .LBB10_4: # %start ; RV32IFZFHMIN-NEXT: feq.s a1, fs0, fs0 -; RV32IFZFHMIN-NEXT: neg a4, a1 -; RV32IFZFHMIN-NEXT: and a1, a4, a2 -; RV32IFZFHMIN-NEXT: neg a2, a3 -; RV32IFZFHMIN-NEXT: neg a3, s0 -; RV32IFZFHMIN-NEXT: and a0, a3, a0 -; RV32IFZFHMIN-NEXT: or a0, a2, a0 +; RV32IFZFHMIN-NEXT: neg a3, a1 +; RV32IFZFHMIN-NEXT: and a1, a3, a2 +; RV32IFZFHMIN-NEXT: neg a2, a4 +; RV32IFZFHMIN-NEXT: neg a4, s0 ; RV32IFZFHMIN-NEXT: and a0, a4, a0 +; RV32IFZFHMIN-NEXT: or a0, a2, a0 +; RV32IFZFHMIN-NEXT: and a0, a3, a0 ; RV32IFZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -2615,7 +2615,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IDZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IDZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IDZFHMIN-NEXT: call __fixsfdi@plt -; RV32IDZFHMIN-NEXT: lui a4, 524288 +; RV32IDZFHMIN-NEXT: lui a3, 524288 ; RV32IDZFHMIN-NEXT: lui a2, 524288 ; RV32IDZFHMIN-NEXT: beqz s0, .LBB10_2 ; RV32IDZFHMIN-NEXT: # %bb.1: # %start @@ -2623,19 +2623,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IDZFHMIN-NEXT: .LBB10_2: # %start ; RV32IDZFHMIN-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IDZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IDZFHMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IDZFHMIN-NEXT: beqz a3, .LBB10_4 +; RV32IDZFHMIN-NEXT: flt.s a4, fa5, fs0 +; RV32IDZFHMIN-NEXT: beqz a4, .LBB10_4 ; RV32IDZFHMIN-NEXT: # %bb.3: -; RV32IDZFHMIN-NEXT: addi a2, a4, -1 +; RV32IDZFHMIN-NEXT: addi a2, a3, -1 ; RV32IDZFHMIN-NEXT: .LBB10_4: # %start ; RV32IDZFHMIN-NEXT: feq.s a1, fs0, fs0 -; RV32IDZFHMIN-NEXT: neg a4, a1 -; RV32IDZFHMIN-NEXT: and a1, a4, a2 -; RV32IDZFHMIN-NEXT: neg a2, a3 -; RV32IDZFHMIN-NEXT: neg a3, s0 -; RV32IDZFHMIN-NEXT: and a0, a3, a0 -; RV32IDZFHMIN-NEXT: or a0, a2, a0 +; RV32IDZFHMIN-NEXT: neg a3, a1 +; RV32IDZFHMIN-NEXT: and a1, a3, a2 +; RV32IDZFHMIN-NEXT: neg a2, a4 +; RV32IDZFHMIN-NEXT: neg a4, s0 ; 
RV32IDZFHMIN-NEXT: and a0, a4, a0 +; RV32IDZFHMIN-NEXT: or a0, a2, a0 +; RV32IDZFHMIN-NEXT: and a0, a3, a0 ; RV32IDZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IDZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IDZFHMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll index e7215f07c22045..1f035937b193c8 100644 --- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll @@ -177,16 +177,16 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lui a2, %hi(.LCPI1_1) ; RV32IZHINX-NEXT: lw a2, %lo(.LCPI1_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 -; RV32IZHINX-NEXT: flt.s a4, a2, s0 -; RV32IZHINX-NEXT: neg a2, a4 +; RV32IZHINX-NEXT: flt.s a3, a2, s0 +; RV32IZHINX-NEXT: neg a2, a3 ; RV32IZHINX-NEXT: or a0, a2, a0 ; RV32IZHINX-NEXT: feq.s a2, s0, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: lui a5, 524288 -; RV32IZHINX-NEXT: lui a3, 524288 +; RV32IZHINX-NEXT: lui a4, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB1_4 ; RV32IZHINX-NEXT: # %bb.3: -; RV32IZHINX-NEXT: mv a3, a1 +; RV32IZHINX-NEXT: mv a4, a1 ; RV32IZHINX-NEXT: .LBB1_4: ; RV32IZHINX-NEXT: and a0, a2, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -194,11 +194,11 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: addi sp, sp, 16 -; RV32IZHINX-NEXT: beqz a4, .LBB1_6 +; RV32IZHINX-NEXT: beqz a3, .LBB1_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a3, a5, -1 +; RV32IZHINX-NEXT: addi a4, a5, -1 ; RV32IZHINX-NEXT: .LBB1_6: -; RV32IZHINX-NEXT: and a1, a2, a3 +; RV32IZHINX-NEXT: and a1, a2, a4 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_floor_si64: @@ -320,16 +320,16 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) ; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 -; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 -; RV32IZHINXMIN-NEXT: neg a2, a4 +; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0 +; RV32IZHINXMIN-NEXT: neg a2, a3 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 ; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: lui a5, 524288 -; RV32IZHINXMIN-NEXT: lui a3, 524288 +; RV32IZHINXMIN-NEXT: lui a4, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB1_4 ; RV32IZHINXMIN-NEXT: # %bb.3: -; RV32IZHINXMIN-NEXT: mv a3, a1 +; RV32IZHINXMIN-NEXT: mv a4, a1 ; RV32IZHINXMIN-NEXT: .LBB1_4: ; RV32IZHINXMIN-NEXT: and a0, a2, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -337,11 +337,11 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: addi sp, sp, 16 -; RV32IZHINXMIN-NEXT: beqz a4, .LBB1_6 +; RV32IZHINXMIN-NEXT: beqz a3, .LBB1_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a3, a5, -1 +; RV32IZHINXMIN-NEXT: addi a4, a5, -1 ; RV32IZHINXMIN-NEXT: .LBB1_6: -; RV32IZHINXMIN-NEXT: and a1, a2, a3 +; RV32IZHINXMIN-NEXT: and a1, a2, a4 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_floor_si64: @@ -889,16 +889,16 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lui a2, %hi(.LCPI5_1) ; RV32IZHINX-NEXT: lw a2, %lo(.LCPI5_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 -; RV32IZHINX-NEXT: flt.s a4, a2, s0 -; RV32IZHINX-NEXT: neg a2, a4 +; RV32IZHINX-NEXT: 
flt.s a3, a2, s0 +; RV32IZHINX-NEXT: neg a2, a3 ; RV32IZHINX-NEXT: or a0, a2, a0 ; RV32IZHINX-NEXT: feq.s a2, s0, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: lui a5, 524288 -; RV32IZHINX-NEXT: lui a3, 524288 +; RV32IZHINX-NEXT: lui a4, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB5_4 ; RV32IZHINX-NEXT: # %bb.3: -; RV32IZHINX-NEXT: mv a3, a1 +; RV32IZHINX-NEXT: mv a4, a1 ; RV32IZHINX-NEXT: .LBB5_4: ; RV32IZHINX-NEXT: and a0, a2, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -906,11 +906,11 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: addi sp, sp, 16 -; RV32IZHINX-NEXT: beqz a4, .LBB5_6 +; RV32IZHINX-NEXT: beqz a3, .LBB5_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a3, a5, -1 +; RV32IZHINX-NEXT: addi a4, a5, -1 ; RV32IZHINX-NEXT: .LBB5_6: -; RV32IZHINX-NEXT: and a1, a2, a3 +; RV32IZHINX-NEXT: and a1, a2, a4 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_ceil_si64: @@ -1032,16 +1032,16 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI5_0) ; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI5_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 -; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 -; RV32IZHINXMIN-NEXT: neg a2, a4 +; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0 +; RV32IZHINXMIN-NEXT: neg a2, a3 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 ; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: lui a5, 524288 -; RV32IZHINXMIN-NEXT: lui a3, 524288 +; RV32IZHINXMIN-NEXT: lui a4, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB5_4 ; RV32IZHINXMIN-NEXT: # %bb.3: -; RV32IZHINXMIN-NEXT: mv a3, a1 +; RV32IZHINXMIN-NEXT: mv a4, a1 ; RV32IZHINXMIN-NEXT: .LBB5_4: ; RV32IZHINXMIN-NEXT: and a0, a2, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -1049,11 +1049,11 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: addi sp, sp, 16 -; RV32IZHINXMIN-NEXT: beqz a4, .LBB5_6 +; RV32IZHINXMIN-NEXT: beqz a3, .LBB5_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a3, a5, -1 +; RV32IZHINXMIN-NEXT: addi a4, a5, -1 ; RV32IZHINXMIN-NEXT: .LBB5_6: -; RV32IZHINXMIN-NEXT: and a1, a2, a3 +; RV32IZHINXMIN-NEXT: and a1, a2, a4 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_ceil_si64: @@ -1601,16 +1601,16 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lui a2, %hi(.LCPI9_1) ; RV32IZHINX-NEXT: lw a2, %lo(.LCPI9_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 -; RV32IZHINX-NEXT: flt.s a4, a2, s0 -; RV32IZHINX-NEXT: neg a2, a4 +; RV32IZHINX-NEXT: flt.s a3, a2, s0 +; RV32IZHINX-NEXT: neg a2, a3 ; RV32IZHINX-NEXT: or a0, a2, a0 ; RV32IZHINX-NEXT: feq.s a2, s0, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: lui a5, 524288 -; RV32IZHINX-NEXT: lui a3, 524288 +; RV32IZHINX-NEXT: lui a4, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB9_4 ; RV32IZHINX-NEXT: # %bb.3: -; RV32IZHINX-NEXT: mv a3, a1 +; RV32IZHINX-NEXT: mv a4, a1 ; RV32IZHINX-NEXT: .LBB9_4: ; RV32IZHINX-NEXT: and a0, a2, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -1618,11 +1618,11 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: addi sp, sp, 16 -; RV32IZHINX-NEXT: beqz a4, .LBB9_6 +; RV32IZHINX-NEXT: beqz a3, .LBB9_6 ; RV32IZHINX-NEXT: # %bb.5: -; 
RV32IZHINX-NEXT: addi a3, a5, -1 +; RV32IZHINX-NEXT: addi a4, a5, -1 ; RV32IZHINX-NEXT: .LBB9_6: -; RV32IZHINX-NEXT: and a1, a2, a3 +; RV32IZHINX-NEXT: and a1, a2, a4 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_trunc_si64: @@ -1744,16 +1744,16 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI9_0) ; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI9_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 -; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 -; RV32IZHINXMIN-NEXT: neg a2, a4 +; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0 +; RV32IZHINXMIN-NEXT: neg a2, a3 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 ; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: lui a5, 524288 -; RV32IZHINXMIN-NEXT: lui a3, 524288 +; RV32IZHINXMIN-NEXT: lui a4, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB9_4 ; RV32IZHINXMIN-NEXT: # %bb.3: -; RV32IZHINXMIN-NEXT: mv a3, a1 +; RV32IZHINXMIN-NEXT: mv a4, a1 ; RV32IZHINXMIN-NEXT: .LBB9_4: ; RV32IZHINXMIN-NEXT: and a0, a2, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -1761,11 +1761,11 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: addi sp, sp, 16 -; RV32IZHINXMIN-NEXT: beqz a4, .LBB9_6 +; RV32IZHINXMIN-NEXT: beqz a3, .LBB9_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a3, a5, -1 +; RV32IZHINXMIN-NEXT: addi a4, a5, -1 ; RV32IZHINXMIN-NEXT: .LBB9_6: -; RV32IZHINXMIN-NEXT: and a1, a2, a3 +; RV32IZHINXMIN-NEXT: and a1, a2, a4 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_trunc_si64: @@ -2313,16 +2313,16 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lui a2, %hi(.LCPI13_1) ; RV32IZHINX-NEXT: lw a2, %lo(.LCPI13_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 -; RV32IZHINX-NEXT: flt.s a4, a2, s0 -; RV32IZHINX-NEXT: neg a2, a4 +; RV32IZHINX-NEXT: flt.s a3, a2, s0 +; RV32IZHINX-NEXT: neg a2, a3 ; RV32IZHINX-NEXT: or a0, a2, a0 ; RV32IZHINX-NEXT: feq.s a2, s0, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: lui a5, 524288 -; RV32IZHINX-NEXT: lui a3, 524288 +; RV32IZHINX-NEXT: lui a4, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB13_4 ; RV32IZHINX-NEXT: # %bb.3: -; RV32IZHINX-NEXT: mv a3, a1 +; RV32IZHINX-NEXT: mv a4, a1 ; RV32IZHINX-NEXT: .LBB13_4: ; RV32IZHINX-NEXT: and a0, a2, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -2330,11 +2330,11 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: addi sp, sp, 16 -; RV32IZHINX-NEXT: beqz a4, .LBB13_6 +; RV32IZHINX-NEXT: beqz a3, .LBB13_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a3, a5, -1 +; RV32IZHINX-NEXT: addi a4, a5, -1 ; RV32IZHINX-NEXT: .LBB13_6: -; RV32IZHINX-NEXT: and a1, a2, a3 +; RV32IZHINX-NEXT: and a1, a2, a4 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_round_si64: @@ -2456,16 +2456,16 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI13_0) ; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI13_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 -; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 -; RV32IZHINXMIN-NEXT: neg a2, a4 +; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0 +; RV32IZHINXMIN-NEXT: neg a2, a3 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 ; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: lui a5, 524288 -; RV32IZHINXMIN-NEXT: lui a3, 524288 +; RV32IZHINXMIN-NEXT: lui a4, 524288 ; 
RV32IZHINXMIN-NEXT: beqz s1, .LBB13_4 ; RV32IZHINXMIN-NEXT: # %bb.3: -; RV32IZHINXMIN-NEXT: mv a3, a1 +; RV32IZHINXMIN-NEXT: mv a4, a1 ; RV32IZHINXMIN-NEXT: .LBB13_4: ; RV32IZHINXMIN-NEXT: and a0, a2, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -2473,11 +2473,11 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: addi sp, sp, 16 -; RV32IZHINXMIN-NEXT: beqz a4, .LBB13_6 +; RV32IZHINXMIN-NEXT: beqz a3, .LBB13_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a3, a5, -1 +; RV32IZHINXMIN-NEXT: addi a4, a5, -1 ; RV32IZHINXMIN-NEXT: .LBB13_6: -; RV32IZHINXMIN-NEXT: and a1, a2, a3 +; RV32IZHINXMIN-NEXT: and a1, a2, a4 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_round_si64: @@ -3025,16 +3025,16 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lui a2, %hi(.LCPI17_1) ; RV32IZHINX-NEXT: lw a2, %lo(.LCPI17_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 -; RV32IZHINX-NEXT: flt.s a4, a2, s0 -; RV32IZHINX-NEXT: neg a2, a4 +; RV32IZHINX-NEXT: flt.s a3, a2, s0 +; RV32IZHINX-NEXT: neg a2, a3 ; RV32IZHINX-NEXT: or a0, a2, a0 ; RV32IZHINX-NEXT: feq.s a2, s0, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: lui a5, 524288 -; RV32IZHINX-NEXT: lui a3, 524288 +; RV32IZHINX-NEXT: lui a4, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB17_4 ; RV32IZHINX-NEXT: # %bb.3: -; RV32IZHINX-NEXT: mv a3, a1 +; RV32IZHINX-NEXT: mv a4, a1 ; RV32IZHINX-NEXT: .LBB17_4: ; RV32IZHINX-NEXT: and a0, a2, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -3042,11 +3042,11 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: addi sp, sp, 16 -; RV32IZHINX-NEXT: beqz a4, .LBB17_6 +; RV32IZHINX-NEXT: beqz a3, .LBB17_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a3, a5, -1 +; RV32IZHINX-NEXT: addi a4, a5, -1 ; RV32IZHINX-NEXT: .LBB17_6: -; RV32IZHINX-NEXT: and a1, a2, a3 +; RV32IZHINX-NEXT: and a1, a2, a4 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_roundeven_si64: @@ -3168,16 +3168,16 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI17_0) ; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI17_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 -; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 -; RV32IZHINXMIN-NEXT: neg a2, a4 +; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0 +; RV32IZHINXMIN-NEXT: neg a2, a3 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 ; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: lui a5, 524288 -; RV32IZHINXMIN-NEXT: lui a3, 524288 +; RV32IZHINXMIN-NEXT: lui a4, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB17_4 ; RV32IZHINXMIN-NEXT: # %bb.3: -; RV32IZHINXMIN-NEXT: mv a3, a1 +; RV32IZHINXMIN-NEXT: mv a4, a1 ; RV32IZHINXMIN-NEXT: .LBB17_4: ; RV32IZHINXMIN-NEXT: and a0, a2, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -3185,11 +3185,11 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: addi sp, sp, 16 -; RV32IZHINXMIN-NEXT: beqz a4, .LBB17_6 +; RV32IZHINXMIN-NEXT: beqz a3, .LBB17_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a3, a5, -1 +; RV32IZHINXMIN-NEXT: addi a4, a5, -1 ; RV32IZHINXMIN-NEXT: .LBB17_6: -; RV32IZHINXMIN-NEXT: and a1, a2, a3 +; RV32IZHINXMIN-NEXT: 
and a1, a2, a4 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_roundeven_si64: diff --git a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll index 19bd36bcd690e7..1f36c8d6618d28 100644 --- a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll @@ -62,14 +62,14 @@ define half @select_fcmp_oeq(half %a, half %b) nounwind { ; ; CHECKIZFHMIN-LABEL: select_fcmp_oeq: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: feq.s a0, fa5, fa4 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: feq.s a0, fa4, fa5 ; CHECKIZFHMIN-NEXT: bnez a0, .LBB1_2 ; CHECKIZFHMIN-NEXT: # %bb.1: -; CHECKIZFHMIN-NEXT: fmv.s fa5, fa4 +; CHECKIZFHMIN-NEXT: fmv.s fa4, fa5 ; CHECKIZFHMIN-NEXT: .LBB1_2: -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa4 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: select_fcmp_oeq: @@ -203,14 +203,14 @@ define half @select_fcmp_olt(half %a, half %b) nounwind { ; ; CHECKIZFHMIN-LABEL: select_fcmp_olt: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: flt.s a0, fa5, fa4 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: flt.s a0, fa4, fa5 ; CHECKIZFHMIN-NEXT: bnez a0, .LBB4_2 ; CHECKIZFHMIN-NEXT: # %bb.1: -; CHECKIZFHMIN-NEXT: fmv.s fa5, fa4 +; CHECKIZFHMIN-NEXT: fmv.s fa4, fa5 ; CHECKIZFHMIN-NEXT: .LBB4_2: -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa4 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: select_fcmp_olt: @@ -250,14 +250,14 @@ define half @select_fcmp_ole(half %a, half %b) nounwind { ; ; CHECKIZFHMIN-LABEL: select_fcmp_ole: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fle.s a0, fa5, fa4 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: fle.s a0, fa4, fa5 ; CHECKIZFHMIN-NEXT: bnez a0, .LBB5_2 ; CHECKIZFHMIN-NEXT: # %bb.1: -; CHECKIZFHMIN-NEXT: fmv.s fa5, fa4 +; CHECKIZFHMIN-NEXT: fmv.s fa4, fa5 ; CHECKIZFHMIN-NEXT: .LBB5_2: -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa4 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: select_fcmp_ole: @@ -301,16 +301,16 @@ define half @select_fcmp_one(half %a, half %b) nounwind { ; ; CHECKIZFHMIN-LABEL: select_fcmp_one: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: flt.s a0, fa5, fa4 -; CHECKIZFHMIN-NEXT: flt.s a1, fa4, fa5 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: flt.s a0, fa4, fa5 +; CHECKIZFHMIN-NEXT: flt.s a1, fa5, fa4 ; CHECKIZFHMIN-NEXT: or a0, a1, a0 ; CHECKIZFHMIN-NEXT: bnez a0, .LBB6_2 ; CHECKIZFHMIN-NEXT: # %bb.1: -; CHECKIZFHMIN-NEXT: fmv.s fa5, fa4 +; CHECKIZFHMIN-NEXT: fmv.s fa4, fa5 ; CHECKIZFHMIN-NEXT: .LBB6_2: -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa4 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: select_fcmp_one: @@ -411,16 +411,16 @@ define half @select_fcmp_ueq(half %a, half %b) nounwind { ; ; CHECKIZFHMIN-LABEL: select_fcmp_ueq: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: flt.s a0, fa5, fa4 -; 
CHECKIZFHMIN-NEXT: flt.s a1, fa4, fa5 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: flt.s a0, fa4, fa5 +; CHECKIZFHMIN-NEXT: flt.s a1, fa5, fa4 ; CHECKIZFHMIN-NEXT: or a0, a1, a0 ; CHECKIZFHMIN-NEXT: beqz a0, .LBB8_2 ; CHECKIZFHMIN-NEXT: # %bb.1: -; CHECKIZFHMIN-NEXT: fmv.s fa5, fa4 +; CHECKIZFHMIN-NEXT: fmv.s fa4, fa5 ; CHECKIZFHMIN-NEXT: .LBB8_2: -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa4 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: select_fcmp_ueq: @@ -462,14 +462,14 @@ define half @select_fcmp_ugt(half %a, half %b) nounwind { ; ; CHECKIZFHMIN-LABEL: select_fcmp_ugt: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fle.s a0, fa5, fa4 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: fle.s a0, fa4, fa5 ; CHECKIZFHMIN-NEXT: beqz a0, .LBB9_2 ; CHECKIZFHMIN-NEXT: # %bb.1: -; CHECKIZFHMIN-NEXT: fmv.s fa5, fa4 +; CHECKIZFHMIN-NEXT: fmv.s fa4, fa5 ; CHECKIZFHMIN-NEXT: .LBB9_2: -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa4 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: select_fcmp_ugt: @@ -509,14 +509,14 @@ define half @select_fcmp_uge(half %a, half %b) nounwind { ; ; CHECKIZFHMIN-LABEL: select_fcmp_uge: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: flt.s a0, fa5, fa4 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: flt.s a0, fa4, fa5 ; CHECKIZFHMIN-NEXT: beqz a0, .LBB10_2 ; CHECKIZFHMIN-NEXT: # %bb.1: -; CHECKIZFHMIN-NEXT: fmv.s fa5, fa4 +; CHECKIZFHMIN-NEXT: fmv.s fa4, fa5 ; CHECKIZFHMIN-NEXT: .LBB10_2: -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa4 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: select_fcmp_uge: @@ -650,14 +650,14 @@ define half @select_fcmp_une(half %a, half %b) nounwind { ; ; CHECKIZFHMIN-LABEL: select_fcmp_une: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: feq.s a0, fa5, fa4 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: feq.s a0, fa4, fa5 ; CHECKIZFHMIN-NEXT: beqz a0, .LBB13_2 ; CHECKIZFHMIN-NEXT: # %bb.1: -; CHECKIZFHMIN-NEXT: fmv.s fa5, fa4 +; CHECKIZFHMIN-NEXT: fmv.s fa4, fa5 ; CHECKIZFHMIN-NEXT: .LBB13_2: -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa4 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: select_fcmp_une: diff --git a/llvm/test/CodeGen/RISCV/min-max.ll b/llvm/test/CodeGen/RISCV/min-max.ll index 0115b48b7124c9..171da1e35c627f 100644 --- a/llvm/test/CodeGen/RISCV/min-max.ll +++ b/llvm/test/CodeGen/RISCV/min-max.ll @@ -684,13 +684,15 @@ define i64 @umax_i64_one(i64 %a, i64 %b) { ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: beqz a1, .LBB28_4 ; RV32I-NEXT: .LBB28_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB28_3: -; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: bnez a1, .LBB28_2 ; RV32I-NEXT: .LBB28_4: -; RV32I-NEXT: seqz a0, a2 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: seqz a2, a0 +; RV32I-NEXT: add a2, a0, a2 +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: umax_i64_one: @@ -701,18 +703,20 @@ define i64 @umax_i64_one(i64 %a, i64 %b) { ; ; RV32ZBB-LABEL: umax_i64_one: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: mv a2, a0 ; 
RV32ZBB-NEXT: li a3, 1 +; RV32ZBB-NEXT: mv a2, a0 ; RV32ZBB-NEXT: beqz a1, .LBB28_3 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: beqz a1, .LBB28_4 ; RV32ZBB-NEXT: .LBB28_2: +; RV32ZBB-NEXT: mv a0, a2 ; RV32ZBB-NEXT: ret ; RV32ZBB-NEXT: .LBB28_3: -; RV32ZBB-NEXT: li a0, 1 +; RV32ZBB-NEXT: li a2, 1 ; RV32ZBB-NEXT: bnez a1, .LBB28_2 ; RV32ZBB-NEXT: .LBB28_4: -; RV32ZBB-NEXT: maxu a0, a2, a3 +; RV32ZBB-NEXT: maxu a2, a0, a3 +; RV32ZBB-NEXT: mv a0, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: umax_i64_one: diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index f2b7e8d26328d5..341db9a1a172a8 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1252,39 +1252,39 @@ define i128 @muli128_m63(i128 %a) nounwind { ; RV32I-LABEL: muli128_m63: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a4, 12(a1) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: slli a3, a2, 6 -; RV32I-NEXT: sltu a5, a2, a3 -; RV32I-NEXT: srli a7, a2, 26 -; RV32I-NEXT: slli t0, a1, 6 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: mv t0, a5 -; RV32I-NEXT: beq a1, a7, .LBB31_2 +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: slli a1, a2, 6 +; RV32I-NEXT: sltu a4, a2, a1 +; RV32I-NEXT: srli a6, a2, 26 +; RV32I-NEXT: slli t0, a3, 6 +; RV32I-NEXT: or a6, t0, a6 +; RV32I-NEXT: mv t0, a4 +; RV32I-NEXT: beq a3, a6, .LBB31_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t0, a1, a7 +; RV32I-NEXT: sltu t0, a3, a6 ; RV32I-NEXT: .LBB31_2: -; RV32I-NEXT: srli t1, a1, 26 -; RV32I-NEXT: slli t2, a6, 6 +; RV32I-NEXT: srli t1, a3, 26 +; RV32I-NEXT: slli t2, a7, 6 ; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: sub t2, a6, t1 +; RV32I-NEXT: sub t2, a7, t1 ; RV32I-NEXT: sltu t3, t2, t0 -; RV32I-NEXT: sltu t1, a6, t1 -; RV32I-NEXT: srli a6, a6, 26 -; RV32I-NEXT: slli t4, a4, 6 -; RV32I-NEXT: or a6, t4, a6 -; RV32I-NEXT: sub a4, a4, a6 -; RV32I-NEXT: sub a4, a4, t1 -; RV32I-NEXT: sub a4, a4, t3 -; RV32I-NEXT: sub a6, t2, t0 -; RV32I-NEXT: sub a1, a1, a7 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a2, a2, a3 +; RV32I-NEXT: sltu t1, a7, t1 +; RV32I-NEXT: srli a7, a7, 26 +; RV32I-NEXT: slli t4, a5, 6 +; RV32I-NEXT: or a7, t4, a7 +; RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: sub a5, a5, t1 +; RV32I-NEXT: sub a5, a5, t3 +; RV32I-NEXT: sub a7, t2, t0 +; RV32I-NEXT: sub a3, a3, a6 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sub a2, a2, a1 ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: sw a6, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a7, 8(a0) +; RV32I-NEXT: sw a5, 12(a0) ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli128_m63: diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index 7c3294fa81dcfe..5ffcb19f6a138d 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -1074,40 +1074,40 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .cfi_offset s4, -24 ; RV32-NEXT: .cfi_offset s5, -28 ; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: mv s2, a5 +; RV32-NEXT: mv s5, a5 ; RV32-NEXT: andi a5, a5, 1 ; RV32-NEXT: beqz a5, .LBB32_8 ; RV32-NEXT: # %bb.1: # %t ; RV32-NEXT: mv s0, a4 -; RV32-NEXT: mv s3, a3 +; RV32-NEXT: mv s2, a3 ; RV32-NEXT: mv s1, a2 -; RV32-NEXT: mv s5, a1 -; RV32-NEXT: mv s4, a0 +; RV32-NEXT: mv s4, a1 +; RV32-NEXT: mv s3, a0 ; RV32-NEXT: beq a1, a3, .LBB32_3 ; RV32-NEXT: # %bb.2: # %t -; RV32-NEXT: sltu s6, 
s5, s3 +; RV32-NEXT: sltu s6, s4, s2 ; RV32-NEXT: j .LBB32_4 ; RV32-NEXT: .LBB32_3: -; RV32-NEXT: sltu s6, s4, s1 +; RV32-NEXT: sltu s6, s3, s1 ; RV32-NEXT: .LBB32_4: # %t ; RV32-NEXT: mv a0, s6 ; RV32-NEXT: call call@plt ; RV32-NEXT: beqz s6, .LBB32_8 ; RV32-NEXT: # %bb.5: # %end -; RV32-NEXT: sltu a1, s4, s1 +; RV32-NEXT: sltu a1, s3, s1 ; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beq s5, s3, .LBB32_7 +; RV32-NEXT: beq s4, s2, .LBB32_7 ; RV32-NEXT: # %bb.6: # %end -; RV32-NEXT: sltu a0, s5, s3 +; RV32-NEXT: sltu a0, s4, s2 ; RV32-NEXT: .LBB32_7: # %end -; RV32-NEXT: sub a2, s5, s3 +; RV32-NEXT: sub a2, s4, s2 ; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: sub a1, s4, s1 +; RV32-NEXT: sub a1, s3, s1 ; RV32-NEXT: sw a1, 0(s0) ; RV32-NEXT: sw a2, 4(s0) ; RV32-NEXT: j .LBB32_9 ; RV32-NEXT: .LBB32_8: # %f -; RV32-NEXT: mv a0, s2 +; RV32-NEXT: mv a0, s5 ; RV32-NEXT: .LBB32_9: # %f ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll index 776944b177636c..a33a7ee2408faf 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll @@ -1837,16 +1837,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: sw t4, 44(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: sw t5, 40(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: sw t6, 36(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lui a6, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-NEXT: lui a7, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV32IZCMP-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) +; RV32IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -1871,7 +1871,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: lw t1, 92(a5) ; RV32IZCMP-NEXT: lw t0, 96(a5) ; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a7, 104(a5) +; RV32IZCMP-NEXT: lw a6, 104(a5) ; RV32IZCMP-NEXT: lw a4, 108(a5) ; RV32IZCMP-NEXT: lw a0, 124(a5) ; RV32IZCMP-NEXT: lw a1, 120(a5) @@ -1882,7 +1882,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: sw a2, 116(a5) ; RV32IZCMP-NEXT: sw a3, 112(a5) ; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a7, 104(a5) +; RV32IZCMP-NEXT: sw a6, 104(a5) ; RV32IZCMP-NEXT: sw s0, 100(a5) ; RV32IZCMP-NEXT: sw t0, 96(a5) ; RV32IZCMP-NEXT: sw t1, 92(a5) @@ -1908,13 +1908,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-NEXT: sw a0, 
%lo(var_test_irq+8)(a7) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV32IZCMP-NEXT: lw a0, 32(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV32IZCMP-NEXT: lw t0, 92(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t1, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t2, 84(sp) # 4-byte Folded Reload @@ -1953,16 +1953,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: sd t4, 72(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: sd t5, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: sd t6, 56(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lui a6, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-NEXT: lui a7, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV64IZCMP-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) +; RV64IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -1987,7 +1987,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: lw t1, 92(a5) ; RV64IZCMP-NEXT: lw t0, 96(a5) ; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a7, 104(a5) +; RV64IZCMP-NEXT: lw a6, 104(a5) ; RV64IZCMP-NEXT: lw a4, 108(a5) ; RV64IZCMP-NEXT: lw a0, 124(a5) ; RV64IZCMP-NEXT: lw a1, 120(a5) @@ -1998,7 +1998,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: sw a2, 116(a5) ; RV64IZCMP-NEXT: sw a3, 112(a5) ; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a7, 104(a5) +; RV64IZCMP-NEXT: sw a6, 104(a5) ; RV64IZCMP-NEXT: sw s0, 100(a5) ; RV64IZCMP-NEXT: sw t0, 96(a5) ; RV64IZCMP-NEXT: sw t1, 92(a5) @@ -2024,13 +2024,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV64IZCMP-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV64IZCMP-NEXT: ld t0, 168(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t1, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t2, 152(sp) # 8-byte Folded Reload @@ -2069,16 +2069,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: sw t4, 44(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: sw t5, 40(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: sw t6, 36(sp) # 
4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV32IZCMP-SR-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) ; RV32IZCMP-SR-NEXT: lw a0, 16(a5) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2103,7 +2103,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: lw t1, 92(a5) ; RV32IZCMP-SR-NEXT: lw t0, 96(a5) ; RV32IZCMP-SR-NEXT: lw s0, 100(a5) -; RV32IZCMP-SR-NEXT: lw a7, 104(a5) +; RV32IZCMP-SR-NEXT: lw a6, 104(a5) ; RV32IZCMP-SR-NEXT: lw a4, 108(a5) ; RV32IZCMP-SR-NEXT: lw a0, 124(a5) ; RV32IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2114,7 +2114,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: sw a2, 116(a5) ; RV32IZCMP-SR-NEXT: sw a3, 112(a5) ; RV32IZCMP-SR-NEXT: sw a4, 108(a5) -; RV32IZCMP-SR-NEXT: sw a7, 104(a5) +; RV32IZCMP-SR-NEXT: sw a6, 104(a5) ; RV32IZCMP-SR-NEXT: sw s0, 100(a5) ; RV32IZCMP-SR-NEXT: sw t0, 96(a5) ; RV32IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2140,13 +2140,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: sw a0, 16(a5) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV32IZCMP-SR-NEXT: lw a0, 32(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV32IZCMP-SR-NEXT: lw t0, 92(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t1, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t2, 84(sp) # 4-byte Folded Reload @@ -2185,16 +2185,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: sd t4, 72(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: sd t5, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: sd t6, 56(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV64IZCMP-SR-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV64IZCMP-SR-NEXT: sd a0, 
32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) ; RV64IZCMP-SR-NEXT: lw a0, 16(a5) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2219,7 +2219,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: lw t1, 92(a5) ; RV64IZCMP-SR-NEXT: lw t0, 96(a5) ; RV64IZCMP-SR-NEXT: lw s0, 100(a5) -; RV64IZCMP-SR-NEXT: lw a7, 104(a5) +; RV64IZCMP-SR-NEXT: lw a6, 104(a5) ; RV64IZCMP-SR-NEXT: lw a4, 108(a5) ; RV64IZCMP-SR-NEXT: lw a0, 124(a5) ; RV64IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2230,7 +2230,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: sw a2, 116(a5) ; RV64IZCMP-SR-NEXT: sw a3, 112(a5) ; RV64IZCMP-SR-NEXT: sw a4, 108(a5) -; RV64IZCMP-SR-NEXT: sw a7, 104(a5) +; RV64IZCMP-SR-NEXT: sw a6, 104(a5) ; RV64IZCMP-SR-NEXT: sw s0, 100(a5) ; RV64IZCMP-SR-NEXT: sw t0, 96(a5) ; RV64IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2256,13 +2256,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: sw a0, 16(a5) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV64IZCMP-SR-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV64IZCMP-SR-NEXT: ld t0, 168(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t1, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t2, 152(sp) # 8-byte Folded Reload @@ -2313,16 +2313,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: sw t4, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t5, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t6, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a6, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32I-NEXT: lui a7, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV32I-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a6, %lo(var_test_irq) +; RV32I-NEXT: addi a5, a7, %lo(var_test_irq) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -2347,7 +2347,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: lw s10, 92(a5) ; RV32I-NEXT: lw s11, 96(a5) ; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a7, 104(a5) +; RV32I-NEXT: lw a6, 104(a5) ; RV32I-NEXT: lw a4, 108(a5) ; RV32I-NEXT: lw a0, 124(a5) ; 
RV32I-NEXT: lw a1, 120(a5) @@ -2358,7 +2358,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: sw a2, 116(a5) ; RV32I-NEXT: sw a3, 112(a5) ; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a7, 104(a5) +; RV32I-NEXT: sw a6, 104(a5) ; RV32I-NEXT: sw ra, 100(a5) ; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) @@ -2384,13 +2384,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV32I-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t0, 136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t1, 132(sp) # 4-byte Folded Reload @@ -2453,16 +2453,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: sd t4, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd t5, 56(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd t6, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a6, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64I-NEXT: lui a7, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a6, %lo(var_test_irq) +; RV64I-NEXT: addi a5, a7, %lo(var_test_irq) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -2487,7 +2487,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: lw s10, 92(a5) ; RV64I-NEXT: lw s11, 96(a5) ; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a7, 104(a5) +; RV64I-NEXT: lw a6, 104(a5) ; RV64I-NEXT: lw a4, 108(a5) ; RV64I-NEXT: lw a0, 124(a5) ; RV64I-NEXT: lw a1, 120(a5) @@ -2498,7 +2498,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: sw a2, 116(a5) ; RV64I-NEXT: sw a3, 112(a5) ; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a7, 104(a5) +; RV64I-NEXT: sw a6, 104(a5) ; RV64I-NEXT: sw ra, 100(a5) ; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) @@ -2524,13 +2524,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; 
RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV64I-NEXT: ld ra, 264(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t0, 256(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t1, 248(sp) # 8-byte Folded Reload @@ -2570,16 +2570,16 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-LABEL: callee_no_irq: ; RV32IZCMP: # %bb.0: ; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-NEXT: lui a6, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-NEXT: lui a7, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) +; RV32IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -2604,7 +2604,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-NEXT: lw t1, 92(a5) ; RV32IZCMP-NEXT: lw t0, 96(a5) ; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a7, 104(a5) +; RV32IZCMP-NEXT: lw a6, 104(a5) ; RV32IZCMP-NEXT: lw a4, 108(a5) ; RV32IZCMP-NEXT: lw a0, 124(a5) ; RV32IZCMP-NEXT: lw a1, 120(a5) @@ -2615,7 +2615,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-NEXT: sw a2, 116(a5) ; RV32IZCMP-NEXT: sw a3, 112(a5) ; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a7, 104(a5) +; RV32IZCMP-NEXT: sw a6, 104(a5) ; RV32IZCMP-NEXT: sw s0, 100(a5) ; RV32IZCMP-NEXT: sw t0, 96(a5) ; RV32IZCMP-NEXT: sw t1, 92(a5) @@ -2641,28 +2641,28 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV64IZCMP-LABEL: callee_no_irq: ; RV64IZCMP: # %bb.0: ; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-NEXT: lui a6, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-NEXT: lui a7, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; 
RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) +; RV64IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -2687,7 +2687,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-NEXT: lw t1, 92(a5) ; RV64IZCMP-NEXT: lw t0, 96(a5) ; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a7, 104(a5) +; RV64IZCMP-NEXT: lw a6, 104(a5) ; RV64IZCMP-NEXT: lw a4, 108(a5) ; RV64IZCMP-NEXT: lw a0, 124(a5) ; RV64IZCMP-NEXT: lw a1, 120(a5) @@ -2698,7 +2698,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-NEXT: sw a2, 116(a5) ; RV64IZCMP-NEXT: sw a3, 112(a5) ; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a7, 104(a5) +; RV64IZCMP-NEXT: sw a6, 104(a5) ; RV64IZCMP-NEXT: sw s0, 100(a5) ; RV64IZCMP-NEXT: sw t0, 96(a5) ; RV64IZCMP-NEXT: sw t1, 92(a5) @@ -2724,28 +2724,28 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV32IZCMP-SR-LABEL: callee_no_irq: ; RV32IZCMP-SR: # %bb.0: ; RV32IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) ; RV32IZCMP-SR-NEXT: lw a0, 16(a5) ; RV32IZCMP-SR-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2770,7 +2770,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-SR-NEXT: lw t1, 92(a5) ; RV32IZCMP-SR-NEXT: lw t0, 96(a5) ; RV32IZCMP-SR-NEXT: lw s0, 100(a5) -; RV32IZCMP-SR-NEXT: lw a7, 104(a5) +; RV32IZCMP-SR-NEXT: lw a6, 104(a5) ; RV32IZCMP-SR-NEXT: lw a4, 108(a5) ; RV32IZCMP-SR-NEXT: lw a0, 124(a5) ; RV32IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2781,7 +2781,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-SR-NEXT: sw a2, 116(a5) ; RV32IZCMP-SR-NEXT: sw a3, 112(a5) ; RV32IZCMP-SR-NEXT: sw a4, 108(a5) -; RV32IZCMP-SR-NEXT: sw a7, 104(a5) +; RV32IZCMP-SR-NEXT: sw a6, 104(a5) ; RV32IZCMP-SR-NEXT: sw s0, 100(a5) ; RV32IZCMP-SR-NEXT: sw t0, 96(a5) ; 
RV32IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2807,28 +2807,28 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-SR-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: sw a0, 16(a5) ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV64IZCMP-SR-LABEL: callee_no_irq: ; RV64IZCMP-SR: # %bb.0: ; RV64IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV64IZCMP-SR-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) ; RV64IZCMP-SR-NEXT: lw a0, 16(a5) ; RV64IZCMP-SR-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2853,7 +2853,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: lw t1, 92(a5) ; RV64IZCMP-SR-NEXT: lw t0, 96(a5) ; RV64IZCMP-SR-NEXT: lw s0, 100(a5) -; RV64IZCMP-SR-NEXT: lw a7, 104(a5) +; RV64IZCMP-SR-NEXT: lw a6, 104(a5) ; RV64IZCMP-SR-NEXT: lw a4, 108(a5) ; RV64IZCMP-SR-NEXT: lw a0, 124(a5) ; RV64IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2864,7 +2864,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: sw a2, 116(a5) ; RV64IZCMP-SR-NEXT: sw a3, 112(a5) ; RV64IZCMP-SR-NEXT: sw a4, 108(a5) -; RV64IZCMP-SR-NEXT: sw a7, 104(a5) +; RV64IZCMP-SR-NEXT: sw a6, 104(a5) ; RV64IZCMP-SR-NEXT: sw s0, 100(a5) ; RV64IZCMP-SR-NEXT: sw t0, 96(a5) ; RV64IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2890,13 +2890,13 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: sw a0, 16(a5) ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV32I-LABEL: 
callee_no_irq: @@ -2915,16 +2915,16 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: sw s9, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a6, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32I-NEXT: lui a7, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a6, %lo(var_test_irq) +; RV32I-NEXT: addi a5, a7, %lo(var_test_irq) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -2949,7 +2949,7 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: lw s10, 92(a5) ; RV32I-NEXT: lw s11, 96(a5) ; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a7, 104(a5) +; RV32I-NEXT: lw a6, 104(a5) ; RV32I-NEXT: lw a4, 108(a5) ; RV32I-NEXT: lw a0, 124(a5) ; RV32I-NEXT: lw a1, 120(a5) @@ -2960,7 +2960,7 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: sw a2, 116(a5) ; RV32I-NEXT: sw a3, 112(a5) ; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a7, 104(a5) +; RV32I-NEXT: sw a6, 104(a5) ; RV32I-NEXT: sw ra, 100(a5) ; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) @@ -2986,13 +2986,13 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -3025,16 +3025,16 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: sd s9, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a6, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64I-NEXT: lui a7, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a7) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a7) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a7) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a7) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a6, %lo(var_test_irq) +; RV64I-NEXT: addi a5, a7, %lo(var_test_irq) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte 
Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -3059,7 +3059,7 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: lw s10, 92(a5) ; RV64I-NEXT: lw s11, 96(a5) ; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a7, 104(a5) +; RV64I-NEXT: lw a6, 104(a5) ; RV64I-NEXT: lw a4, 108(a5) ; RV64I-NEXT: lw a0, 124(a5) ; RV64I-NEXT: lw a1, 120(a5) @@ -3070,7 +3070,7 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: sw a2, 116(a5) ; RV64I-NEXT: sw a3, 112(a5) ; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a7, 104(a5) +; RV64I-NEXT: sw a6, 104(a5) ; RV64I-NEXT: sw ra, 100(a5) ; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) @@ -3096,13 +3096,13 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a7) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a7) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a7) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a7) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll index c15321057aeb86..e4b5008140c9c9 100644 --- a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll +++ b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll @@ -20,17 +20,13 @@ define void @last_chance_recoloring_failure() { ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 16 * vlenb ; CHECK-NEXT: li a0, 55 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vloxseg2ei32.v v16, (a0), v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill @@ -42,7 +38,7 @@ define void @last_chance_recoloring_failure() { ; CHECK-NEXT: vsetvli zero, s0, e16, m4, ta, ma ; CHECK-NEXT: vfwadd.vv v16, v8, v8, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill @@ -51,38 +47,30 @@ define void @last_chance_recoloring_failure() { ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vrgather.vv v4, v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, s0, e16, m4, ta, ma -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, 
a2, 2 ; CHECK-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwsub.wv v8, v16, v24 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwsub.wv v16, v8, v24 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu ; CHECK-NEXT: vssubu.vv v4, v4, v8, v0.t ; CHECK-NEXT: vsetvli zero, s0, e32, m8, tu, mu ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfdiv.vv v8, v16, v8, v0.t -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vfdiv.vv v16, v8, v8, v0.t +; CHECK-NEXT: vse32.v v16, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -104,10 +92,7 @@ define void @last_chance_recoloring_failure() { ; SUBREGLIVENESS-NEXT: li a0, 55 ; SUBREGLIVENESS-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; SUBREGLIVENESS-NEXT: vloxseg2ei32.v v16, (a0), v8 -; SUBREGLIVENESS-NEXT: csrr a0, vlenb -; SUBREGLIVENESS-NEXT: slli a0, a0, 3 -; SUBREGLIVENESS-NEXT: add a0, sp, a0 -; SUBREGLIVENESS-NEXT: addi a0, a0, 16 +; SUBREGLIVENESS-NEXT: addi a0, sp, 16 ; SUBREGLIVENESS-NEXT: csrr a1, vlenb ; SUBREGLIVENESS-NEXT: slli a1, a1, 2 ; SUBREGLIVENESS-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill @@ -118,30 +103,38 @@ define void @last_chance_recoloring_failure() { ; SUBREGLIVENESS-NEXT: li s0, 36 ; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e16, m4, ta, ma ; SUBREGLIVENESS-NEXT: vfwadd.vv v16, v8, v8, v0.t -; SUBREGLIVENESS-NEXT: addi a0, sp, 16 +; SUBREGLIVENESS-NEXT: csrr a0, vlenb +; SUBREGLIVENESS-NEXT: slli a0, a0, 3 +; SUBREGLIVENESS-NEXT: add a0, sp, a0 +; SUBREGLIVENESS-NEXT: addi a0, a0, 16 ; SUBREGLIVENESS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; SUBREGLIVENESS-NEXT: call func@plt ; SUBREGLIVENESS-NEXT: li a0, 32 ; SUBREGLIVENESS-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; SUBREGLIVENESS-NEXT: vrgather.vv v16, v8, v8, v0.t +; SUBREGLIVENESS-NEXT: vrgather.vv v4, v8, v8, v0.t ; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e16, m4, ta, ma +; SUBREGLIVENESS-NEXT: addi a1, sp, 16 +; SUBREGLIVENESS-NEXT: csrr a2, vlenb +; SUBREGLIVENESS-NEXT: slli a2, a2, 2 +; SUBREGLIVENESS-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; SUBREGLIVENESS-NEXT: add a1, a1, a2 +; SUBREGLIVENESS-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; SUBREGLIVENESS-NEXT: csrr a1, vlenb ; SUBREGLIVENESS-NEXT: slli a1, a1, 3 ; SUBREGLIVENESS-NEXT: add a1, sp, a1 ; SUBREGLIVENESS-NEXT: addi a1, a1, 16 -; SUBREGLIVENESS-NEXT: csrr a2, vlenb -; SUBREGLIVENESS-NEXT: slli a2, a2, 2 -; SUBREGLIVENESS-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload -; SUBREGLIVENESS-NEXT: add a1, a1, a2 -; SUBREGLIVENESS-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload -; SUBREGLIVENESS-NEXT: addi a1, sp, 16 -; SUBREGLIVENESS-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload 
-; SUBREGLIVENESS-NEXT: vfwsub.wv v8, v24, v20 +; SUBREGLIVENESS-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; SUBREGLIVENESS-NEXT: vfwsub.wv v16, v8, v24 ; SUBREGLIVENESS-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; SUBREGLIVENESS-NEXT: vssubu.vv v16, v16, v8, v0.t +; SUBREGLIVENESS-NEXT: vssubu.vv v4, v4, v8, v0.t ; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e32, m8, tu, mu -; SUBREGLIVENESS-NEXT: vfdiv.vv v8, v24, v8, v0.t -; SUBREGLIVENESS-NEXT: vse32.v v8, (a0) +; SUBREGLIVENESS-NEXT: csrr a0, vlenb +; SUBREGLIVENESS-NEXT: slli a0, a0, 3 +; SUBREGLIVENESS-NEXT: add a0, sp, a0 +; SUBREGLIVENESS-NEXT: addi a0, a0, 16 +; SUBREGLIVENESS-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; SUBREGLIVENESS-NEXT: vfdiv.vv v16, v8, v8, v0.t +; SUBREGLIVENESS-NEXT: vse32.v v16, (a0) ; SUBREGLIVENESS-NEXT: csrr a0, vlenb ; SUBREGLIVENESS-NEXT: slli a0, a0, 4 ; SUBREGLIVENESS-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll index 7698f860589aaf..8498017a5a132d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll @@ -2555,29 +2555,29 @@ define @vp_bitreverse_nxv7i64_unmasked( %va ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: vand.vx v24, v8, a4 +; RV32-NEXT: vsll.vi v0, v24, 24 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v24, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: addi a5, sp, 16 ; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v0, v8, a3 ; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 @@ -2919,29 +2919,29 @@ define @vp_bitreverse_nxv8i64_unmasked( %va ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: vand.vx v24, v8, a4 +; RV32-NEXT: vsll.vi v0, v24, 24 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v24, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: addi a5, sp, 16 ; RV32-NEXT: vl8r.v v0, (a5) # 
Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v0, v8, a3 ; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll index 6aac13a0bcbb87..2dd49cbf95f371 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll @@ -1218,29 +1218,29 @@ define @vp_bswap_nxv7i64_unmasked( %va, i32 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: vand.vx v24, v8, a4 +; RV32-NEXT: vsll.vi v0, v24, 24 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v24, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v0, v8, a3 ; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb @@ -1467,29 +1467,29 @@ define @vp_bswap_nxv8i64_unmasked( %va, i32 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: vand.vx v24, v8, a4 +; RV32-NEXT: vsll.vi v0, v24, 24 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v24, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a0) # 
Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v0, v8, a3 ; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll index 7ce167f8929736..70bf5a5e8226f6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll @@ -737,16 +737,14 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v25, v0, a2 +; CHECK-NEXT: vslidedown.vx v2, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -754,49 +752,40 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfabs.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; 
CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll index 9767ba4bbc3b7a..451463802164ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll @@ -2159,9 +2159,9 @@ define @vp_ctpop_nxv16i64( %va, @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev ; RV32-NEXT: addi a4, sp, 48 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: vand.vx v24, v8, a4 +; RV32-NEXT: vsll.vi v0, v24, 24 ; RV32-NEXT: addi a5, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v24, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: addi a5, sp, 48 ; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v0, v8, a3 ; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 @@ -2290,29 +2290,29 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev ; RV32-NEXT: addi a4, sp, 48 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: vand.vx v24, v8, a4 +; RV32-NEXT: vsll.vi v0, v24, 24 ; RV32-NEXT: addi a5, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v24, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: 
vor.vv v16, v0, v16 ; RV32-NEXT: addi a5, sp, 48 ; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v0, v8, a3 ; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll index 22061040ddbc1c..8309f04aeafe84 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll @@ -940,29 +940,29 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: vand.vx v24, v8, a4 +; RV32-NEXT: vsll.vi v0, v24, 24 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v24, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v0, v8, a3 ; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb @@ -1177,29 +1177,29 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: vand.vx v24, v8, a4 +; RV32-NEXT: vsll.vi v0, v24, 24 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v24, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: 
vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v0, v8, a3 ; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v16, v0, v16 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll index 84e07e13bf9255..e0791e7a902b8e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll @@ -763,69 +763,58 @@ declare <32 x double> @llvm.vp.ceil.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v25, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v2, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vslidedown.vi v1, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; 
CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index d47971ef5a13ca..c6a021162593da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -2148,18 +2148,18 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero @@ -4737,18 +4737,18 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: 
vand.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index 08f7e2058ad29e..5d41185658ba05 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -1524,18 +1524,18 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: vslidedown.vi v24, v0, 2 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 44(sp) @@ -1563,43 +1563,59 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a2, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v24, v8, v16, v0.t +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: 
slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 4 @@ -1608,28 +1624,31 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vadd.vv v16, v16, v8, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a2, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a2), zero +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -1638,50 +1657,82 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, 
sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 48 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index 345e4180bba31a..7a99cad401803c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -1726,16 +1726,16 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a3, 16 +; RV32-NEXT: li a2, 16 ; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB34_2 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB34_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB34_2: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: li a2, 1 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a2, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb @@ -1760,7 +1760,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 @@ -1790,7 +1790,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -1816,20 +1816,20 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> 
%va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero @@ -1838,10 +1838,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: li a2, 56 -; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 @@ -1857,10 +1857,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t @@ -1870,8 +1870,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -1885,8 +1885,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -1898,8 +1898,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -1913,8 +1913,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -1925,7 +1925,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 @@ -1952,24 +1952,24 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a2, a0 -; RV64-NEXT: bltu a0, a1, .LBB34_2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB34_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB34_2: -; RV64-NEXT: li a1, 1 -; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: li a2, 1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsub.vx v16, v8, a2, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: lui a3, 209715 ; RV64-NEXT: addiw a3, a3, 819 @@ -2006,11 +2006,11 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: vsub.vx v16, v8, a2, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: vand.vx v16, v8, a3, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -3915,16 +3915,16 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a3, 16 +; RV32-NEXT: li a2, 16 ; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB70_2 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB70_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB70_2: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: li a2, 1 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a2, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; 
RV32-NEXT: csrr a3, vlenb @@ -3949,7 +3949,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 @@ -3979,7 +3979,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -4005,20 +4005,20 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero @@ -4027,10 +4027,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: li a2, 56 -; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 @@ -4046,10 +4046,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t @@ -4059,8 +4059,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v 
v16, (a0) # Unknown-size Folded Reload @@ -4074,8 +4074,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -4087,8 +4087,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -4102,8 +4102,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -4114,7 +4114,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 @@ -4141,24 +4141,24 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a2, a0 -; RV64-NEXT: bltu a0, a1, .LBB70_2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB70_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB70_2: -; RV64-NEXT: li a1, 1 -; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: li a2, 1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsub.vx v16, v8, a2, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: lui a3, 209715 ; RV64-NEXT: addiw a3, a3, 819 @@ -4195,11 +4195,11 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: vsub.vx v16, v8, a2, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; 
RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: vand.vx v16, v8, a3, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll index 84b3e142d5aea3..ccfa103c8a12db 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll @@ -763,69 +763,58 @@ declare <32 x double> @llvm.vp.floor.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v25, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v2, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vslidedown.vi v1, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, 
ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index ea818df7329c7d..a92b3f55488348 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -241,14 +241,19 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { ; V128-NEXT: addi sp, sp, -16 ; V128-NEXT: .cfi_def_cfa_offset 16 ; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: slli a0, a0, 3 ; V128-NEXT: sub sp, sp, a0 -; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; V128-NEXT: lui a0, %hi(.LCPI10_0) ; V128-NEXT: addi a0, a0, %lo(.LCPI10_0) ; V128-NEXT: li a1, 32 ; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; V128-NEXT: vle16.v v4, (a0) +; V128-NEXT: vle16.v v24, (a0) +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: add a0, sp, a0 +; V128-NEXT: addi a0, a0, 16 +; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill ; V128-NEXT: lui a0, %hi(.LCPI10_1) ; V128-NEXT: addi a0, a0, %lo(.LCPI10_1) ; V128-NEXT: vle16.v v24, (a0) @@ -257,6 +262,11 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { ; V128-NEXT: lui a0, 699051 ; V128-NEXT: addi a0, a0, -1366 ; V128-NEXT: vmv.s.x v0, a0 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: add a0, sp, a0 +; V128-NEXT: addi a0, a0, 16 +; V128-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload ; V128-NEXT: vrgatherei16.vv v24, v8, v4 ; V128-NEXT: addi a0, sp, 16 ; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload @@ -268,7 +278,7 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 ; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: slli a0, a0, 3 ; V128-NEXT: add sp, sp, a0 ; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll index f3570495600f3c..ec1312be90d004 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll @@ -858,129 +858,129 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-NEXT: .cfi_offset s11, -52 ; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma ; RV32-NEXT: vslidedown.vi v16, v8, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s t6, v24 -; RV32-NEXT: vmv.x.s a1, v16 -; RV32-NEXT: sw a1, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: li t6, 32 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; 
RV32-NEXT: vmv.x.s s0, v24 +; RV32-NEXT: vmv.x.s a0, v16 +; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: vslidedown.vi v16, v8, 3 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s a1, v24 -; RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s a0, v24 +; RV32-NEXT: sw a0, 4(sp) # 4-byte Folded Spill ; RV32-NEXT: vmv.x.s a2, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 4 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s0, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s1, v24 ; RV32-NEXT: vmv.x.s a3, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 5 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s1, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s2, v24 ; RV32-NEXT: vmv.x.s a4, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 6 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s2, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s3, v24 ; RV32-NEXT: vmv.x.s a5, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 7 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s3, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s4, v24 ; RV32-NEXT: vmv.x.s a6, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 8 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s4, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s5, v24 ; RV32-NEXT: vmv.x.s a7, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 9 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s5, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s6, v24 ; RV32-NEXT: vmv.x.s t0, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 10 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s6, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s7, v24 ; RV32-NEXT: vmv.x.s t1, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 11 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s7, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s8, v24 ; RV32-NEXT: vmv.x.s t2, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 12 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s8, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s9, v24 ; RV32-NEXT: vmv.x.s t3, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 13 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s9, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s10, v24 ; RV32-NEXT: vmv.x.s t4, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 14 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s10, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s s11, v24 ; RV32-NEXT: vmv.x.s t5, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 15 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s11, v24 +; RV32-NEXT: vsrl.vx v24, v16, t6 +; RV32-NEXT: vmv.x.s ra, v24 ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vmv.x.s ra, v16 +; RV32-NEXT: vmv.x.s a1, v16 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vredxor.vs v8, v8, v9 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: add a1, a0, t6 +; RV32-NEXT: vsrl.vx v9, v8, t6 +; RV32-NEXT: vmv.x.s t6, v9 +; RV32-NEXT: add t6, t6, s0 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: lw t6, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: add t6, a0, t6 -; RV32-NEXT: sltu a0, t6, a0 -; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a2, t6, a2 -; RV32-NEXT: sltu a1, a2, t6 -; RV32-NEXT: add a1, a1, s0 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded 
Reload +; RV32-NEXT: add s0, a0, s0 +; RV32-NEXT: sltu a0, s0, a0 +; RV32-NEXT: add a0, t6, a0 +; RV32-NEXT: lw t6, 4(sp) # 4-byte Folded Reload +; RV32-NEXT: add a0, a0, t6 +; RV32-NEXT: add a2, s0, a2 +; RV32-NEXT: sltu t6, a2, s0 +; RV32-NEXT: add t6, t6, s1 +; RV32-NEXT: add a0, a0, t6 ; RV32-NEXT: add a3, a2, a3 -; RV32-NEXT: sltu a1, a3, a2 -; RV32-NEXT: add a1, a1, s1 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, a3, a2 +; RV32-NEXT: add a2, a2, s2 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add a4, a3, a4 -; RV32-NEXT: sltu a1, a4, a3 -; RV32-NEXT: add a1, a1, s2 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, a4, a3 +; RV32-NEXT: add a2, a2, s3 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add a5, a4, a5 -; RV32-NEXT: sltu a1, a5, a4 -; RV32-NEXT: add a1, a1, s3 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, a5, a4 +; RV32-NEXT: add a2, a2, s4 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add a6, a5, a6 -; RV32-NEXT: sltu a1, a6, a5 -; RV32-NEXT: add a1, a1, s4 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, a6, a5 +; RV32-NEXT: add a2, a2, s5 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add a7, a6, a7 -; RV32-NEXT: sltu a1, a7, a6 -; RV32-NEXT: add a1, a1, s5 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, a7, a6 +; RV32-NEXT: add a2, a2, s6 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t0, a7, t0 -; RV32-NEXT: sltu a1, t0, a7 -; RV32-NEXT: add a1, a1, s6 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t0, a7 +; RV32-NEXT: add a2, a2, s7 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t1, t0, t1 -; RV32-NEXT: sltu a1, t1, t0 -; RV32-NEXT: add a1, a1, s7 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t1, t0 +; RV32-NEXT: add a2, a2, s8 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t2, t1, t2 -; RV32-NEXT: sltu a1, t2, t1 -; RV32-NEXT: add a1, a1, s8 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t2, t1 +; RV32-NEXT: add a2, a2, s9 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t3, t2, t3 -; RV32-NEXT: sltu a1, t3, t2 -; RV32-NEXT: add a1, a1, s9 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t3, t2 +; RV32-NEXT: add a2, a2, s10 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t4, t3, t4 -; RV32-NEXT: sltu a1, t4, t3 -; RV32-NEXT: add a1, a1, s10 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t4, t3 +; RV32-NEXT: add a2, a2, s11 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t5, t4, t5 -; RV32-NEXT: sltu a1, t5, t4 -; RV32-NEXT: add a1, a1, s11 -; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: add a0, t5, ra -; RV32-NEXT: sltu a2, a0, t5 -; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sltu a2, t5, t4 +; RV32-NEXT: add a2, a2, ra +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: add a0, t5, a1 +; RV32-NEXT: sltu a1, a0, t5 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index a54fa2e9b765fe..7bd5447221695e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -400,14 +400,19 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-NEXT: addi sp, sp, -16 ; V128-NEXT: .cfi_def_cfa_offset 16 ; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: slli a0, a0, 3 ; V128-NEXT: sub sp, sp, a0 -; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 
0x1e, 0x22 # sp + 16 + 4 * vlenb +; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; V128-NEXT: lui a0, %hi(.LCPI17_0) ; V128-NEXT: addi a0, a0, %lo(.LCPI17_0) ; V128-NEXT: li a1, 32 ; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; V128-NEXT: vle16.v v4, (a0) +; V128-NEXT: vle16.v v24, (a0) +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: add a0, sp, a0 +; V128-NEXT: addi a0, a0, 16 +; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill ; V128-NEXT: lui a0, %hi(.LCPI17_1) ; V128-NEXT: addi a0, a0, %lo(.LCPI17_1) ; V128-NEXT: vle16.v v24, (a0) @@ -416,6 +421,11 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-NEXT: lui a0, 699051 ; V128-NEXT: addi a0, a0, -1366 ; V128-NEXT: vmv.s.x v0, a0 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: add a0, sp, a0 +; V128-NEXT: addi a0, a0, 16 +; V128-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload ; V128-NEXT: vrgatherei16.vv v24, v8, v4 ; V128-NEXT: addi a0, sp, 16 ; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload @@ -427,7 +437,7 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 ; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: slli a0, a0, 3 ; V128-NEXT: add sp, sp, a0 ; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index a56a81f5f793bc..597cf2cabd3cb3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -177,7 +177,6 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) { ; ; RV64-LABEL: vrgather_shuffle_vv_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vmv4r.v v16, v8 ; RV64-NEXT: lui a0, 327683 ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: addi a0, a0, 1 @@ -186,7 +185,7 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) { ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vmv.v.x v20, a0 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgatherei16.vv v8, v16, v20 +; RV64-NEXT: vrgatherei16.vv v16, v8, v20 ; RV64-NEXT: li a0, 164 ; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: lui a0, 163841 @@ -194,9 +193,10 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) { ; RV64-NEXT: addi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 17 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v16, a0 +; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v12, v16, v0.t +; RV64-NEXT: vrgatherei16.vv v16, v12, v8, v0.t +; RV64-NEXT: vmv.v.v v8, v16 ; RV64-NEXT: ret %s = shufflevector <8 x i64> %x, <8 x i64> %y, <8 x i32> ret <8 x i64> %s diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index eeb8e517d01d2d..d43808599c4038 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -131,65 +131,55 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 62 +; RV32-NEXT: li a3, 58 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, 
sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 62 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 58 * vlenb ; RV32-NEXT: addi a3, a1, 128 ; RV32-NEXT: addi a4, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 29 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: slli a5, a4, 5 +; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vid.v v10 +; RV32-NEXT: vid.v v24 +; RV32-NEXT: vadd.vi v8, v24, -4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a5, a4, 3 ; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs2r.v v10, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vadd.vi v8, v10, -4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 13 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 21 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: slli a5, a4, 4 +; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v12, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v10, -10 +; RV32-NEXT: vadd.vi v2, v24, -10 ; RV32-NEXT: lui a4, 12 ; RV32-NEXT: vmv.s.x v0, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 45 +; RV32-NEXT: li a5, 49 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; RV32-NEXT: vmv1r.v v26, v0 +; RV32-NEXT: vrgatherei16.vv v12, v16, v2, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 25 +; RV32-NEXT: li a5, 21 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -201,206 +191,180 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a5, a5, %lo(.LCPI6_1) ; RV32-NEXT: lui a6, 1 ; RV32-NEXT: vle16.v v8, (a4) -; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle16.v v8, (a5) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 2 +; RV32-NEXT: li a5, 13 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vle32.v v0, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 37 +; RV32-NEXT: li a4, 41 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vle32.v v24, (a3) +; RV32-NEXT: 
vs8r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle32.v v16, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, a6, -64 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vmv.s.x v27, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: slli a3, a1, 2 ; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 +; RV32-NEXT: vs1r.v v27, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v0, v28 +; RV32-NEXT: vmv1r.v v0, v27 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v10, -2 +; RV32-NEXT: vadd.vi v8, v24, -2 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v10, -8 -; RV32-NEXT: vmv2r.v v30, v10 +; RV32-NEXT: vadd.vi v8, v24, -8 +; RV32-NEXT: vmv2r.v v2, v24 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vmv1r.v v1, v26 +; RV32-NEXT: vmv1r.v v0, v26 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv1r.v v0, v28 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v24, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: vsetvli zero, a2, e32, 
m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v8, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: slli a3, a1, 2 ; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: vmv.v.v v12, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v4, v16, v8 +; RV32-NEXT: vrgatherei16.vv v12, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v30, -6 +; RV32-NEXT: vadd.vi v8, v2, -6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: vmv1r.v v0, v28 -; RV32-NEXT: vmv1r.v v2, v28 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v4, v16, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; RV32-NEXT: vmv.v.v v4, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) -; RV32-NEXT: vle16.v v20, (a1) -; RV32-NEXT: vle16.v v8, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: 
add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v12, (a3) ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v1, a1 +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v8, v20 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v12, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v4, v24 +; RV32-NEXT: vmv.v.v v4, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: slli a3, a1, 2 ; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -408,200 +372,197 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8 -; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: vrgatherei16.vv v20, v16, v12 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v4, v12 +; RV32-NEXT: vl2r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v20, v8, v12, v0.t +; RV32-NEXT: vmv.v.v v8, v20 ; RV32-NEXT: lui a1, %hi(.LCPI6_8) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_9) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_9) -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v20, (a3) +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v16, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, 
sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v0, v12 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v12, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v4, v8 +; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: lui a1, 15 -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v1, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v20, v16, v10 +; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v12, v24, v8 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v20, v24, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v12, v24, v16, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) -; RV32-NEXT: vle16.v v24, (a1) -; RV32-NEXT: vle16.v v16, (a3) +; RV32-NEXT: vle16.v v4, (a1) +; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: li a1, 1008 -; RV32-NEXT: vmv.s.x v28, a1 +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v28, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; 
RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 -; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: vmv.v.v v12, v16 ; RV32-NEXT: lui a1, %hi(.LCPI6_13) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_14) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_14) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a2, %hi(.LCPI6_15) ; RV32-NEXT: addi a2, a2, %lo(.LCPI6_15) ; RV32-NEXT: vle16.v v24, (a1) -; RV32-NEXT: vle16.v v8, (a2) +; RV32-NEXT: vle16.v v16, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 45 +; RV32-NEXT: li a2, 49 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 37 +; RV32-NEXT: li a2, 41 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v16, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 29 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a2, a1, 5 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 53 +; RV32-NEXT: li a2, 25 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 45 +; RV32-NEXT: li a2, 49 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV32-NEXT: vrgatherei16.vv v16, v24, v4, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v 
v16, v8 +; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v16, (a1) +; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v20, (a1) +; RV32-NEXT: vse32.v v12, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a3, a2, 3 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 3 +; RV32-NEXT: slli a3, a2, 2 ; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 @@ -609,21 +570,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 4 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 25 +; RV32-NEXT: li a2, 21 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 62 +; RV32-NEXT: li a1, 58 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 @@ -634,15 +595,15 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 52 +; RV64-NEXT: li a3, 54 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x34, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 52 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x36, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 54 * vlenb ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: addi a2, a1, 256 ; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 27 +; RV64-NEXT: li a3, 29 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -650,312 +611,295 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a2, a1, 128 ; RV64-NEXT: vle64.v v8, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 35 +; RV64-NEXT: li a3, 37 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 43 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgather.vi v8, v16, 4 +; RV64-NEXT: vrgather.vi v12, v16, 4 ; RV64-NEXT: li a1, 128 -; RV64-NEXT: vmv.s.x v4, a1 +; RV64-NEXT: vmv.s.x v8, a1 ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v16, 8 +; RV64-NEXT: vslidedown.vi v16, v16, 8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 19 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, 
a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vmv1r.v v0, v4 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vrgather.vi v12, v16, 2, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: slli a2, a1, 4 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs1r.v v4, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vrgather.vi v8, v24, 2, v0.t -; RV64-NEXT: vmv.v.v v20, v8 +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: li a1, 6 -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vmul.vx v2, v8, a1 +; RV64-NEXT: vid.v v10 +; RV64-NEXT: vmul.vx v14, v10, a1 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 43 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v2 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v16, v0, v14 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: li a1, 56 -; RV64-NEXT: vmv.s.x v1, a1 -; RV64-NEXT: vadd.vi v16, v2, -16 +; RV64-NEXT: vmv.s.x v9, a1 +; RV64-NEXT: vadd.vi v10, v14, -16 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmv1r.v v0, v9 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 35 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV64-NEXT: vrgatherei16.vv v16, v24, v10, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v20, v8 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a2, a1, 4 -; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v20, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 27 +; RV64-NEXT: li a2, 29 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgather.vi v8, v16, 5 -; RV64-NEXT: vmv1r.v v0, v4 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v16, v24, 5 +; RV64-NEXT: vmv1r.v v13, v8 +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 19 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgather.vi v8, v16, 3, v0.t -; RV64-NEXT: vmv.v.v v4, v8 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v16, v24, 3, v0.t +; RV64-NEXT: vmv.v.v v4, v16 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs2r.v v2, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vadd.vi v24, v2, 1 +; 
RV64-NEXT: vadd.vi v10, v14, 1 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 43 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v24 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v16, v24, v10 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v24, v2, -15 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 11 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs2r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v10, v14, -15 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmv1r.v v0, v9 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 35 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 11 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v2, v0.t +; RV64-NEXT: vrgatherei16.vv v16, v24, v10, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v4, v8 +; RV64-NEXT: vmv.v.v v4, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 11 +; RV64-NEXT: li a2, 13 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vadd.vi v4, v2, 2 +; RV64-NEXT: vadd.vi v10, v14, 2 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v8, v16, v4 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 45 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v16, v0, v10 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: li a1, 24 -; RV64-NEXT: vmv.s.x v4, a1 -; RV64-NEXT: vadd.vi v16, v2, -14 +; RV64-NEXT: vmv.s.x v9, a1 +; RV64-NEXT: vadd.vi v10, v14, -14 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v4 -; RV64-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV64-NEXT: vmv1r.v v0, v9 +; RV64-NEXT: vrgatherei16.vv v16, v24, v10, v0.t ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.i v12, 6 +; RV64-NEXT: vmv.v.i v10, 6 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 27 +; RV64-NEXT: li a2, 29 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv4r.v v24, v16 -; RV64-NEXT: vrgatherei16.vv v16, v24, v12 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v20, v24, v10 +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 19 +; RV64-NEXT: li a2, 21 ; 
RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgather.vi v16, v24, 4, v0.t +; RV64-NEXT: vrgather.vi v20, v24, 4, v0.t ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v16, v8 +; RV64-NEXT: vmv.v.v v20, v16 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v28, v2, 3 +; RV64-NEXT: vadd.vi v10, v14, 3 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 43 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v28 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v0, v10 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v16, v2, -13 +; RV64-NEXT: vadd.vi v10, v14, -13 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v4 +; RV64-NEXT: vmv1r.v v0, v9 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 35 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v16, v10, v0.t ; RV64-NEXT: lui a1, 16 ; RV64-NEXT: addi a1, a1, 7 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 27 +; RV64-NEXT: li a2, 29 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v24, v16, v12 -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmv4r.v v8, v16 +; RV64-NEXT: vrgatherei16.vv v28, v16, v12 +; RV64-NEXT: vmv1r.v v0, v13 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 19 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgather.vi v24, v16, 5, v0.t +; RV64-NEXT: vrgather.vi v28, v16, 5, v0.t ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v24, v8 +; RV64-NEXT: vmv.v.v v28, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: slli a2, a1, 2 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, 96 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vmv.v.x v13, a1 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: li a1, 192 -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 27 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, 
a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgather.vi v4, v24, 2 -; RV64-NEXT: vrgatherei16.vv v4, v16, v8, v0.t +; RV64-NEXT: vmv.s.x v12, a1 +; RV64-NEXT: vrgather.vi v20, v8, 2 +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vrgatherei16.vv v20, v16, v13, v0.t +; RV64-NEXT: vmv.v.v v8, v20 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v26, v2, 4 +; RV64-NEXT: vadd.vi v24, v14, 4 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 43 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v26 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v16, v0, v24 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: li a1, 28 -; RV64-NEXT: vmv.s.x v1, a1 -; RV64-NEXT: vadd.vi v16, v2, -12 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v2, v14, -12 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 35 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV64-NEXT: vrgatherei16.vv v16, v24, v2, v0.t ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v4, v8 +; RV64-NEXT: vmv.v.v v8, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, 112 ; RV64-NEXT: addi a1, a1, 1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: vmv.v.x v13, a1 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 27 +; RV64-NEXT: li a2, 29 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vi v8, v16, 3 +; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 19 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v12, v0.t +; RV64-NEXT: vrgatherei16.vv v8, v16, v13, v0.t ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v12, v2, 5 +; RV64-NEXT: vadd.vi v12, v14, 5 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 43 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v16, v24, v12 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v16, v0, v12 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v12, v2, -11 +; RV64-NEXT: vadd.vi v12, v14, -11 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: 
addi a1, sp, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 35 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -967,10 +911,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 256 -; RV64-NEXT: vse64.v v4, (a1) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 192 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 1 +; RV64-NEXT: slli a3, a2, 2 ; RV64-NEXT: add a2, a3, a2 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -979,14 +927,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: slli a3, a2, 3 -; RV64-NEXT: sub a2, a3, a2 +; RV64-NEXT: add a2, a3, a2 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 64 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 11 +; RV64-NEXT: li a3, 13 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -994,13 +942,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a2, a1, 4 -; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 52 +; RV64-NEXT: li a1, 54 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 6ee0e4525f5ec7..20656d82260c3c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -3691,42 +3691,42 @@ define <4 x i64> @mgather_v4i64(<4 x ptr> %ptrs, <4 x i1> %m, <4 x i64> %passthr ; RV32ZVE32F-LABEL: mgather_v4i64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a6, v0 -; RV32ZVE32F-NEXT: andi a2, a6, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a2, a4, 1 ; RV32ZVE32F-NEXT: beqz a2, .LBB44_5 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a2, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, a6, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB44_6 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB44_6 ; RV32ZVE32F-NEXT: .LBB44_2: -; RV32ZVE32F-NEXT: lw a4, 12(a1) -; RV32ZVE32F-NEXT: lw a5, 8(a1) -; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: lw a5, 12(a1) +; RV32ZVE32F-NEXT: lw a6, 8(a1) +; RV32ZVE32F-NEXT: andi a7, a4, 4 ; RV32ZVE32F-NEXT: bnez a7, .LBB44_7 ; RV32ZVE32F-NEXT: .LBB44_3: ; RV32ZVE32F-NEXT: lw a7, 20(a1) ; RV32ZVE32F-NEXT: lw t0, 16(a1) -; RV32ZVE32F-NEXT: andi a6, a6, 8 -; RV32ZVE32F-NEXT: bnez a6, .LBB44_8 +; RV32ZVE32F-NEXT: andi a4, a4, 8 +; RV32ZVE32F-NEXT: bnez a4, .LBB44_8 ; 
RV32ZVE32F-NEXT: .LBB44_4: -; RV32ZVE32F-NEXT: lw a6, 28(a1) +; RV32ZVE32F-NEXT: lw a4, 28(a1) ; RV32ZVE32F-NEXT: lw a1, 24(a1) ; RV32ZVE32F-NEXT: j .LBB44_9 ; RV32ZVE32F-NEXT: .LBB44_5: ; RV32ZVE32F-NEXT: lw a2, 4(a1) ; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: andi a4, a6, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB44_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB44_2 ; RV32ZVE32F-NEXT: .LBB44_6: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v9 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: vmv.x.s a6, v9 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 ; RV32ZVE32F-NEXT: beqz a7, .LBB44_3 ; RV32ZVE32F-NEXT: .LBB44_7: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -3734,67 +3734,67 @@ define <4 x i64> @mgather_v4i64(<4 x ptr> %ptrs, <4 x i1> %m, <4 x i64> %passthr ; RV32ZVE32F-NEXT: vmv.x.s t0, v9 ; RV32ZVE32F-NEXT: lw a7, 4(t0) ; RV32ZVE32F-NEXT: lw t0, 0(t0) -; RV32ZVE32F-NEXT: andi a6, a6, 8 -; RV32ZVE32F-NEXT: beqz a6, .LBB44_4 +; RV32ZVE32F-NEXT: andi a4, a4, 8 +; RV32ZVE32F-NEXT: beqz a4, .LBB44_4 ; RV32ZVE32F-NEXT: .LBB44_8: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lw a6, 4(a1) +; RV32ZVE32F-NEXT: lw a4, 4(a1) ; RV32ZVE32F-NEXT: lw a1, 0(a1) ; RV32ZVE32F-NEXT: .LBB44_9: # %else8 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) ; RV32ZVE32F-NEXT: sw t0, 16(a0) ; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw a1, 24(a0) -; RV32ZVE32F-NEXT: sw a6, 28(a0) +; RV32ZVE32F-NEXT: sw a4, 28(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v4i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB44_5 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB44_5 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: ld a3, 0(a1) -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB44_6 +; RV64ZVE32F-NEXT: ld a4, 0(a1) +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB44_6 ; RV64ZVE32F-NEXT: .LBB44_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: ld a5, 8(a2) +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: bnez a6, .LBB44_7 ; RV64ZVE32F-NEXT: .LBB44_3: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a5, a5, 8 -; RV64ZVE32F-NEXT: bnez a5, .LBB44_8 +; RV64ZVE32F-NEXT: andi a3, a3, 8 +; RV64ZVE32F-NEXT: bnez a3, .LBB44_8 ; RV64ZVE32F-NEXT: .LBB44_4: ; RV64ZVE32F-NEXT: ld a1, 24(a2) ; RV64ZVE32F-NEXT: j .LBB44_9 ; RV64ZVE32F-NEXT: .LBB44_5: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB44_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB44_2 ; RV64ZVE32F-NEXT: .LBB44_6: # %cond.load1 -; RV64ZVE32F-NEXT: ld a4, 8(a1) -; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: ld a5, 8(a1) 
+; RV64ZVE32F-NEXT: ld a5, 0(a5) +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB44_3 ; RV64ZVE32F-NEXT: .LBB44_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a6, 16(a1) ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a5, a5, 8 -; RV64ZVE32F-NEXT: beqz a5, .LBB44_4 +; RV64ZVE32F-NEXT: andi a3, a3, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB44_4 ; RV64ZVE32F-NEXT: .LBB44_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a1, 24(a1) ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB44_9: # %else8 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a1, 24(a0) ; RV64ZVE32F-NEXT: ret @@ -3820,41 +3820,41 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) { ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmset.m v9 -; RV32ZVE32F-NEXT: vmv.x.s a6, v9 +; RV32ZVE32F-NEXT: vmv.x.s a4, v9 ; RV32ZVE32F-NEXT: bnez zero, .LBB45_5 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a2, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, a6, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB45_6 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB45_6 ; RV32ZVE32F-NEXT: .LBB45_2: -; RV32ZVE32F-NEXT: lw a4, 12(a1) -; RV32ZVE32F-NEXT: lw a5, 8(a1) -; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: lw a5, 12(a1) +; RV32ZVE32F-NEXT: lw a6, 8(a1) +; RV32ZVE32F-NEXT: andi a7, a4, 4 ; RV32ZVE32F-NEXT: bnez a7, .LBB45_7 ; RV32ZVE32F-NEXT: .LBB45_3: ; RV32ZVE32F-NEXT: lw a7, 20(a1) ; RV32ZVE32F-NEXT: lw t0, 16(a1) -; RV32ZVE32F-NEXT: andi a6, a6, 8 -; RV32ZVE32F-NEXT: bnez a6, .LBB45_8 +; RV32ZVE32F-NEXT: andi a4, a4, 8 +; RV32ZVE32F-NEXT: bnez a4, .LBB45_8 ; RV32ZVE32F-NEXT: .LBB45_4: -; RV32ZVE32F-NEXT: lw a6, 28(a1) +; RV32ZVE32F-NEXT: lw a4, 28(a1) ; RV32ZVE32F-NEXT: lw a1, 24(a1) ; RV32ZVE32F-NEXT: j .LBB45_9 ; RV32ZVE32F-NEXT: .LBB45_5: ; RV32ZVE32F-NEXT: lw a2, 4(a1) ; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: andi a4, a6, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB45_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB45_2 ; RV32ZVE32F-NEXT: .LBB45_6: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v9 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: vmv.x.s a6, v9 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 ; RV32ZVE32F-NEXT: beqz a7, .LBB45_3 ; RV32ZVE32F-NEXT: .LBB45_7: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -3862,67 +3862,67 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) { ; RV32ZVE32F-NEXT: vmv.x.s t0, v9 ; RV32ZVE32F-NEXT: lw a7, 4(t0) ; RV32ZVE32F-NEXT: lw t0, 0(t0) -; RV32ZVE32F-NEXT: andi a6, a6, 8 -; RV32ZVE32F-NEXT: beqz a6, .LBB45_4 +; RV32ZVE32F-NEXT: andi a4, a4, 8 +; RV32ZVE32F-NEXT: beqz a4, .LBB45_4 ; RV32ZVE32F-NEXT: .LBB45_8: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lw a6, 4(a1) +; RV32ZVE32F-NEXT: lw a4, 4(a1) ; RV32ZVE32F-NEXT: lw a1, 0(a1) ; RV32ZVE32F-NEXT: .LBB45_9: # %else8 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 
4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) ; RV32ZVE32F-NEXT: sw t0, 16(a0) ; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw a1, 24(a0) -; RV32ZVE32F-NEXT: sw a6, 28(a0) +; RV32ZVE32F-NEXT: sw a4, 28(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmset.m v8 -; RV64ZVE32F-NEXT: vmv.x.s a5, v8 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 ; RV64ZVE32F-NEXT: bnez zero, .LBB45_5 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a3, 0(a1) ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB45_6 +; RV64ZVE32F-NEXT: andi a5, a4, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB45_6 ; RV64ZVE32F-NEXT: .LBB45_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: ld a5, 8(a2) +; RV64ZVE32F-NEXT: andi a6, a4, 4 ; RV64ZVE32F-NEXT: bnez a6, .LBB45_7 ; RV64ZVE32F-NEXT: .LBB45_3: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a5, a5, 8 -; RV64ZVE32F-NEXT: bnez a5, .LBB45_8 +; RV64ZVE32F-NEXT: andi a4, a4, 8 +; RV64ZVE32F-NEXT: bnez a4, .LBB45_8 ; RV64ZVE32F-NEXT: .LBB45_4: ; RV64ZVE32F-NEXT: ld a1, 24(a2) ; RV64ZVE32F-NEXT: j .LBB45_9 ; RV64ZVE32F-NEXT: .LBB45_5: ; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB45_2 +; RV64ZVE32F-NEXT: andi a5, a4, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB45_2 ; RV64ZVE32F-NEXT: .LBB45_6: # %cond.load1 -; RV64ZVE32F-NEXT: ld a4, 8(a1) -; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: ld a5, 8(a1) +; RV64ZVE32F-NEXT: ld a5, 0(a5) +; RV64ZVE32F-NEXT: andi a6, a4, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB45_3 ; RV64ZVE32F-NEXT: .LBB45_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a6, 16(a1) ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a5, a5, 8 -; RV64ZVE32F-NEXT: beqz a5, .LBB45_4 +; RV64ZVE32F-NEXT: andi a4, a4, 8 +; RV64ZVE32F-NEXT: beqz a4, .LBB45_4 ; RV64ZVE32F-NEXT: .LBB45_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a1, 24(a1) ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB45_9: # %else8 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a1, 24(a0) ; RV64ZVE32F-NEXT: ret @@ -3998,35 +3998,35 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; RV32ZVE32F-LABEL: mgather_v8i64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a2, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a2, a4, 1 ; RV32ZVE32F-NEXT: beqz a2, .LBB47_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a2, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB47_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB47_8 ; RV32ZVE32F-NEXT: .LBB47_2: -; RV32ZVE32F-NEXT: lw a4, 12(a1) -; RV32ZVE32F-NEXT: lw a5, 8(a1) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB47_9 +; RV32ZVE32F-NEXT: lw a5, 12(a1) +; RV32ZVE32F-NEXT: lw a6, 8(a1) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB47_9 ; RV32ZVE32F-NEXT: .LBB47_3: -; RV32ZVE32F-NEXT: lw a6, 20(a1) -; RV32ZVE32F-NEXT: lw a7, 
16(a1) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a1) +; RV32ZVE32F-NEXT: lw t0, 16(a1) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB47_10 ; RV32ZVE32F-NEXT: .LBB47_4: ; RV32ZVE32F-NEXT: lw t1, 28(a1) ; RV32ZVE32F-NEXT: lw t2, 24(a1) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB47_11 ; RV32ZVE32F-NEXT: .LBB47_5: ; RV32ZVE32F-NEXT: lw t3, 36(a1) ; RV32ZVE32F-NEXT: lw t4, 32(a1) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB47_12 ; RV32ZVE32F-NEXT: .LBB47_6: ; RV32ZVE32F-NEXT: lw t5, 44(a1) @@ -4035,23 +4035,23 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; RV32ZVE32F-NEXT: .LBB47_7: ; RV32ZVE32F-NEXT: lw a2, 4(a1) ; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB47_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB47_2 ; RV32ZVE32F-NEXT: .LBB47_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB47_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB47_3 ; RV32ZVE32F-NEXT: .LBB47_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB47_4 ; RV32ZVE32F-NEXT: .LBB47_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -4059,7 +4059,7 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB47_5 ; RV32ZVE32F-NEXT: .LBB47_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4067,7 +4067,7 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB47_6 ; RV32ZVE32F-NEXT: .LBB47_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4082,7 +4082,7 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB47_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4090,30 +4090,30 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; 
RV32ZVE32F-NEXT: bnez t0, .LBB47_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB47_17 ; RV32ZVE32F-NEXT: .LBB47_15: -; RV32ZVE32F-NEXT: lw t0, 60(a1) +; RV32ZVE32F-NEXT: lw a4, 60(a1) ; RV32ZVE32F-NEXT: lw a1, 56(a1) ; RV32ZVE32F-NEXT: j .LBB47_18 ; RV32ZVE32F-NEXT: .LBB47_16: ; RV32ZVE32F-NEXT: lw s0, 52(a1) ; RV32ZVE32F-NEXT: lw s1, 48(a1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB47_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB47_15 ; RV32ZVE32F-NEXT: .LBB47_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a1) +; RV32ZVE32F-NEXT: lw a4, 4(a1) ; RV32ZVE32F-NEXT: lw a1, 0(a1) ; RV32ZVE32F-NEXT: .LBB47_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -4123,7 +4123,7 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a1, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -4132,82 +4132,82 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; RV64ZVE32F-LABEL: mgather_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a3, a6, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB47_9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB47_9 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: ld a3, 0(a1) -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a6, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB47_10 +; RV64ZVE32F-NEXT: ld a4, 0(a1) +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB47_10 ; RV64ZVE32F-NEXT: .LBB47_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: andi a5, a6, 4 -; RV64ZVE32F-NEXT: bnez a5, .LBB47_11 +; RV64ZVE32F-NEXT: ld a5, 8(a2) +; RV64ZVE32F-NEXT: andi a6, a3, 4 +; RV64ZVE32F-NEXT: bnez a6, .LBB47_11 ; RV64ZVE32F-NEXT: .LBB47_3: -; RV64ZVE32F-NEXT: ld a5, 16(a2) -; RV64ZVE32F-NEXT: andi a7, a6, 8 +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB47_12 ; RV64ZVE32F-NEXT: .LBB47_4: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a6, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB47_13 ; RV64ZVE32F-NEXT: .LBB47_5: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a6, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB47_14 ; RV64ZVE32F-NEXT: .LBB47_6: ; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: andi t2, a6, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: bnez t2, .LBB47_15 ; RV64ZVE32F-NEXT: .LBB47_7: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB47_16 +; 
RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB47_16 ; RV64ZVE32F-NEXT: .LBB47_8: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB47_17 ; RV64ZVE32F-NEXT: .LBB47_9: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a6, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB47_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB47_2 ; RV64ZVE32F-NEXT: .LBB47_10: # %cond.load1 -; RV64ZVE32F-NEXT: ld a4, 8(a1) -; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: andi a5, a6, 4 -; RV64ZVE32F-NEXT: beqz a5, .LBB47_3 -; RV64ZVE32F-NEXT: .LBB47_11: # %cond.load4 -; RV64ZVE32F-NEXT: ld a5, 16(a1) +; RV64ZVE32F-NEXT: ld a5, 8(a1) ; RV64ZVE32F-NEXT: ld a5, 0(a5) -; RV64ZVE32F-NEXT: andi a7, a6, 8 +; RV64ZVE32F-NEXT: andi a6, a3, 4 +; RV64ZVE32F-NEXT: beqz a6, .LBB47_3 +; RV64ZVE32F-NEXT: .LBB47_11: # %cond.load4 +; RV64ZVE32F-NEXT: ld a6, 16(a1) +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB47_4 ; RV64ZVE32F-NEXT: .LBB47_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a7, 24(a1) ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a6, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB47_5 ; RV64ZVE32F-NEXT: .LBB47_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld t0, 32(a1) ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a6, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB47_6 ; RV64ZVE32F-NEXT: .LBB47_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld t1, 40(a1) ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: andi t2, a6, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB47_7 ; RV64ZVE32F-NEXT: .LBB47_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld t2, 48(a1) ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB47_8 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB47_8 ; RV64ZVE32F-NEXT: .LBB47_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a1, 56(a1) ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB47_17: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) -; RV64ZVE32F-NEXT: sd a5, 16(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) ; RV64ZVE32F-NEXT: sd t1, 40(a0) @@ -4245,35 +4245,35 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB48_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB48_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB48_8 ; RV32ZVE32F-NEXT: .LBB48_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB48_9 +; RV32ZVE32F-NEXT: lw a5, 12(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB48_9 ; RV32ZVE32F-NEXT: .LBB48_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; 
RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a2) +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB48_10 ; RV32ZVE32F-NEXT: .LBB48_4: ; RV32ZVE32F-NEXT: lw t1, 28(a2) ; RV32ZVE32F-NEXT: lw t2, 24(a2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB48_11 ; RV32ZVE32F-NEXT: .LBB48_5: ; RV32ZVE32F-NEXT: lw t3, 36(a2) ; RV32ZVE32F-NEXT: lw t4, 32(a2) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB48_12 ; RV32ZVE32F-NEXT: .LBB48_6: ; RV32ZVE32F-NEXT: lw t5, 44(a2) @@ -4282,23 +4282,23 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: .LBB48_7: ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB48_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB48_2 ; RV32ZVE32F-NEXT: .LBB48_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB48_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB48_3 ; RV32ZVE32F-NEXT: .LBB48_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB48_4 ; RV32ZVE32F-NEXT: .LBB48_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -4306,7 +4306,7 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB48_5 ; RV32ZVE32F-NEXT: .LBB48_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4314,7 +4314,7 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB48_6 ; RV32ZVE32F-NEXT: .LBB48_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4329,7 +4329,7 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB48_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4337,30 +4337,30 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez 
t0, .LBB48_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB48_17 ; RV32ZVE32F-NEXT: .LBB48_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a4, 60(a2) ; RV32ZVE32F-NEXT: lw a2, 56(a2) ; RV32ZVE32F-NEXT: j .LBB48_18 ; RV32ZVE32F-NEXT: .LBB48_16: ; RV32ZVE32F-NEXT: lw s0, 52(a2) ; RV32ZVE32F-NEXT: lw s1, 48(a2) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB48_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB48_15 ; RV32ZVE32F-NEXT: .LBB48_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a4, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB48_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -4370,7 +4370,7 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -4379,35 +4379,35 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB48_3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB48_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB48_4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB48_4 ; RV64ZVE32F-NEXT: .LBB48_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: j .LBB48_5 ; RV64ZVE32F-NEXT: .LBB48_3: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB48_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB48_2 ; RV64ZVE32F-NEXT: .LBB48_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a5, v9 +; RV64ZVE32F-NEXT: slli a5, a5, 3 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) ; RV64ZVE32F-NEXT: .LBB48_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; 
RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB48_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 @@ -4415,22 +4415,22 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB48_11 ; RV64ZVE32F-NEXT: .LBB48_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB48_12 ; RV64ZVE32F-NEXT: .LBB48_8: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB48_13 ; RV64ZVE32F-NEXT: .LBB48_9: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB48_14 ; RV64ZVE32F-NEXT: .LBB48_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB48_7 ; RV64ZVE32F-NEXT: .LBB48_11: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -4438,14 +4438,14 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB48_8 ; RV64ZVE32F-NEXT: .LBB48_12: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB48_9 ; RV64ZVE32F-NEXT: .LBB48_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 @@ -4454,7 +4454,7 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB48_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: beqz t2, .LBB48_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 @@ -4462,15 +4462,15 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB48_18 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB48_18 ; RV64ZVE32F-NEXT: .LBB48_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB48_19 ; RV64ZVE32F-NEXT: .LBB48_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB48_16 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB48_16 ; RV64ZVE32F-NEXT: .LBB48_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -4478,8 +4478,8 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB48_19: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) @@ -4519,35 +4519,35 @@ 
define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB49_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB49_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB49_8 ; RV32ZVE32F-NEXT: .LBB49_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB49_9 +; RV32ZVE32F-NEXT: lw a5, 12(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB49_9 ; RV32ZVE32F-NEXT: .LBB49_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a2) +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB49_10 ; RV32ZVE32F-NEXT: .LBB49_4: ; RV32ZVE32F-NEXT: lw t1, 28(a2) ; RV32ZVE32F-NEXT: lw t2, 24(a2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB49_11 ; RV32ZVE32F-NEXT: .LBB49_5: ; RV32ZVE32F-NEXT: lw t3, 36(a2) ; RV32ZVE32F-NEXT: lw t4, 32(a2) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB49_12 ; RV32ZVE32F-NEXT: .LBB49_6: ; RV32ZVE32F-NEXT: lw t5, 44(a2) @@ -4556,23 +4556,23 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: .LBB49_7: ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB49_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB49_2 ; RV32ZVE32F-NEXT: .LBB49_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB49_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB49_3 ; RV32ZVE32F-NEXT: .LBB49_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB49_4 ; RV32ZVE32F-NEXT: .LBB49_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -4580,7 +4580,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB49_5 ; RV32ZVE32F-NEXT: .LBB49_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, 
e32, m2, ta, ma @@ -4588,7 +4588,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB49_6 ; RV32ZVE32F-NEXT: .LBB49_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4603,7 +4603,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB49_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4611,30 +4611,30 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB49_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB49_17 ; RV32ZVE32F-NEXT: .LBB49_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a4, 60(a2) ; RV32ZVE32F-NEXT: lw a2, 56(a2) ; RV32ZVE32F-NEXT: j .LBB49_18 ; RV32ZVE32F-NEXT: .LBB49_16: ; RV32ZVE32F-NEXT: lw s0, 52(a2) ; RV32ZVE32F-NEXT: lw s1, 48(a2) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB49_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB49_15 ; RV32ZVE32F-NEXT: .LBB49_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a4, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB49_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -4644,7 +4644,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -4653,35 +4653,35 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB49_3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB49_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB49_4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: slli 
a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB49_4 ; RV64ZVE32F-NEXT: .LBB49_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: j .LBB49_5 ; RV64ZVE32F-NEXT: .LBB49_3: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB49_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB49_2 ; RV64ZVE32F-NEXT: .LBB49_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a5, v9 +; RV64ZVE32F-NEXT: slli a5, a5, 3 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) ; RV64ZVE32F-NEXT: .LBB49_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB49_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 @@ -4689,22 +4689,22 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB49_11 ; RV64ZVE32F-NEXT: .LBB49_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB49_12 ; RV64ZVE32F-NEXT: .LBB49_8: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB49_13 ; RV64ZVE32F-NEXT: .LBB49_9: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB49_14 ; RV64ZVE32F-NEXT: .LBB49_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB49_7 ; RV64ZVE32F-NEXT: .LBB49_11: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -4712,14 +4712,14 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB49_8 ; RV64ZVE32F-NEXT: .LBB49_12: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB49_9 ; RV64ZVE32F-NEXT: .LBB49_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 @@ -4728,7 +4728,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB49_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: beqz t2, .LBB49_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 @@ -4736,15 +4736,15 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; 
RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB49_18 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB49_18 ; RV64ZVE32F-NEXT: .LBB49_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB49_19 ; RV64ZVE32F-NEXT: .LBB49_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB49_16 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB49_16 ; RV64ZVE32F-NEXT: .LBB49_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -4752,8 +4752,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB49_19: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) @@ -4795,35 +4795,35 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB50_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB50_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB50_8 ; RV32ZVE32F-NEXT: .LBB50_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB50_9 +; RV32ZVE32F-NEXT: lw a5, 12(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB50_9 ; RV32ZVE32F-NEXT: .LBB50_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a2) +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB50_10 ; RV32ZVE32F-NEXT: .LBB50_4: ; RV32ZVE32F-NEXT: lw t1, 28(a2) ; RV32ZVE32F-NEXT: lw t2, 24(a2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB50_11 ; RV32ZVE32F-NEXT: .LBB50_5: ; RV32ZVE32F-NEXT: lw t3, 36(a2) ; RV32ZVE32F-NEXT: lw t4, 32(a2) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB50_12 ; RV32ZVE32F-NEXT: .LBB50_6: ; RV32ZVE32F-NEXT: lw t5, 44(a2) @@ -4832,23 +4832,23 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: .LBB50_7: ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB50_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB50_2 ; RV32ZVE32F-NEXT: .LBB50_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB50_3 +; 
RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB50_3 ; RV32ZVE32F-NEXT: .LBB50_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB50_4 ; RV32ZVE32F-NEXT: .LBB50_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -4856,7 +4856,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB50_5 ; RV32ZVE32F-NEXT: .LBB50_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4864,7 +4864,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB50_6 ; RV32ZVE32F-NEXT: .LBB50_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4879,7 +4879,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB50_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4887,30 +4887,30 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB50_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB50_17 ; RV32ZVE32F-NEXT: .LBB50_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a4, 60(a2) ; RV32ZVE32F-NEXT: lw a2, 56(a2) ; RV32ZVE32F-NEXT: j .LBB50_18 ; RV32ZVE32F-NEXT: .LBB50_16: ; RV32ZVE32F-NEXT: lw s0, 52(a2) ; RV32ZVE32F-NEXT: lw s1, 48(a2) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB50_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB50_15 ; RV32ZVE32F-NEXT: .LBB50_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a4, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB50_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -4920,7 +4920,7 @@ define <8 x i64> 
@mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -4929,37 +4929,37 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB50_3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB50_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: andi a3, a3, 255 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB50_4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: andi a4, a4, 255 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB50_4 ; RV64ZVE32F-NEXT: .LBB50_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: j .LBB50_5 ; RV64ZVE32F-NEXT: .LBB50_3: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB50_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB50_2 ; RV64ZVE32F-NEXT: .LBB50_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: andi a4, a4, 255 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a5, v9 +; RV64ZVE32F-NEXT: andi a5, a5, 255 +; RV64ZVE32F-NEXT: slli a5, a5, 3 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) ; RV64ZVE32F-NEXT: .LBB50_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB50_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 @@ -4968,22 +4968,22 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB50_11 ; RV64ZVE32F-NEXT: .LBB50_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB50_12 ; RV64ZVE32F-NEXT: .LBB50_8: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB50_13 ; RV64ZVE32F-NEXT: .LBB50_9: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB50_14 ; RV64ZVE32F-NEXT: .LBB50_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB50_7 ; RV64ZVE32F-NEXT: .LBB50_11: # 
%cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -4992,7 +4992,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB50_8 ; RV64ZVE32F-NEXT: .LBB50_12: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s t0, v9 @@ -5000,7 +5000,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB50_9 ; RV64ZVE32F-NEXT: .LBB50_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 @@ -5010,7 +5010,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB50_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: beqz t2, .LBB50_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 @@ -5019,15 +5019,15 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB50_18 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB50_18 ; RV64ZVE32F-NEXT: .LBB50_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB50_19 ; RV64ZVE32F-NEXT: .LBB50_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB50_16 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB50_16 ; RV64ZVE32F-NEXT: .LBB50_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -5036,8 +5036,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB50_19: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) @@ -5078,35 +5078,35 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB51_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB51_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB51_8 ; RV32ZVE32F-NEXT: .LBB51_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB51_9 +; RV32ZVE32F-NEXT: lw a5, 12(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB51_9 ; 
RV32ZVE32F-NEXT: .LBB51_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a2) +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB51_10 ; RV32ZVE32F-NEXT: .LBB51_4: ; RV32ZVE32F-NEXT: lw t1, 28(a2) ; RV32ZVE32F-NEXT: lw t2, 24(a2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB51_11 ; RV32ZVE32F-NEXT: .LBB51_5: ; RV32ZVE32F-NEXT: lw t3, 36(a2) ; RV32ZVE32F-NEXT: lw t4, 32(a2) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB51_12 ; RV32ZVE32F-NEXT: .LBB51_6: ; RV32ZVE32F-NEXT: lw t5, 44(a2) @@ -5115,23 +5115,23 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: .LBB51_7: ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB51_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB51_2 ; RV32ZVE32F-NEXT: .LBB51_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB51_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB51_3 ; RV32ZVE32F-NEXT: .LBB51_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB51_4 ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -5139,7 +5139,7 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB51_5 ; RV32ZVE32F-NEXT: .LBB51_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5147,7 +5147,7 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB51_6 ; RV32ZVE32F-NEXT: .LBB51_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5162,7 +5162,7 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB51_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5170,30 +5170,30 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) 
; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB51_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB51_17 ; RV32ZVE32F-NEXT: .LBB51_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a4, 60(a2) ; RV32ZVE32F-NEXT: lw a2, 56(a2) ; RV32ZVE32F-NEXT: j .LBB51_18 ; RV32ZVE32F-NEXT: .LBB51_16: ; RV32ZVE32F-NEXT: lw s0, 52(a2) ; RV32ZVE32F-NEXT: lw s1, 48(a2) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB51_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB51_15 ; RV32ZVE32F-NEXT: .LBB51_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a4, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB51_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -5203,7 +5203,7 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -5212,36 +5212,36 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-LABEL: mgather_baseidx_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB51_3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB51_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB51_4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB51_4 ; RV64ZVE32F-NEXT: .LBB51_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: j .LBB51_5 ; RV64ZVE32F-NEXT: .LBB51_3: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB51_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB51_2 ; RV64ZVE32F-NEXT: .LBB51_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a5, v9 +; RV64ZVE32F-NEXT: slli a5, a5, 3 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) ; RV64ZVE32F-NEXT: .LBB51_5: # %else2 ; 
RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB51_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 @@ -5249,22 +5249,22 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB51_11 ; RV64ZVE32F-NEXT: .LBB51_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB51_12 ; RV64ZVE32F-NEXT: .LBB51_8: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB51_13 ; RV64ZVE32F-NEXT: .LBB51_9: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB51_14 ; RV64ZVE32F-NEXT: .LBB51_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB51_7 ; RV64ZVE32F-NEXT: .LBB51_11: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -5272,14 +5272,14 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB51_8 ; RV64ZVE32F-NEXT: .LBB51_12: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB51_9 ; RV64ZVE32F-NEXT: .LBB51_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 @@ -5288,7 +5288,7 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB51_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: beqz t2, .LBB51_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 @@ -5296,15 +5296,15 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB51_18 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB51_18 ; RV64ZVE32F-NEXT: .LBB51_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB51_19 ; RV64ZVE32F-NEXT: .LBB51_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB51_16 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB51_16 ; RV64ZVE32F-NEXT: .LBB51_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -5312,8 +5312,8 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB51_19: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; 
RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) @@ -5353,35 +5353,35 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB52_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB52_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB52_8 ; RV32ZVE32F-NEXT: .LBB52_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB52_9 +; RV32ZVE32F-NEXT: lw a5, 12(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB52_9 ; RV32ZVE32F-NEXT: .LBB52_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a2) +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB52_10 ; RV32ZVE32F-NEXT: .LBB52_4: ; RV32ZVE32F-NEXT: lw t1, 28(a2) ; RV32ZVE32F-NEXT: lw t2, 24(a2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB52_11 ; RV32ZVE32F-NEXT: .LBB52_5: ; RV32ZVE32F-NEXT: lw t3, 36(a2) ; RV32ZVE32F-NEXT: lw t4, 32(a2) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB52_12 ; RV32ZVE32F-NEXT: .LBB52_6: ; RV32ZVE32F-NEXT: lw t5, 44(a2) @@ -5390,23 +5390,23 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: .LBB52_7: ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB52_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB52_2 ; RV32ZVE32F-NEXT: .LBB52_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB52_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB52_3 ; RV32ZVE32F-NEXT: .LBB52_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB52_4 ; RV32ZVE32F-NEXT: .LBB52_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -5414,7 +5414,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 
+; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB52_5 ; RV32ZVE32F-NEXT: .LBB52_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5422,7 +5422,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB52_6 ; RV32ZVE32F-NEXT: .LBB52_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5437,7 +5437,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB52_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5445,30 +5445,30 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB52_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB52_17 ; RV32ZVE32F-NEXT: .LBB52_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a4, 60(a2) ; RV32ZVE32F-NEXT: lw a2, 56(a2) ; RV32ZVE32F-NEXT: j .LBB52_18 ; RV32ZVE32F-NEXT: .LBB52_16: ; RV32ZVE32F-NEXT: lw s0, 52(a2) ; RV32ZVE32F-NEXT: lw s1, 48(a2) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB52_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB52_15 ; RV32ZVE32F-NEXT: .LBB52_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a4, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB52_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -5478,7 +5478,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -5487,36 +5487,36 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB52_3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB52_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: 
slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB52_4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB52_4 ; RV64ZVE32F-NEXT: .LBB52_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: j .LBB52_5 ; RV64ZVE32F-NEXT: .LBB52_3: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB52_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB52_2 ; RV64ZVE32F-NEXT: .LBB52_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a5, v9 +; RV64ZVE32F-NEXT: slli a5, a5, 3 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) ; RV64ZVE32F-NEXT: .LBB52_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB52_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 @@ -5524,22 +5524,22 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB52_11 ; RV64ZVE32F-NEXT: .LBB52_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB52_12 ; RV64ZVE32F-NEXT: .LBB52_8: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB52_13 ; RV64ZVE32F-NEXT: .LBB52_9: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB52_14 ; RV64ZVE32F-NEXT: .LBB52_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB52_7 ; RV64ZVE32F-NEXT: .LBB52_11: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -5547,14 +5547,14 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB52_8 ; RV64ZVE32F-NEXT: .LBB52_12: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB52_9 ; RV64ZVE32F-NEXT: .LBB52_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 @@ -5563,7 +5563,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB52_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; 
RV64ZVE32F-NEXT: beqz t2, .LBB52_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 @@ -5571,15 +5571,15 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB52_18 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB52_18 ; RV64ZVE32F-NEXT: .LBB52_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB52_19 ; RV64ZVE32F-NEXT: .LBB52_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB52_16 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB52_16 ; RV64ZVE32F-NEXT: .LBB52_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -5587,8 +5587,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB52_19: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) @@ -5630,35 +5630,35 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB53_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB53_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB53_8 ; RV32ZVE32F-NEXT: .LBB53_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB53_9 +; RV32ZVE32F-NEXT: lw a5, 12(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB53_9 ; RV32ZVE32F-NEXT: .LBB53_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a2) +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB53_10 ; RV32ZVE32F-NEXT: .LBB53_4: ; RV32ZVE32F-NEXT: lw t1, 28(a2) ; RV32ZVE32F-NEXT: lw t2, 24(a2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB53_11 ; RV32ZVE32F-NEXT: .LBB53_5: ; RV32ZVE32F-NEXT: lw t3, 36(a2) ; RV32ZVE32F-NEXT: lw t4, 32(a2) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB53_12 ; RV32ZVE32F-NEXT: .LBB53_6: ; RV32ZVE32F-NEXT: lw t5, 44(a2) @@ -5667,23 +5667,23 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: .LBB53_7: ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB53_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB53_2 ; RV32ZVE32F-NEXT: .LBB53_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, 
ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB53_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB53_3 ; RV32ZVE32F-NEXT: .LBB53_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB53_4 ; RV32ZVE32F-NEXT: .LBB53_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -5691,7 +5691,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB53_5 ; RV32ZVE32F-NEXT: .LBB53_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5699,7 +5699,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB53_6 ; RV32ZVE32F-NEXT: .LBB53_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5714,7 +5714,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB53_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5722,30 +5722,30 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB53_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB53_17 ; RV32ZVE32F-NEXT: .LBB53_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a4, 60(a2) ; RV32ZVE32F-NEXT: lw a2, 56(a2) ; RV32ZVE32F-NEXT: j .LBB53_18 ; RV32ZVE32F-NEXT: .LBB53_16: ; RV32ZVE32F-NEXT: lw s0, 52(a2) ; RV32ZVE32F-NEXT: lw s1, 48(a2) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB53_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB53_15 ; RV32ZVE32F-NEXT: .LBB53_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a4, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB53_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; 
RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -5755,7 +5755,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -5763,119 +5763,119 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; ; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lui a5, 16 +; RV64ZVE32F-NEXT: lui a3, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a3, a6, 1 -; RV64ZVE32F-NEXT: addiw a5, a5, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB53_3 +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a4, a5, 1 +; RV64ZVE32F-NEXT: addiw a3, a3, -1 +; RV64ZVE32F-NEXT: beqz a4, .LBB53_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a5 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a6, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB53_4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: and a4, a4, a3 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a6, a5, 2 +; RV64ZVE32F-NEXT: bnez a6, .LBB53_4 ; RV64ZVE32F-NEXT: .LBB53_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a6, 8(a2) ; RV64ZVE32F-NEXT: j .LBB53_5 ; RV64ZVE32F-NEXT: .LBB53_3: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a6, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB53_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a6, a5, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB53_2 ; RV64ZVE32F-NEXT: .LBB53_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: and a4, a4, a5 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: and a6, a6, a3 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: .LBB53_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: andi a7, a5, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz a7, .LBB53_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a7, v8 -; RV64ZVE32F-NEXT: and a7, a7, a5 +; RV64ZVE32F-NEXT: and a7, a7, a3 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a6, 8 +; RV64ZVE32F-NEXT: andi t0, a5, 8 ; RV64ZVE32F-NEXT: bnez t0, .LBB53_11 ; RV64ZVE32F-NEXT: .LBB53_7: ; RV64ZVE32F-NEXT: ld t0, 24(a2) -; RV64ZVE32F-NEXT: andi t1, a6, 16 +; RV64ZVE32F-NEXT: andi t1, a5, 16 ; RV64ZVE32F-NEXT: bnez t1, .LBB53_12 ; RV64ZVE32F-NEXT: .LBB53_8: ; RV64ZVE32F-NEXT: ld t1, 32(a2) -; 
RV64ZVE32F-NEXT: andi t2, a6, 32 +; RV64ZVE32F-NEXT: andi t2, a5, 32 ; RV64ZVE32F-NEXT: bnez t2, .LBB53_13 ; RV64ZVE32F-NEXT: .LBB53_9: ; RV64ZVE32F-NEXT: ld t2, 40(a2) ; RV64ZVE32F-NEXT: j .LBB53_14 ; RV64ZVE32F-NEXT: .LBB53_10: ; RV64ZVE32F-NEXT: ld a7, 16(a2) -; RV64ZVE32F-NEXT: andi t0, a6, 8 +; RV64ZVE32F-NEXT: andi t0, a5, 8 ; RV64ZVE32F-NEXT: beqz t0, .LBB53_7 ; RV64ZVE32F-NEXT: .LBB53_11: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s t0, v8 -; RV64ZVE32F-NEXT: and t0, t0, a5 +; RV64ZVE32F-NEXT: and t0, t0, a3 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a6, 16 +; RV64ZVE32F-NEXT: andi t1, a5, 16 ; RV64ZVE32F-NEXT: beqz t1, .LBB53_8 ; RV64ZVE32F-NEXT: .LBB53_12: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s t1, v9 -; RV64ZVE32F-NEXT: and t1, t1, a5 +; RV64ZVE32F-NEXT: and t1, t1, a3 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: andi t2, a6, 32 +; RV64ZVE32F-NEXT: andi t2, a5, 32 ; RV64ZVE32F-NEXT: beqz t2, .LBB53_9 ; RV64ZVE32F-NEXT: .LBB53_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 -; RV64ZVE32F-NEXT: and t2, t2, a5 +; RV64ZVE32F-NEXT: and t2, t2, a3 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: .LBB53_14: # %else14 -; RV64ZVE32F-NEXT: andi t3, a6, 64 +; RV64ZVE32F-NEXT: andi t3, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: beqz t3, .LBB53_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t3, v8 -; RV64ZVE32F-NEXT: and t3, t3, a5 +; RV64ZVE32F-NEXT: and t3, t3, a3 ; RV64ZVE32F-NEXT: slli t3, t3, 3 ; RV64ZVE32F-NEXT: add t3, a1, t3 ; RV64ZVE32F-NEXT: ld t3, 0(t3) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB53_18 +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB53_18 ; RV64ZVE32F-NEXT: .LBB53_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB53_19 ; RV64ZVE32F-NEXT: .LBB53_17: ; RV64ZVE32F-NEXT: ld t3, 48(a2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB53_16 +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB53_16 ; RV64ZVE32F-NEXT: .LBB53_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: and a2, a2, a5 +; RV64ZVE32F-NEXT: and a2, a2, a3 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB53_19: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a6, 8(a0) ; RV64ZVE32F-NEXT: sd a7, 16(a0) ; RV64ZVE32F-NEXT: sd t0, 24(a0) ; RV64ZVE32F-NEXT: sd t1, 32(a0) @@ -5914,35 +5914,35 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB54_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez 
a4, .LBB54_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB54_8 ; RV32ZVE32F-NEXT: .LBB54_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB54_9 +; RV32ZVE32F-NEXT: lw a5, 12(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB54_9 ; RV32ZVE32F-NEXT: .LBB54_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a2) +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB54_10 ; RV32ZVE32F-NEXT: .LBB54_4: ; RV32ZVE32F-NEXT: lw t1, 28(a2) ; RV32ZVE32F-NEXT: lw t2, 24(a2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB54_11 ; RV32ZVE32F-NEXT: .LBB54_5: ; RV32ZVE32F-NEXT: lw t3, 36(a2) ; RV32ZVE32F-NEXT: lw t4, 32(a2) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB54_12 ; RV32ZVE32F-NEXT: .LBB54_6: ; RV32ZVE32F-NEXT: lw t5, 44(a2) @@ -5951,23 +5951,23 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: .LBB54_7: ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB54_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB54_2 ; RV32ZVE32F-NEXT: .LBB54_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB54_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB54_3 ; RV32ZVE32F-NEXT: .LBB54_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB54_4 ; RV32ZVE32F-NEXT: .LBB54_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -5975,7 +5975,7 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB54_5 ; RV32ZVE32F-NEXT: .LBB54_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5983,7 +5983,7 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB54_6 ; RV32ZVE32F-NEXT: .LBB54_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5998,7 +5998,7 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: 
.cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB54_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6006,30 +6006,30 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB54_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB54_17 ; RV32ZVE32F-NEXT: .LBB54_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a4, 60(a2) ; RV32ZVE32F-NEXT: lw a2, 56(a2) ; RV32ZVE32F-NEXT: j .LBB54_18 ; RV32ZVE32F-NEXT: .LBB54_16: ; RV32ZVE32F-NEXT: lw s0, 52(a2) ; RV32ZVE32F-NEXT: lw s1, 48(a2) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB54_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB54_15 ; RV32ZVE32F-NEXT: .LBB54_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a4, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB54_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -6039,7 +6039,7 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -6048,36 +6048,36 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV64ZVE32F-LABEL: mgather_baseidx_v8i32_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB54_3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB54_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB54_4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB54_4 ; RV64ZVE32F-NEXT: .LBB54_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: j .LBB54_5 ; RV64ZVE32F-NEXT: .LBB54_3: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB54_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB54_2 ; RV64ZVE32F-NEXT: .LBB54_4: # %cond.load1 ; RV64ZVE32F-NEXT: 
vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v10 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a5, v10 +; RV64ZVE32F-NEXT: slli a5, a5, 3 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) ; RV64ZVE32F-NEXT: .LBB54_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB54_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 @@ -6085,22 +6085,22 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB54_11 ; RV64ZVE32F-NEXT: .LBB54_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB54_12 ; RV64ZVE32F-NEXT: .LBB54_8: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB54_13 ; RV64ZVE32F-NEXT: .LBB54_9: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB54_14 ; RV64ZVE32F-NEXT: .LBB54_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB54_7 ; RV64ZVE32F-NEXT: .LBB54_11: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -6108,14 +6108,14 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB54_8 ; RV64ZVE32F-NEXT: .LBB54_12: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s t0, v10 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB54_9 ; RV64ZVE32F-NEXT: .LBB54_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 @@ -6124,7 +6124,7 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB54_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: beqz t2, .LBB54_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 @@ -6132,15 +6132,15 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB54_18 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB54_18 ; RV64ZVE32F-NEXT: .LBB54_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB54_19 ; RV64ZVE32F-NEXT: .LBB54_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB54_16 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB54_16 ; RV64ZVE32F-NEXT: .LBB54_18: # 
%cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -6148,8 +6148,8 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB54_19: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) @@ -6187,35 +6187,35 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB55_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB55_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB55_8 ; RV32ZVE32F-NEXT: .LBB55_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB55_9 +; RV32ZVE32F-NEXT: lw a5, 12(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB55_9 ; RV32ZVE32F-NEXT: .LBB55_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a2) +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB55_10 ; RV32ZVE32F-NEXT: .LBB55_4: ; RV32ZVE32F-NEXT: lw t1, 28(a2) ; RV32ZVE32F-NEXT: lw t2, 24(a2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB55_11 ; RV32ZVE32F-NEXT: .LBB55_5: ; RV32ZVE32F-NEXT: lw t3, 36(a2) ; RV32ZVE32F-NEXT: lw t4, 32(a2) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB55_12 ; RV32ZVE32F-NEXT: .LBB55_6: ; RV32ZVE32F-NEXT: lw t5, 44(a2) @@ -6224,23 +6224,23 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: .LBB55_7: ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB55_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB55_2 ; RV32ZVE32F-NEXT: .LBB55_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB55_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB55_3 ; RV32ZVE32F-NEXT: .LBB55_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; 
RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB55_4 ; RV32ZVE32F-NEXT: .LBB55_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -6248,7 +6248,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB55_5 ; RV32ZVE32F-NEXT: .LBB55_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6256,7 +6256,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB55_6 ; RV32ZVE32F-NEXT: .LBB55_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6271,7 +6271,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB55_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6279,30 +6279,30 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB55_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB55_17 ; RV32ZVE32F-NEXT: .LBB55_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a4, 60(a2) ; RV32ZVE32F-NEXT: lw a2, 56(a2) ; RV32ZVE32F-NEXT: j .LBB55_18 ; RV32ZVE32F-NEXT: .LBB55_16: ; RV32ZVE32F-NEXT: lw s0, 52(a2) ; RV32ZVE32F-NEXT: lw s1, 48(a2) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB55_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB55_15 ; RV32ZVE32F-NEXT: .LBB55_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a4, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB55_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -6312,7 +6312,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -6321,36 +6321,36 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i32_v8i64: ; RV64ZVE32F: # %bb.0: ; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB55_3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB55_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB55_4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB55_4 ; RV64ZVE32F-NEXT: .LBB55_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: j .LBB55_5 ; RV64ZVE32F-NEXT: .LBB55_3: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB55_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB55_2 ; RV64ZVE32F-NEXT: .LBB55_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v10 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a5, v10 +; RV64ZVE32F-NEXT: slli a5, a5, 3 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) ; RV64ZVE32F-NEXT: .LBB55_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB55_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 @@ -6358,22 +6358,22 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB55_11 ; RV64ZVE32F-NEXT: .LBB55_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB55_12 ; RV64ZVE32F-NEXT: .LBB55_8: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB55_13 ; RV64ZVE32F-NEXT: .LBB55_9: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB55_14 ; RV64ZVE32F-NEXT: .LBB55_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB55_7 ; RV64ZVE32F-NEXT: .LBB55_11: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -6381,14 +6381,14 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB55_8 ; RV64ZVE32F-NEXT: .LBB55_12: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s t0, v10 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: beqz t1, 
.LBB55_9 ; RV64ZVE32F-NEXT: .LBB55_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 @@ -6397,7 +6397,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB55_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: beqz t2, .LBB55_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 @@ -6405,15 +6405,15 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB55_18 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB55_18 ; RV64ZVE32F-NEXT: .LBB55_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB55_19 ; RV64ZVE32F-NEXT: .LBB55_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB55_16 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB55_16 ; RV64ZVE32F-NEXT: .LBB55_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -6421,8 +6421,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB55_19: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) @@ -6461,35 +6461,35 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB56_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB56_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB56_8 ; RV32ZVE32F-NEXT: .LBB56_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB56_9 +; RV32ZVE32F-NEXT: lw a5, 12(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB56_9 ; RV32ZVE32F-NEXT: .LBB56_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a2) +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB56_10 ; RV32ZVE32F-NEXT: .LBB56_4: ; RV32ZVE32F-NEXT: lw t1, 28(a2) ; RV32ZVE32F-NEXT: lw t2, 24(a2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB56_11 ; RV32ZVE32F-NEXT: .LBB56_5: ; RV32ZVE32F-NEXT: lw t3, 36(a2) ; RV32ZVE32F-NEXT: lw t4, 32(a2) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB56_12 ; RV32ZVE32F-NEXT: .LBB56_6: ; RV32ZVE32F-NEXT: lw t5, 44(a2) @@ -6498,23 
+6498,23 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: .LBB56_7: ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB56_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB56_2 ; RV32ZVE32F-NEXT: .LBB56_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB56_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB56_3 ; RV32ZVE32F-NEXT: .LBB56_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB56_4 ; RV32ZVE32F-NEXT: .LBB56_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -6522,7 +6522,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB56_5 ; RV32ZVE32F-NEXT: .LBB56_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6530,7 +6530,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB56_6 ; RV32ZVE32F-NEXT: .LBB56_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6545,7 +6545,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB56_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6553,30 +6553,30 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB56_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB56_17 ; RV32ZVE32F-NEXT: .LBB56_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a4, 60(a2) ; RV32ZVE32F-NEXT: lw a2, 56(a2) ; RV32ZVE32F-NEXT: j .LBB56_18 ; RV32ZVE32F-NEXT: .LBB56_16: ; RV32ZVE32F-NEXT: lw s0, 52(a2) ; RV32ZVE32F-NEXT: lw s1, 48(a2) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB56_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB56_15 ; RV32ZVE32F-NEXT: .LBB56_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; 
RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a4, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB56_18: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -6586,7 +6586,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -6595,38 +6595,38 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i32_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB56_3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB56_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 32 -; RV64ZVE32F-NEXT: srli a3, a3, 29 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB56_4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: slli a4, a4, 32 +; RV64ZVE32F-NEXT: srli a4, a4, 29 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB56_4 ; RV64ZVE32F-NEXT: .LBB56_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: j .LBB56_5 ; RV64ZVE32F-NEXT: .LBB56_3: -; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB56_2 +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: andi a5, a3, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB56_2 ; RV64ZVE32F-NEXT: .LBB56_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v10 -; RV64ZVE32F-NEXT: slli a4, a4, 32 -; RV64ZVE32F-NEXT: srli a4, a4, 29 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a5, v10 +; RV64ZVE32F-NEXT: slli a5, a5, 32 +; RV64ZVE32F-NEXT: srli a5, a5, 29 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) ; RV64ZVE32F-NEXT: .LBB56_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: andi a6, a3, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB56_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 @@ -6635,22 +6635,22 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: srli a6, a6, 29 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; 
RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB56_11 ; RV64ZVE32F-NEXT: .LBB56_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB56_12 ; RV64ZVE32F-NEXT: .LBB56_8: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB56_13 ; RV64ZVE32F-NEXT: .LBB56_9: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB56_14 ; RV64ZVE32F-NEXT: .LBB56_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: andi a7, a3, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB56_7 ; RV64ZVE32F-NEXT: .LBB56_11: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -6659,7 +6659,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: srli a7, a7, 29 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: andi t0, a3, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB56_8 ; RV64ZVE32F-NEXT: .LBB56_12: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s t0, v10 @@ -6667,7 +6667,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: srli t0, t0, 29 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: andi t1, a3, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB56_9 ; RV64ZVE32F-NEXT: .LBB56_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 @@ -6677,7 +6677,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB56_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: andi t2, a3, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: beqz t2, .LBB56_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 @@ -6686,15 +6686,15 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: srli t2, t2, 29 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB56_18 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB56_18 ; RV64ZVE32F-NEXT: .LBB56_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB56_19 ; RV64ZVE32F-NEXT: .LBB56_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB56_16 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB56_16 ; RV64ZVE32F-NEXT: .LBB56_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -6703,8 +6703,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB56_19: # %else20 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) @@ -6759,35 +6759,35 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 +; 
RV32ZVE32F-NEXT: andi a1, a4, 1 ; RV32ZVE32F-NEXT: beqz a1, .LBB57_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: bnez a4, .LBB57_8 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: bnez a5, .LBB57_8 ; RV32ZVE32F-NEXT: .LBB57_2: -; RV32ZVE32F-NEXT: lw a4, 12(a3) -; RV32ZVE32F-NEXT: lw a5, 8(a3) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB57_9 +; RV32ZVE32F-NEXT: lw a5, 12(a3) +; RV32ZVE32F-NEXT: lw a6, 8(a3) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB57_9 ; RV32ZVE32F-NEXT: .LBB57_3: -; RV32ZVE32F-NEXT: lw a6, 20(a3) -; RV32ZVE32F-NEXT: lw a7, 16(a3) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 20(a3) +; RV32ZVE32F-NEXT: lw t0, 16(a3) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB57_10 ; RV32ZVE32F-NEXT: .LBB57_4: ; RV32ZVE32F-NEXT: lw t1, 28(a3) ; RV32ZVE32F-NEXT: lw t2, 24(a3) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB57_11 ; RV32ZVE32F-NEXT: .LBB57_5: ; RV32ZVE32F-NEXT: lw t3, 36(a3) ; RV32ZVE32F-NEXT: lw t4, 32(a3) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB57_12 ; RV32ZVE32F-NEXT: .LBB57_6: ; RV32ZVE32F-NEXT: lw t5, 44(a3) @@ -6796,23 +6796,23 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: .LBB57_7: ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a2, 0(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 -; RV32ZVE32F-NEXT: beqz a4, .LBB57_2 +; RV32ZVE32F-NEXT: andi a5, a4, 2 +; RV32ZVE32F-NEXT: beqz a5, .LBB57_2 ; RV32ZVE32F-NEXT: .LBB57_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB57_3 +; RV32ZVE32F-NEXT: vmv.x.s a6, v10 +; RV32ZVE32F-NEXT: lw a5, 4(a6) +; RV32ZVE32F-NEXT: lw a6, 0(a6) +; RV32ZVE32F-NEXT: andi a7, a4, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB57_3 ; RV32ZVE32F-NEXT: .LBB57_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t1, a4, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB57_4 ; RV32ZVE32F-NEXT: .LBB57_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -6820,7 +6820,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 4(t2) ; RV32ZVE32F-NEXT: lw t2, 0(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a4, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB57_5 ; RV32ZVE32F-NEXT: .LBB57_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6828,7 +6828,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 4(t4) ; RV32ZVE32F-NEXT: lw t4, 0(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a4, 32 ; RV32ZVE32F-NEXT: beqz 
t5, .LBB57_6 ; RV32ZVE32F-NEXT: .LBB57_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6843,7 +6843,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a4, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB57_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6851,30 +6851,30 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 4(s1) ; RV32ZVE32F-NEXT: lw s1, 0(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB57_17 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: bnez a4, .LBB57_17 ; RV32ZVE32F-NEXT: .LBB57_15: -; RV32ZVE32F-NEXT: lw t0, 60(a3) +; RV32ZVE32F-NEXT: lw a4, 60(a3) ; RV32ZVE32F-NEXT: lw a3, 56(a3) ; RV32ZVE32F-NEXT: j .LBB57_18 ; RV32ZVE32F-NEXT: .LBB57_16: ; RV32ZVE32F-NEXT: lw s0, 52(a3) ; RV32ZVE32F-NEXT: lw s1, 48(a3) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB57_15 +; RV32ZVE32F-NEXT: andi a4, a4, -128 +; RV32ZVE32F-NEXT: beqz a4, .LBB57_15 ; RV32ZVE32F-NEXT: .LBB57_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a3) +; RV32ZVE32F-NEXT: lw a4, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) ; RV32ZVE32F-NEXT: .LBB57_18: # %else20 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw t2, 24(a0) ; RV32ZVE32F-NEXT: sw t1, 28(a0) ; RV32ZVE32F-NEXT: sw t4, 32(a0) @@ -6884,7 +6884,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: sw s1, 48(a0) ; RV32ZVE32F-NEXT: sw s0, 52(a0) ; RV32ZVE32F-NEXT: sw a3, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -6893,97 +6893,97 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV64ZVE32F-LABEL: mgather_baseidx_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a4, a6, 1 -; RV64ZVE32F-NEXT: beqz a4, .LBB57_9 +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi a5, a4, 1 +; RV64ZVE32F-NEXT: beqz a5, .LBB57_9 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: ld a4, 0(a2) -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: andi a5, a6, 2 -; RV64ZVE32F-NEXT: bnez a5, .LBB57_10 +; RV64ZVE32F-NEXT: ld a5, 0(a2) +; RV64ZVE32F-NEXT: slli a5, a5, 3 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) +; RV64ZVE32F-NEXT: andi a6, a4, 2 +; RV64ZVE32F-NEXT: bnez a6, .LBB57_10 ; RV64ZVE32F-NEXT: .LBB57_2: -; RV64ZVE32F-NEXT: ld a5, 8(a3) -; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: ld a6, 8(a3) +; RV64ZVE32F-NEXT: andi a7, a4, 4 ; 
RV64ZVE32F-NEXT: bnez a7, .LBB57_11 ; RV64ZVE32F-NEXT: .LBB57_3: ; RV64ZVE32F-NEXT: ld a7, 16(a3) -; RV64ZVE32F-NEXT: andi t0, a6, 8 +; RV64ZVE32F-NEXT: andi t0, a4, 8 ; RV64ZVE32F-NEXT: bnez t0, .LBB57_12 ; RV64ZVE32F-NEXT: .LBB57_4: ; RV64ZVE32F-NEXT: ld t0, 24(a3) -; RV64ZVE32F-NEXT: andi t1, a6, 16 +; RV64ZVE32F-NEXT: andi t1, a4, 16 ; RV64ZVE32F-NEXT: bnez t1, .LBB57_13 ; RV64ZVE32F-NEXT: .LBB57_5: ; RV64ZVE32F-NEXT: ld t1, 32(a3) -; RV64ZVE32F-NEXT: andi t2, a6, 32 +; RV64ZVE32F-NEXT: andi t2, a4, 32 ; RV64ZVE32F-NEXT: bnez t2, .LBB57_14 ; RV64ZVE32F-NEXT: .LBB57_6: ; RV64ZVE32F-NEXT: ld t2, 40(a3) -; RV64ZVE32F-NEXT: andi t3, a6, 64 +; RV64ZVE32F-NEXT: andi t3, a4, 64 ; RV64ZVE32F-NEXT: bnez t3, .LBB57_15 ; RV64ZVE32F-NEXT: .LBB57_7: ; RV64ZVE32F-NEXT: ld t3, 48(a3) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB57_16 +; RV64ZVE32F-NEXT: andi a4, a4, -128 +; RV64ZVE32F-NEXT: bnez a4, .LBB57_16 ; RV64ZVE32F-NEXT: .LBB57_8: ; RV64ZVE32F-NEXT: ld a1, 56(a3) ; RV64ZVE32F-NEXT: j .LBB57_17 ; RV64ZVE32F-NEXT: .LBB57_9: -; RV64ZVE32F-NEXT: ld a4, 0(a3) -; RV64ZVE32F-NEXT: andi a5, a6, 2 -; RV64ZVE32F-NEXT: beqz a5, .LBB57_2 +; RV64ZVE32F-NEXT: ld a5, 0(a3) +; RV64ZVE32F-NEXT: andi a6, a4, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB57_2 ; RV64ZVE32F-NEXT: .LBB57_10: # %cond.load1 -; RV64ZVE32F-NEXT: ld a5, 8(a2) -; RV64ZVE32F-NEXT: slli a5, a5, 3 -; RV64ZVE32F-NEXT: add a5, a1, a5 -; RV64ZVE32F-NEXT: ld a5, 0(a5) -; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: ld a6, 8(a2) +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a7, a4, 4 ; RV64ZVE32F-NEXT: beqz a7, .LBB57_3 ; RV64ZVE32F-NEXT: .LBB57_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a7, 16(a2) ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a6, 8 +; RV64ZVE32F-NEXT: andi t0, a4, 8 ; RV64ZVE32F-NEXT: beqz t0, .LBB57_4 ; RV64ZVE32F-NEXT: .LBB57_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld t0, 24(a2) ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a6, 16 +; RV64ZVE32F-NEXT: andi t1, a4, 16 ; RV64ZVE32F-NEXT: beqz t1, .LBB57_5 ; RV64ZVE32F-NEXT: .LBB57_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld t1, 32(a2) ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: andi t2, a6, 32 +; RV64ZVE32F-NEXT: andi t2, a4, 32 ; RV64ZVE32F-NEXT: beqz t2, .LBB57_6 ; RV64ZVE32F-NEXT: .LBB57_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld t2, 40(a2) ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi t3, a6, 64 +; RV64ZVE32F-NEXT: andi t3, a4, 64 ; RV64ZVE32F-NEXT: beqz t3, .LBB57_7 ; RV64ZVE32F-NEXT: .LBB57_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld t3, 48(a2) ; RV64ZVE32F-NEXT: slli t3, t3, 3 ; RV64ZVE32F-NEXT: add t3, a1, t3 ; RV64ZVE32F-NEXT: ld t3, 0(t3) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB57_8 +; RV64ZVE32F-NEXT: andi a4, a4, -128 +; RV64ZVE32F-NEXT: beqz a4, .LBB57_8 ; RV64ZVE32F-NEXT: .LBB57_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a2, 56(a2) ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB57_17: # %else20 -; RV64ZVE32F-NEXT: sd a4, 0(a0) -; RV64ZVE32F-NEXT: sd a5, 8(a0) +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: sd a6, 8(a0) ; RV64ZVE32F-NEXT: sd a7, 16(a0) ; 
RV64ZVE32F-NEXT: sd t0, 24(a0) ; RV64ZVE32F-NEXT: sd t1, 32(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 60b61e889315cf..a5d22af7595722 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -3265,7 +3265,7 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV32ZVE32F-NEXT: lw a2, 56(a0) ; RV32ZVE32F-NEXT: lw a3, 52(a0) ; RV32ZVE32F-NEXT: lw a4, 48(a0) -; RV32ZVE32F-NEXT: lw a5, 44(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) ; RV32ZVE32F-NEXT: lw a7, 40(a0) ; RV32ZVE32F-NEXT: lw t0, 36(a0) ; RV32ZVE32F-NEXT: lw t1, 32(a0) @@ -3276,29 +3276,29 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a6, v0 -; RV32ZVE32F-NEXT: andi s1, a6, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v0 +; RV32ZVE32F-NEXT: andi s1, a5, 1 ; RV32ZVE32F-NEXT: bnez s1, .LBB41_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a0, a6, 2 +; RV32ZVE32F-NEXT: andi a0, a5, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_11 ; RV32ZVE32F-NEXT: .LBB41_2: # %else2 -; RV32ZVE32F-NEXT: andi a0, a6, 4 +; RV32ZVE32F-NEXT: andi a0, a5, 4 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_12 ; RV32ZVE32F-NEXT: .LBB41_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, a6, 8 +; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_13 ; RV32ZVE32F-NEXT: .LBB41_4: # %else6 -; RV32ZVE32F-NEXT: andi a0, a6, 16 +; RV32ZVE32F-NEXT: andi a0, a5, 16 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_14 ; RV32ZVE32F-NEXT: .LBB41_5: # %else8 -; RV32ZVE32F-NEXT: andi a0, a6, 32 +; RV32ZVE32F-NEXT: andi a0, a5, 32 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_15 ; RV32ZVE32F-NEXT: .LBB41_6: # %else10 -; RV32ZVE32F-NEXT: andi a0, a6, 64 +; RV32ZVE32F-NEXT: andi a0, a5, 64 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_16 ; RV32ZVE32F-NEXT: .LBB41_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, a6, -128 +; RV32ZVE32F-NEXT: andi a0, a5, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_9 ; RV32ZVE32F-NEXT: .LBB41_8: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -3319,7 +3319,7 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) -; RV32ZVE32F-NEXT: andi a0, a6, 2 +; RV32ZVE32F-NEXT: andi a0, a5, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_2 ; RV32ZVE32F-NEXT: .LBB41_11: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -3327,7 +3327,7 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw s0, 4(a0) ; RV32ZVE32F-NEXT: sw t6, 0(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 4 +; RV32ZVE32F-NEXT: andi a0, a5, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_3 ; RV32ZVE32F-NEXT: .LBB41_12: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -3335,7 +3335,7 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw t5, 0(a0) ; RV32ZVE32F-NEXT: sw t4, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 8 +; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_4 ; RV32ZVE32F-NEXT: .LBB41_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -3343,7 +3343,7 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, 
<8 x i1> %m) { ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw t3, 0(a0) ; RV32ZVE32F-NEXT: sw t2, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 16 +; RV32ZVE32F-NEXT: andi a0, a5, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_5 ; RV32ZVE32F-NEXT: .LBB41_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -3351,15 +3351,15 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw t1, 0(a0) ; RV32ZVE32F-NEXT: sw t0, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 32 +; RV32ZVE32F-NEXT: andi a0, a5, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_6 ; RV32ZVE32F-NEXT: .LBB41_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 64 +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_7 ; RV32ZVE32F-NEXT: .LBB41_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -3367,7 +3367,7 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a3, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, -128 +; RV32ZVE32F-NEXT: andi a0, a5, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_8 ; RV32ZVE32F-NEXT: j .LBB41_9 ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll index a1e81ea41c249b..e8eecb8b1e90c7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll @@ -365,17 +365,12 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a2) ; RV32-NEXT: addi a2, a2, 128 ; RV32-NEXT: vle64.v v16, (a2) -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: fcvt.d.w fa5, zero ; RV32-NEXT: vmfeq.vf v0, v8, fa5 ; RV32-NEXT: vle64.v v24, (a0) @@ -383,11 +378,6 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmfeq.vf v8, v16, fa5 ; RV32-NEXT: vse64.v v24, (a1), v0.t ; RV32-NEXT: addi a0, a1, 128 @@ -396,7 +386,7 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse64.v v8, (a0), v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -405,17 +395,12 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: slli a3, a3, 3 ; RV64-NEXT: sub sp, 
sp, a3 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a2) ; RV64-NEXT: addi a2, a2, 128 ; RV64-NEXT: vle64.v v16, (a2) -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: fmv.d.x fa5, zero ; RV64-NEXT: vmfeq.vf v0, v8, fa5 ; RV64-NEXT: vle64.v v24, (a0) @@ -423,11 +408,6 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vmfeq.vf v8, v16, fa5 ; RV64-NEXT: vse64.v v24, (a1), v0.t ; RV64-NEXT: addi a0, a1, 128 @@ -436,7 +416,7 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0), v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -472,18 +452,13 @@ define void @masked_store_v64f32(<64 x float>* %val_ptr, <64 x float>* %a, <64 x ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: slli a3, a3, 3 ; CHECK-NEXT: sub sp, sp, a3 ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a2, a2, 128 ; CHECK-NEXT: vle32.v v16, (a2) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: fmv.w.x fa5, zero ; CHECK-NEXT: vmfeq.vf v0, v8, fa5 ; CHECK-NEXT: vle32.v v24, (a0) @@ -491,11 +466,6 @@ define void @masked_store_v64f32(<64 x float>* %val_ptr, <64 x float>* %a, <64 x ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vf v8, v16, fa5 ; CHECK-NEXT: vse32.v v24, (a1), v0.t ; CHECK-NEXT: addi a0, a1, 128 @@ -504,7 +474,7 @@ define void @masked_store_v64f32(<64 x float>* %val_ptr, <64 x float>* %a, <64 x ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vse32.v v8, (a0), v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -521,18 +491,13 @@ define void @masked_store_v128f16(<128 x half>* %val_ptr, <128 x half>* %a, <128 ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: slli a3, a3, 3 ; CHECK-NEXT: sub sp, sp, a3 ; CHECK-NEXT: li a3, 64 ; CHECK-NEXT: vsetvli zero, a3, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a2) ; CHECK-NEXT: addi a2, a2, 128 ; CHECK-NEXT: vle16.v v16, (a2) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: fmv.h.x fa5, zero ; CHECK-NEXT: vmfeq.vf v0, v8, fa5 ; CHECK-NEXT: 
vle16.v v24, (a0) @@ -540,11 +505,6 @@ define void @masked_store_v128f16(<128 x half>* %val_ptr, <128 x half>* %a, <128 ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vf v8, v16, fa5 ; CHECK-NEXT: vse16.v v24, (a1), v0.t ; CHECK-NEXT: addi a0, a1, 128 @@ -553,7 +513,7 @@ define void @masked_store_v128f16(<128 x half>* %val_ptr, <128 x half>* %a, <128 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vse16.v v8, (a0), v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll index b3011d0f01cab1..69c59921006591 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll @@ -401,54 +401,41 @@ define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 18 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: addi a3, a2, 128 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a4, a3, 3 -; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vle64.v v0, (a2) +; RV32-NEXT: vle64.v v24, (a2) ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, 0 +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vmseq.vv v8, v0, v24 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vmseq.vv v1, v24, v8 ; RV32-NEXT: addi a2, a0, 128 -; RV32-NEXT: vle64.v v8, (a2) +; RV32-NEXT: vle64.v v24, (a2) ; RV32-NEXT: vle64.v v16, (a0) ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a2, a0, 3 -; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmseq.vv v0, v16, v24 +; RV32-NEXT: vmseq.vv v0, v16, v8 ; RV32-NEXT: addi a0, a1, 128 -; RV32-NEXT: vse64.v v8, (a0), v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vse64.v v24, (a0), v0.t +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vse64.v v8, (a1), v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 18 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -457,28 +444,18 @@ define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; RV64: # 
%bb.0: ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: slli a3, a3, 3 ; RV64-NEXT: sub sp, sp, a3 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a2) ; RV64-NEXT: addi a2, a2, 128 ; RV64-NEXT: vle64.v v16, (a2) -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vmseq.vi v0, v8, 0 ; RV64-NEXT: vle64.v v24, (a0) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vmseq.vi v8, v16, 0 ; RV64-NEXT: vse64.v v24, (a1), v0.t ; RV64-NEXT: addi a0, a1, 128 @@ -487,7 +464,7 @@ define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0), v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -540,29 +517,19 @@ define void @masked_store_v64i32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: slli a3, a3, 3 ; CHECK-NEXT: sub sp, sp, a3 ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a2, a2, 128 ; CHECK-NEXT: vle32.v v16, (a2) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmseq.vi v8, v16, 0 ; CHECK-NEXT: vse32.v v24, (a1), v0.t ; CHECK-NEXT: addi a0, a1, 128 @@ -571,7 +538,7 @@ define void @masked_store_v64i32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vse32.v v8, (a0), v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -606,29 +573,19 @@ define void @masked_store_v128i16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: slli a3, a3, 3 ; CHECK-NEXT: sub sp, sp, a3 ; CHECK-NEXT: li a3, 64 ; CHECK-NEXT: vsetvli zero, a3, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a2) ; CHECK-NEXT: addi a2, a2, 128 ; CHECK-NEXT: vle16.v v16, (a2) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vle16.v v24, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; 
CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmseq.vi v8, v16, 0 ; CHECK-NEXT: vse16.v v24, (a1), v0.t ; CHECK-NEXT: addi a0, a1, 128 @@ -637,7 +594,7 @@ define void @masked_store_v128i16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vse16.v v8, (a0), v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -654,29 +611,19 @@ define void @masked_store_v256i8(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: slli a3, a3, 3 ; CHECK-NEXT: sub sp, sp, a3 ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v8, (a2) ; CHECK-NEXT: addi a2, a2, 128 ; CHECK-NEXT: vle8.v v16, (a2) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmseq.vi v8, v16, 0 ; CHECK-NEXT: vse8.v v24, (a1), v0.t ; CHECK-NEXT: addi a0, a1, 128 @@ -685,7 +632,7 @@ define void @masked_store_v256i8(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vse8.v v8, (a0), v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index d9958f4aae3500..cba78368b2e7d9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -595,15 +595,7 @@ declare <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v25, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v2, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v1, v0, 2 @@ -615,43 +607,35 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; 
CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: fsflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v16, v24, v0.t +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t -; CHECK-NEXT: vmv.v.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index f2a1f2752cda00..bffd5d632a632c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -1565,26 +1565,26 @@ define i64 @vwreduce_add_v64i64(ptr %x) { ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v8, 16 -; RV32-NEXT: vslidedown.vi v0, v16, 16 +; RV32-NEXT: vslidedown.vi v0, v8, 16 +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv4r.v v8, v0 -; RV32-NEXT: vwadd.vv v0, v24, v8 +; RV32-NEXT: vmv4r.v v8, v24 +; RV32-NEXT: vwadd.vv v24, v0, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vwadd.vv v0, v8, v16 +; RV32-NEXT: vwadd.vv v24, v8, v16 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vmv.s.x v16, zero ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: 
vmv.x.s a0, v8 @@ -1613,26 +1613,26 @@ define i64 @vwreduce_add_v64i64(ptr %x) { ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vle32.v v16, (a1) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 -; RV64-NEXT: vslidedown.vi v0, v16, 16 +; RV64-NEXT: vslidedown.vi v0, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vmv4r.v v8, v0 -; RV64-NEXT: vwadd.vv v0, v24, v8 +; RV64-NEXT: vmv4r.v v8, v24 +; RV64-NEXT: vwadd.vv v24, v0, v8 ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vwadd.vv v0, v8, v16 +; RV64-NEXT: vwadd.vv v24, v8, v16 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vadd.vv v8, v24, v8 ; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 @@ -1664,26 +1664,26 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v8, 16 -; RV32-NEXT: vslidedown.vi v0, v16, 16 +; RV32-NEXT: vslidedown.vi v0, v8, 16 +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv4r.v v8, v0 -; RV32-NEXT: vwaddu.vv v0, v24, v8 +; RV32-NEXT: vmv4r.v v8, v24 +; RV32-NEXT: vwaddu.vv v24, v0, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vwaddu.vv v0, v8, v16 +; RV32-NEXT: vwaddu.vv v24, v8, v16 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vmv.s.x v16, zero ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 @@ -1712,26 +1712,26 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vle32.v v16, (a1) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 -; RV64-NEXT: vslidedown.vi v0, v16, 16 +; RV64-NEXT: vslidedown.vi v0, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vmv4r.v v8, v0 -; RV64-NEXT: vwaddu.vv v0, v24, v8 +; RV64-NEXT: vmv4r.v v8, v24 +; RV64-NEXT: vwaddu.vv v24, v0, v8 ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vwaddu.vv v0, v8, v16 +; RV64-NEXT: 
vwaddu.vv v24, v8, v16 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vadd.vv v8, v24, v8 ; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll index 3e0fb3009c6b19..d7a73c51d58b65 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll @@ -543,65 +543,54 @@ declare <32 x double> @llvm.vp.rint.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v25, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v2, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vslidedown.vi v1, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; 
CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll index 504982111d055b..8fec2779621882 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll @@ -763,69 +763,58 @@ declare <32 x double> @llvm.vp.round.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v25, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v2, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vslidedown.vi v1, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; 
CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll index 35480164d4a12d..d4480c53136c35 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll @@ -763,69 +763,58 @@ declare <32 x double> @llvm.vp.roundeven.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v25, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v2, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vslidedown.vi v1, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size 
Folded Reload +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll index 4928eba52ac8ca..995f693c92aee9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll @@ -763,69 +763,58 @@ declare <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v25, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v2, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vslidedown.vi v1, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 
0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll index 8a0d8e1791b0f6..b372a5f57f1d66 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll @@ -245,7 +245,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v3, v0, 8 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v26, v0, 4 +; CHECK-NEXT: vslidedown.vi v2, v0, 4 ; CHECK-NEXT: addi a2, a1, 512 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a2) @@ -264,13 +264,13 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: addi a2, a7, -64 ; CHECK-NEXT: sltu a3, a7, a2 ; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a4, a3, a2 -; CHECK-NEXT: addi a2, a4, -32 -; CHECK-NEXT: sltu a3, a4, a2 -; CHECK-NEXT: addi a3, 
a3, -1 ; CHECK-NEXT: and a3, a3, a2 -; CHECK-NEXT: addi a2, a3, -16 -; CHECK-NEXT: sltu a5, a3, a2 +; CHECK-NEXT: addi a2, a3, -32 +; CHECK-NEXT: sltu a4, a3, a2 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a4, a4, a2 +; CHECK-NEXT: addi a2, a4, -16 +; CHECK-NEXT: sltu a5, a4, a2 ; CHECK-NEXT: addi a5, a5, -1 ; CHECK-NEXT: and a2, a5, a2 ; CHECK-NEXT: vslidedown.vi v0, v27, 2 @@ -283,18 +283,16 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: addi a5, a1, 128 -; CHECK-NEXT: bltu a3, a2, .LBB16_2 +; CHECK-NEXT: bltu a4, a2, .LBB16_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: .LBB16_2: ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v28, v26, 2 +; CHECK-NEXT: vslidedown.vi v28, v2, 2 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a5) -; CHECK-NEXT: addi a5, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma -; CHECK-NEXT: li a3, 64 +; CHECK-NEXT: vle64.v v16, (a5) +; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, ma +; CHECK-NEXT: li a4, 64 ; CHECK-NEXT: vmv1r.v v0, v27 ; CHECK-NEXT: csrr a5, vlenb ; CHECK-NEXT: li a6, 48 @@ -302,19 +300,19 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: add a5, sp, a5 ; CHECK-NEXT: addi a5, a5, 16 ; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload -; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t +; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t ; CHECK-NEXT: csrr a5, vlenb ; CHECK-NEXT: li a6, 56 ; CHECK-NEXT: mul a5, a5, a6 ; CHECK-NEXT: add a5, sp, a5 ; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a7, a3, .LBB16_4 +; CHECK-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a7, a4, .LBB16_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a7, 64 ; CHECK-NEXT: .LBB16_4: ; CHECK-NEXT: addi a5, a1, 384 -; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: li a4, 32 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: csrr a6, vlenb @@ -333,8 +331,6 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: and t0, t1, t0 ; CHECK-NEXT: vsetvli zero, t0, e32, m4, ta, ma ; CHECK-NEXT: vmv1r.v v0, v28 -; CHECK-NEXT: addi t0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (t0) # Unknown-size Folded Reload ; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t ; CHECK-NEXT: csrr t0, vlenb ; CHECK-NEXT: slli t0, t0, 3 @@ -346,77 +342,81 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: li a6, 16 ; CHECK-NEXT: .LBB16_6: ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v20, v3, 2 +; CHECK-NEXT: vslidedown.vi v12, v3, 2 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a5) +; CHECK-NEXT: vle64.v v24, (a5) ; CHECK-NEXT: addi a1, a1, 256 ; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: csrr a5, vlenb ; CHECK-NEXT: li a6, 40 ; CHECK-NEXT: mul a5, a5, a6 ; CHECK-NEXT: add a5, sp, a5 ; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload -; CHECK-NEXT: vnsrl.wi v16, v24, 0, v0.t +; CHECK-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload +; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t ; CHECK-NEXT: csrr a5, vlenb ; CHECK-NEXT: 
li a6, 48 ; CHECK-NEXT: mul a5, a5, a6 ; CHECK-NEXT: add a5, sp, a5 ; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a4, a3, .LBB16_8 +; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a3, a4, .LBB16_8 ; CHECK-NEXT: # %bb.7: -; CHECK-NEXT: li a4, 32 +; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: .LBB16_8: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: addi a1, a4, -16 -; CHECK-NEXT: sltu a5, a4, a1 +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a3, -16 +; CHECK-NEXT: sltu a5, a3, a1 ; CHECK-NEXT: addi a5, a5, -1 ; CHECK-NEXT: and a1, a5, a1 ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v20 -; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t -; CHECK-NEXT: bltu a4, a2, .LBB16_10 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vnsrl.wi v16, v24, 0, v0.t +; CHECK-NEXT: bltu a3, a2, .LBB16_10 ; CHECK-NEXT: # %bb.9: -; CHECK-NEXT: li a4, 16 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: .LBB16_10: ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v2, v1, 2 -; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma ; CHECK-NEXT: vmv1r.v v0, v3 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 40 -; CHECK-NEXT: mul a1, a1, a4 +; CHECK-NEXT: li a3, 40 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a7, a3, .LBB16_12 +; CHECK-NEXT: bltu a7, a4, .LBB16_12 ; CHECK-NEXT: # %bb.11: ; CHECK-NEXT: li a7, 32 ; CHECK-NEXT: .LBB16_12: -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v24, v8 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmv4r.v v8, v24 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 56 -; CHECK-NEXT: mul a1, a1, a4 +; CHECK-NEXT: li a3, 56 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v24, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vslideup.vi v24, v8, 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 56 -; CHECK-NEXT: mul a1, a1, a4 +; CHECK-NEXT: li a3, 56 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 @@ -424,40 +424,40 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 48 -; CHECK-NEXT: mul a1, a1, a4 +; CHECK-NEXT: li a3, 48 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; 
CHECK-NEXT: vslideup.vi v8, v24, 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 48 -; CHECK-NEXT: mul a1, a1, a4 +; CHECK-NEXT: li a3, 48 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 40 -; CHECK-NEXT: mul a1, a1, a4 +; CHECK-NEXT: li a3, 40 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vslideup.vi v8, v16, 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 40 -; CHECK-NEXT: mul a1, a1, a4 +; CHECK-NEXT: li a3, 40 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a7, -16 -; CHECK-NEXT: sltu a4, a7, a1 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a1, a4, a1 +; CHECK-NEXT: sltu a3, a7, a1 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a1, a3, a1 ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 24 -; CHECK-NEXT: mul a1, a1, a4 +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -474,7 +474,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vnsrl.wi v16, v24, 0, v0.t -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma ; CHECK-NEXT: vslideup.vi v16, v8, 16 ; CHECK-NEXT: vse32.v v16, (a0) ; CHECK-NEXT: addi a1, a0, 256 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll index 32ef08101407d7..c0660fa5b5b43e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll @@ -991,12 +991,12 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a2) +; CHECK-NEXT: vle64.v v0, (a2) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vle64.v v0, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll index e05d6b1525eeeb..762d61962bdcca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll @@ -755,12 +755,12 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a2) +; CHECK-NEXT: vle64.v v0, (a2) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: 
vle64.v v0, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll index d05f580ea7d222..e698ffdf398eec 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll @@ -158,48 +158,38 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v9, v8 -; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vmv1r.v v2, v8 +; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: addi a0, a1, 128 -; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, a3, -128 ; CHECK-NEXT: sltu a4, a3, a0 ; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: vle8.v v0, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: and a0, a4, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 ; CHECK-NEXT: bltu a3, a2, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: .LBB11_2: ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -213,22 +203,27 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a2, a2, a3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; 
CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, a1, 128 ; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a0, a0, a3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill @@ -239,15 +234,21 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0 +; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -256,8 +257,7 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll index 6c4f523aa8d948..320f5da8eb5fad 100644 --- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll @@ -737,16 +737,14 @@ define @vp_floor_nxv16f64( %va, @vp_floor_nxv16f64( %va, @vfmax_nxv32f16_vv( %a, @vfmax_nxv32f16_vv( %a, @vfmin_nxv32f16_vv( %a, @vfmin_nxv32f16_vv( %a, @ustest_f32i32(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i32: ; CHECK-NOV: # %bb.0: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a1, fa0, rtz -; CHECK-NOV-NEXT: li a4, -1 -; CHECK-NOV-NEXT: srli a4, a4, 32 +; CHECK-NOV-NEXT: li a5, -1 +; CHECK-NOV-NEXT: srli a5, a5, 32 ; CHECK-NOV-NEXT: fcvt.l.s a2, fa1, rtz -; CHECK-NOV-NEXT: bge a1, a4, .LBB5_6 +; CHECK-NOV-NEXT: bge a1, a5, .LBB5_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a3, fa2, rtz -; CHECK-NOV-NEXT: bge a2, a4, .LBB5_7 +; CHECK-NOV-NEXT: bge a2, a5, .LBB5_7 ; CHECK-NOV-NEXT: .LBB5_2: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a5, fa3, rtz -; CHECK-NOV-NEXT: bge a3, a4, .LBB5_8 +; CHECK-NOV-NEXT: fcvt.l.s a4, fa3, rtz +; CHECK-NOV-NEXT: bge a3, a5, .LBB5_8 ; CHECK-NOV-NEXT: .LBB5_3: # %entry -; CHECK-NOV-NEXT: blt a5, a4, .LBB5_5 +; CHECK-NOV-NEXT: blt a4, a5, .LBB5_5 ; CHECK-NOV-NEXT: .LBB5_4: # %entry -; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: mv a4, a5 ; CHECK-NOV-NEXT: .LBB5_5: # %entry -; CHECK-NOV-NEXT: sgtz a4, a5 +; CHECK-NOV-NEXT: sgtz a5, a4 ; CHECK-NOV-NEXT: sgtz a6, a3 ; CHECK-NOV-NEXT: sgtz a7, a2 ; CHECK-NOV-NEXT: sgtz t0, a1 @@ -301,24 +301,24 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NOV-NEXT: and a2, a7, a2 ; CHECK-NOV-NEXT: negw a6, a6 ; CHECK-NOV-NEXT: and a3, a6, a3 -; CHECK-NOV-NEXT: 
negw a4, a4 -; CHECK-NOV-NEXT: and a4, a4, a5 +; CHECK-NOV-NEXT: negw a5, a5 +; CHECK-NOV-NEXT: and a4, a5, a4 ; CHECK-NOV-NEXT: sw a4, 12(a0) ; CHECK-NOV-NEXT: sw a3, 8(a0) ; CHECK-NOV-NEXT: sw a2, 4(a0) ; CHECK-NOV-NEXT: sw a1, 0(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB5_6: # %entry -; CHECK-NOV-NEXT: mv a1, a4 +; CHECK-NOV-NEXT: mv a1, a5 ; CHECK-NOV-NEXT: fcvt.l.s a3, fa2, rtz -; CHECK-NOV-NEXT: blt a2, a4, .LBB5_2 +; CHECK-NOV-NEXT: blt a2, a5, .LBB5_2 ; CHECK-NOV-NEXT: .LBB5_7: # %entry -; CHECK-NOV-NEXT: mv a2, a4 -; CHECK-NOV-NEXT: fcvt.l.s a5, fa3, rtz -; CHECK-NOV-NEXT: blt a3, a4, .LBB5_3 +; CHECK-NOV-NEXT: mv a2, a5 +; CHECK-NOV-NEXT: fcvt.l.s a4, fa3, rtz +; CHECK-NOV-NEXT: blt a3, a5, .LBB5_3 ; CHECK-NOV-NEXT: .LBB5_8: # %entry -; CHECK-NOV-NEXT: mv a3, a4 -; CHECK-NOV-NEXT: bge a5, a4, .LBB5_4 +; CHECK-NOV-NEXT: mv a3, a5 +; CHECK-NOV-NEXT: bge a4, a5, .LBB5_4 ; CHECK-NOV-NEXT: j .LBB5_5 ; ; CHECK-V-LABEL: ustest_f32i32: @@ -374,10 +374,10 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt @@ -386,10 +386,10 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: addiw a4, a1, -1 ; CHECK-NOV-NEXT: bge a0, a4, .LBB6_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz ; CHECK-NOV-NEXT: bge s1, a4, .LBB6_11 ; CHECK-NOV-NEXT: .LBB6_2: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs1, rtz ; CHECK-NOV-NEXT: bge a2, a4, .LBB6_12 ; CHECK-NOV-NEXT: .LBB6_3: # %entry ; CHECK-NOV-NEXT: bge a3, a4, .LBB6_13 @@ -420,11 +420,11 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB6_10: # %entry ; CHECK-NOV-NEXT: mv a0, a4 -; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz ; CHECK-NOV-NEXT: blt s1, a4, .LBB6_2 ; CHECK-NOV-NEXT: .LBB6_11: # %entry ; CHECK-NOV-NEXT: mv s1, a4 -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs1, rtz ; CHECK-NOV-NEXT: blt a2, a4, .LBB6_3 ; CHECK-NOV-NEXT: .LBB6_12: # %entry ; CHECK-NOV-NEXT: mv a2, a4 @@ -561,10 +561,10 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.lu.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt @@ -573,10 +573,10 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: srli a1, a1, 32 ; CHECK-NOV-NEXT: bgeu a0, a1, .LBB7_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a2, fs1, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a2, fs0, rtz ; CHECK-NOV-NEXT: bgeu s1, a1, .LBB7_7 ; CHECK-NOV-NEXT: .LBB7_2: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a3, fs1, rtz ; CHECK-NOV-NEXT: bgeu a2, a1, .LBB7_8 ; CHECK-NOV-NEXT: .LBB7_3: # %entry ; CHECK-NOV-NEXT: bltu a3, 
a1, .LBB7_5 @@ -599,11 +599,11 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB7_6: # %entry ; CHECK-NOV-NEXT: mv a0, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a2, fs1, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a2, fs0, rtz ; CHECK-NOV-NEXT: bltu s1, a1, .LBB7_2 ; CHECK-NOV-NEXT: .LBB7_7: # %entry ; CHECK-NOV-NEXT: mv s1, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a3, fs1, rtz ; CHECK-NOV-NEXT: bltu a2, a1, .LBB7_3 ; CHECK-NOV-NEXT: .LBB7_8: # %entry ; CHECK-NOV-NEXT: mv a2, a1 @@ -725,29 +725,29 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NOV-NEXT: li a2, -1 -; CHECK-NOV-NEXT: srli a2, a2, 32 -; CHECK-NOV-NEXT: bge a0, a2, .LBB8_6 +; CHECK-NOV-NEXT: li a3, -1 +; CHECK-NOV-NEXT: srli a3, a3, 32 +; CHECK-NOV-NEXT: bge a0, a3, .LBB8_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz -; CHECK-NOV-NEXT: bge s1, a2, .LBB8_7 +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz +; CHECK-NOV-NEXT: bge s1, a3, .LBB8_7 ; CHECK-NOV-NEXT: .LBB8_2: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz -; CHECK-NOV-NEXT: bge a1, a2, .LBB8_8 +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz +; CHECK-NOV-NEXT: bge a1, a3, .LBB8_8 ; CHECK-NOV-NEXT: .LBB8_3: # %entry -; CHECK-NOV-NEXT: blt a3, a2, .LBB8_5 +; CHECK-NOV-NEXT: blt a2, a3, .LBB8_5 ; CHECK-NOV-NEXT: .LBB8_4: # %entry -; CHECK-NOV-NEXT: mv a3, a2 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: .LBB8_5: # %entry -; CHECK-NOV-NEXT: sgtz a2, a3 +; CHECK-NOV-NEXT: sgtz a3, a2 ; CHECK-NOV-NEXT: sgtz a4, a1 ; CHECK-NOV-NEXT: sgtz a5, s1 ; CHECK-NOV-NEXT: sgtz a6, a0 @@ -757,8 +757,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: and a5, a5, s1 ; CHECK-NOV-NEXT: negw a4, a4 ; CHECK-NOV-NEXT: and a1, a4, a1 -; CHECK-NOV-NEXT: negw a2, a2 -; CHECK-NOV-NEXT: and a2, a2, a3 +; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: and a2, a3, a2 ; CHECK-NOV-NEXT: sw a2, 12(s0) ; CHECK-NOV-NEXT: sw a1, 8(s0) ; CHECK-NOV-NEXT: sw a5, 4(s0) @@ -774,16 +774,16 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: addi sp, sp, 64 ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB8_6: # %entry -; CHECK-NOV-NEXT: mv a0, a2 -; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz -; CHECK-NOV-NEXT: blt s1, a2, .LBB8_2 +; CHECK-NOV-NEXT: mv a0, a3 +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz +; CHECK-NOV-NEXT: blt s1, a3, .LBB8_2 ; CHECK-NOV-NEXT: .LBB8_7: # %entry -; CHECK-NOV-NEXT: mv s1, a2 -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz -; CHECK-NOV-NEXT: blt a1, a2, .LBB8_3 +; CHECK-NOV-NEXT: mv s1, a3 +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz +; CHECK-NOV-NEXT: blt a1, a3, .LBB8_3 ; CHECK-NOV-NEXT: .LBB8_8: # %entry -; CHECK-NOV-NEXT: mv a1, a2 -; CHECK-NOV-NEXT: bge a3, a2, .LBB8_4 +; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: bge a2, a3, .LBB8_4 ; CHECK-NOV-NEXT: j .LBB8_5 ; ; CHECK-V-LABEL: ustest_f16i32: @@ -1149,22 +1149,22 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i16: ; CHECK-NOV: # %bb.0: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a1, fa0, rtz -; CHECK-NOV-NEXT: lui a4, 16 -; 
CHECK-NOV-NEXT: addiw a4, a4, -1 +; CHECK-NOV-NEXT: lui a5, 16 +; CHECK-NOV-NEXT: addiw a5, a5, -1 ; CHECK-NOV-NEXT: fcvt.w.s a2, fa1, rtz -; CHECK-NOV-NEXT: bge a1, a4, .LBB14_6 +; CHECK-NOV-NEXT: bge a1, a5, .LBB14_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a3, fa2, rtz -; CHECK-NOV-NEXT: bge a2, a4, .LBB14_7 +; CHECK-NOV-NEXT: bge a2, a5, .LBB14_7 ; CHECK-NOV-NEXT: .LBB14_2: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a5, fa3, rtz -; CHECK-NOV-NEXT: bge a3, a4, .LBB14_8 +; CHECK-NOV-NEXT: fcvt.w.s a4, fa3, rtz +; CHECK-NOV-NEXT: bge a3, a5, .LBB14_8 ; CHECK-NOV-NEXT: .LBB14_3: # %entry -; CHECK-NOV-NEXT: blt a5, a4, .LBB14_5 +; CHECK-NOV-NEXT: blt a4, a5, .LBB14_5 ; CHECK-NOV-NEXT: .LBB14_4: # %entry -; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: mv a4, a5 ; CHECK-NOV-NEXT: .LBB14_5: # %entry -; CHECK-NOV-NEXT: sgtz a4, a5 +; CHECK-NOV-NEXT: sgtz a5, a4 ; CHECK-NOV-NEXT: sgtz a6, a3 ; CHECK-NOV-NEXT: sgtz a7, a2 ; CHECK-NOV-NEXT: sgtz t0, a1 @@ -1174,24 +1174,24 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-NOV-NEXT: and a2, a7, a2 ; CHECK-NOV-NEXT: negw a6, a6 ; CHECK-NOV-NEXT: and a3, a6, a3 -; CHECK-NOV-NEXT: negw a4, a4 -; CHECK-NOV-NEXT: and a4, a4, a5 +; CHECK-NOV-NEXT: negw a5, a5 +; CHECK-NOV-NEXT: and a4, a5, a4 ; CHECK-NOV-NEXT: sh a4, 6(a0) ; CHECK-NOV-NEXT: sh a3, 4(a0) ; CHECK-NOV-NEXT: sh a2, 2(a0) ; CHECK-NOV-NEXT: sh a1, 0(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB14_6: # %entry -; CHECK-NOV-NEXT: mv a1, a4 +; CHECK-NOV-NEXT: mv a1, a5 ; CHECK-NOV-NEXT: fcvt.w.s a3, fa2, rtz -; CHECK-NOV-NEXT: blt a2, a4, .LBB14_2 +; CHECK-NOV-NEXT: blt a2, a5, .LBB14_2 ; CHECK-NOV-NEXT: .LBB14_7: # %entry -; CHECK-NOV-NEXT: mv a2, a4 -; CHECK-NOV-NEXT: fcvt.w.s a5, fa3, rtz -; CHECK-NOV-NEXT: blt a3, a4, .LBB14_3 +; CHECK-NOV-NEXT: mv a2, a5 +; CHECK-NOV-NEXT: fcvt.w.s a4, fa3, rtz +; CHECK-NOV-NEXT: blt a3, a5, .LBB14_3 ; CHECK-NOV-NEXT: .LBB14_8: # %entry -; CHECK-NOV-NEXT: mv a3, a4 -; CHECK-NOV-NEXT: bge a5, a4, .LBB14_4 +; CHECK-NOV-NEXT: mv a3, a5 +; CHECK-NOV-NEXT: bge a4, a5, .LBB14_4 ; CHECK-NOV-NEXT: j .LBB14_5 ; ; CHECK-V-LABEL: ustest_f32i16: @@ -1266,22 +1266,22 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs5, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs4, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs3, fa0 +; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs2, fa0 +; CHECK-NOV-NEXT: fmv.s fs3, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs4, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs5, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt @@ -1290,22 +1290,22 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: addiw a7, a7, -1 ; CHECK-NOV-NEXT: bge a0, a7, .LBB15_18 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz ; CHECK-NOV-NEXT: bge s1, a7, .LBB15_19 ; CHECK-NOV-NEXT: .LBB15_2: # %entry -; 
CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz ; CHECK-NOV-NEXT: bge a1, a7, .LBB15_20 ; CHECK-NOV-NEXT: .LBB15_3: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs2, rtz ; CHECK-NOV-NEXT: bge a2, a7, .LBB15_21 ; CHECK-NOV-NEXT: .LBB15_4: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz +; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz ; CHECK-NOV-NEXT: bge a3, a7, .LBB15_22 ; CHECK-NOV-NEXT: .LBB15_5: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a5, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a5, fs4, rtz ; CHECK-NOV-NEXT: bge a4, a7, .LBB15_23 ; CHECK-NOV-NEXT: .LBB15_6: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a6, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a6, fs5, rtz ; CHECK-NOV-NEXT: bge a5, a7, .LBB15_24 ; CHECK-NOV-NEXT: .LBB15_7: # %entry ; CHECK-NOV-NEXT: bge a6, a7, .LBB15_25 @@ -1357,27 +1357,27 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB15_18: # %entry ; CHECK-NOV-NEXT: mv a0, a7 -; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz ; CHECK-NOV-NEXT: blt s1, a7, .LBB15_2 ; CHECK-NOV-NEXT: .LBB15_19: # %entry ; CHECK-NOV-NEXT: mv s1, a7 -; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz ; CHECK-NOV-NEXT: blt a1, a7, .LBB15_3 ; CHECK-NOV-NEXT: .LBB15_20: # %entry ; CHECK-NOV-NEXT: mv a1, a7 -; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs2, rtz ; CHECK-NOV-NEXT: blt a2, a7, .LBB15_4 ; CHECK-NOV-NEXT: .LBB15_21: # %entry ; CHECK-NOV-NEXT: mv a2, a7 -; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz +; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz ; CHECK-NOV-NEXT: blt a3, a7, .LBB15_5 ; CHECK-NOV-NEXT: .LBB15_22: # %entry ; CHECK-NOV-NEXT: mv a3, a7 -; CHECK-NOV-NEXT: fcvt.l.s a5, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a5, fs4, rtz ; CHECK-NOV-NEXT: blt a4, a7, .LBB15_6 ; CHECK-NOV-NEXT: .LBB15_23: # %entry ; CHECK-NOV-NEXT: mv a4, a7 -; CHECK-NOV-NEXT: fcvt.l.s a6, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a6, fs5, rtz ; CHECK-NOV-NEXT: blt a5, a7, .LBB15_7 ; CHECK-NOV-NEXT: .LBB15_24: # %entry ; CHECK-NOV-NEXT: mv a5, a7 @@ -1591,58 +1591,58 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs5, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs4, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs3, fa0 +; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs2, fa0 +; CHECK-NOV-NEXT: fmv.s fs3, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs4, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs5, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.lu.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-NOV-NEXT: lui a1, 16 -; CHECK-NOV-NEXT: addiw a1, a1, -1 -; CHECK-NOV-NEXT: bgeu a0, a1, .LBB16_10 +; CHECK-NOV-NEXT: lui a2, 16 +; CHECK-NOV-NEXT: addiw a2, a2, -1 +; CHECK-NOV-NEXT: bgeu a0, a2, .LBB16_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a2, fs5, rtz -; CHECK-NOV-NEXT: bgeu 
s1, a1, .LBB16_11 +; CHECK-NOV-NEXT: fcvt.lu.s a1, fs0, rtz +; CHECK-NOV-NEXT: bgeu s1, a2, .LBB16_11 ; CHECK-NOV-NEXT: .LBB16_2: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a3, fs4, rtz -; CHECK-NOV-NEXT: bgeu a2, a1, .LBB16_12 +; CHECK-NOV-NEXT: fcvt.lu.s a3, fs1, rtz +; CHECK-NOV-NEXT: bgeu a1, a2, .LBB16_12 ; CHECK-NOV-NEXT: .LBB16_3: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a4, fs3, rtz -; CHECK-NOV-NEXT: bgeu a3, a1, .LBB16_13 +; CHECK-NOV-NEXT: fcvt.lu.s a4, fs2, rtz +; CHECK-NOV-NEXT: bgeu a3, a2, .LBB16_13 ; CHECK-NOV-NEXT: .LBB16_4: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a5, fs2, rtz -; CHECK-NOV-NEXT: bgeu a4, a1, .LBB16_14 +; CHECK-NOV-NEXT: fcvt.lu.s a5, fs3, rtz +; CHECK-NOV-NEXT: bgeu a4, a2, .LBB16_14 ; CHECK-NOV-NEXT: .LBB16_5: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a6, fs1, rtz -; CHECK-NOV-NEXT: bgeu a5, a1, .LBB16_15 +; CHECK-NOV-NEXT: fcvt.lu.s a6, fs4, rtz +; CHECK-NOV-NEXT: bgeu a5, a2, .LBB16_15 ; CHECK-NOV-NEXT: .LBB16_6: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a7, fs0, rtz -; CHECK-NOV-NEXT: bgeu a6, a1, .LBB16_16 +; CHECK-NOV-NEXT: fcvt.lu.s a7, fs5, rtz +; CHECK-NOV-NEXT: bgeu a6, a2, .LBB16_16 ; CHECK-NOV-NEXT: .LBB16_7: # %entry -; CHECK-NOV-NEXT: bltu a7, a1, .LBB16_9 +; CHECK-NOV-NEXT: bltu a7, a2, .LBB16_9 ; CHECK-NOV-NEXT: .LBB16_8: # %entry -; CHECK-NOV-NEXT: mv a7, a1 +; CHECK-NOV-NEXT: mv a7, a2 ; CHECK-NOV-NEXT: .LBB16_9: # %entry ; CHECK-NOV-NEXT: sh a7, 14(s0) ; CHECK-NOV-NEXT: sh a6, 12(s0) ; CHECK-NOV-NEXT: sh a5, 10(s0) ; CHECK-NOV-NEXT: sh a4, 8(s0) ; CHECK-NOV-NEXT: sh a3, 6(s0) -; CHECK-NOV-NEXT: sh a2, 4(s0) +; CHECK-NOV-NEXT: sh a1, 4(s0) ; CHECK-NOV-NEXT: sh s1, 2(s0) ; CHECK-NOV-NEXT: sh a0, 0(s0) ; CHECK-NOV-NEXT: ld ra, 120(sp) # 8-byte Folded Reload @@ -1664,32 +1664,32 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: addi sp, sp, 128 ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB16_10: # %entry -; CHECK-NOV-NEXT: mv a0, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a2, fs5, rtz -; CHECK-NOV-NEXT: bltu s1, a1, .LBB16_2 +; CHECK-NOV-NEXT: mv a0, a2 +; CHECK-NOV-NEXT: fcvt.lu.s a1, fs0, rtz +; CHECK-NOV-NEXT: bltu s1, a2, .LBB16_2 ; CHECK-NOV-NEXT: .LBB16_11: # %entry -; CHECK-NOV-NEXT: mv s1, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a3, fs4, rtz -; CHECK-NOV-NEXT: bltu a2, a1, .LBB16_3 +; CHECK-NOV-NEXT: mv s1, a2 +; CHECK-NOV-NEXT: fcvt.lu.s a3, fs1, rtz +; CHECK-NOV-NEXT: bltu a1, a2, .LBB16_3 ; CHECK-NOV-NEXT: .LBB16_12: # %entry -; CHECK-NOV-NEXT: mv a2, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a4, fs3, rtz -; CHECK-NOV-NEXT: bltu a3, a1, .LBB16_4 +; CHECK-NOV-NEXT: mv a1, a2 +; CHECK-NOV-NEXT: fcvt.lu.s a4, fs2, rtz +; CHECK-NOV-NEXT: bltu a3, a2, .LBB16_4 ; CHECK-NOV-NEXT: .LBB16_13: # %entry -; CHECK-NOV-NEXT: mv a3, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a5, fs2, rtz -; CHECK-NOV-NEXT: bltu a4, a1, .LBB16_5 +; CHECK-NOV-NEXT: mv a3, a2 +; CHECK-NOV-NEXT: fcvt.lu.s a5, fs3, rtz +; CHECK-NOV-NEXT: bltu a4, a2, .LBB16_5 ; CHECK-NOV-NEXT: .LBB16_14: # %entry -; CHECK-NOV-NEXT: mv a4, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a6, fs1, rtz -; CHECK-NOV-NEXT: bltu a5, a1, .LBB16_6 +; CHECK-NOV-NEXT: mv a4, a2 +; CHECK-NOV-NEXT: fcvt.lu.s a6, fs4, rtz +; CHECK-NOV-NEXT: bltu a5, a2, .LBB16_6 ; CHECK-NOV-NEXT: .LBB16_15: # %entry -; CHECK-NOV-NEXT: mv a5, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a7, fs0, rtz -; CHECK-NOV-NEXT: bltu a6, a1, .LBB16_7 +; CHECK-NOV-NEXT: mv a5, a2 +; CHECK-NOV-NEXT: fcvt.lu.s a7, fs5, rtz +; CHECK-NOV-NEXT: bltu a6, a2, .LBB16_7 ; CHECK-NOV-NEXT: .LBB16_16: # %entry -; CHECK-NOV-NEXT: mv a6, a1 -; CHECK-NOV-NEXT: bgeu a7, a1, .LBB16_8 +; CHECK-NOV-NEXT: mv a6, 
a2 +; CHECK-NOV-NEXT: bgeu a7, a2, .LBB16_8 ; CHECK-NOV-NEXT: j .LBB16_9 ; ; CHECK-V-LABEL: utesth_f16i16: @@ -1870,56 +1870,56 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs5, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs4, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs3, fa0 +; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs2, fa0 +; CHECK-NOV-NEXT: fmv.s fs3, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs4, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs5, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NOV-NEXT: lui a3, 16 -; CHECK-NOV-NEXT: addiw a3, a3, -1 -; CHECK-NOV-NEXT: bge a0, a3, .LBB17_10 +; CHECK-NOV-NEXT: lui a4, 16 +; CHECK-NOV-NEXT: addiw a4, a4, -1 +; CHECK-NOV-NEXT: bge a0, a4, .LBB17_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NOV-NEXT: bge s1, a3, .LBB17_11 +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz +; CHECK-NOV-NEXT: bge s1, a4, .LBB17_11 ; CHECK-NOV-NEXT: .LBB17_2: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NOV-NEXT: bge a1, a3, .LBB17_12 +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz +; CHECK-NOV-NEXT: bge a1, a4, .LBB17_12 ; CHECK-NOV-NEXT: .LBB17_3: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz -; CHECK-NOV-NEXT: bge a2, a3, .LBB17_13 +; CHECK-NOV-NEXT: fcvt.l.s a3, fs2, rtz +; CHECK-NOV-NEXT: bge a2, a4, .LBB17_13 ; CHECK-NOV-NEXT: .LBB17_4: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz -; CHECK-NOV-NEXT: bge a4, a3, .LBB17_14 +; CHECK-NOV-NEXT: fcvt.l.s a5, fs3, rtz +; CHECK-NOV-NEXT: bge a3, a4, .LBB17_14 ; CHECK-NOV-NEXT: .LBB17_5: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz -; CHECK-NOV-NEXT: bge a5, a3, .LBB17_15 +; CHECK-NOV-NEXT: fcvt.l.s a6, fs4, rtz +; CHECK-NOV-NEXT: bge a5, a4, .LBB17_15 ; CHECK-NOV-NEXT: .LBB17_6: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz -; CHECK-NOV-NEXT: bge a6, a3, .LBB17_16 +; CHECK-NOV-NEXT: fcvt.l.s a7, fs5, rtz +; CHECK-NOV-NEXT: bge a6, a4, .LBB17_16 ; CHECK-NOV-NEXT: .LBB17_7: # %entry -; CHECK-NOV-NEXT: blt a7, a3, .LBB17_9 +; CHECK-NOV-NEXT: blt a7, a4, .LBB17_9 ; CHECK-NOV-NEXT: .LBB17_8: # %entry -; CHECK-NOV-NEXT: mv a7, a3 +; CHECK-NOV-NEXT: mv a7, a4 ; CHECK-NOV-NEXT: .LBB17_9: # %entry -; CHECK-NOV-NEXT: sgtz a3, a7 +; CHECK-NOV-NEXT: sgtz a4, a7 ; CHECK-NOV-NEXT: sgtz t0, a6 ; CHECK-NOV-NEXT: sgtz t1, a5 -; CHECK-NOV-NEXT: sgtz t2, a4 +; CHECK-NOV-NEXT: sgtz t2, a3 ; CHECK-NOV-NEXT: sgtz t3, a2 ; CHECK-NOV-NEXT: sgtz t4, a1 ; CHECK-NOV-NEXT: sgtz t5, s1 @@ -1933,17 +1933,17 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: negw t3, t3 ; CHECK-NOV-NEXT: and a2, t3, a2 ; CHECK-NOV-NEXT: negw t2, t2 -; CHECK-NOV-NEXT: and a4, t2, a4 +; CHECK-NOV-NEXT: and a3, t2, a3 ; CHECK-NOV-NEXT: negw t1, t1 ; CHECK-NOV-NEXT: and a5, t1, a5 ; CHECK-NOV-NEXT: negw t0, t0 ; CHECK-NOV-NEXT: and a6, t0, a6 -; CHECK-NOV-NEXT: negw 
a3, a3 -; CHECK-NOV-NEXT: and a3, a3, a7 -; CHECK-NOV-NEXT: sh a3, 14(s0) +; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: and a4, a4, a7 +; CHECK-NOV-NEXT: sh a4, 14(s0) ; CHECK-NOV-NEXT: sh a6, 12(s0) ; CHECK-NOV-NEXT: sh a5, 10(s0) -; CHECK-NOV-NEXT: sh a4, 8(s0) +; CHECK-NOV-NEXT: sh a3, 8(s0) ; CHECK-NOV-NEXT: sh a2, 6(s0) ; CHECK-NOV-NEXT: sh a1, 4(s0) ; CHECK-NOV-NEXT: sh t5, 2(s0) @@ -1967,32 +1967,32 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: addi sp, sp, 128 ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB17_10: # %entry -; CHECK-NOV-NEXT: mv a0, a3 -; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NOV-NEXT: blt s1, a3, .LBB17_2 +; CHECK-NOV-NEXT: mv a0, a4 +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz +; CHECK-NOV-NEXT: blt s1, a4, .LBB17_2 ; CHECK-NOV-NEXT: .LBB17_11: # %entry -; CHECK-NOV-NEXT: mv s1, a3 -; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NOV-NEXT: blt a1, a3, .LBB17_3 +; CHECK-NOV-NEXT: mv s1, a4 +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz +; CHECK-NOV-NEXT: blt a1, a4, .LBB17_3 ; CHECK-NOV-NEXT: .LBB17_12: # %entry -; CHECK-NOV-NEXT: mv a1, a3 -; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz -; CHECK-NOV-NEXT: blt a2, a3, .LBB17_4 +; CHECK-NOV-NEXT: mv a1, a4 +; CHECK-NOV-NEXT: fcvt.l.s a3, fs2, rtz +; CHECK-NOV-NEXT: blt a2, a4, .LBB17_4 ; CHECK-NOV-NEXT: .LBB17_13: # %entry -; CHECK-NOV-NEXT: mv a2, a3 -; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz -; CHECK-NOV-NEXT: blt a4, a3, .LBB17_5 +; CHECK-NOV-NEXT: mv a2, a4 +; CHECK-NOV-NEXT: fcvt.l.s a5, fs3, rtz +; CHECK-NOV-NEXT: blt a3, a4, .LBB17_5 ; CHECK-NOV-NEXT: .LBB17_14: # %entry -; CHECK-NOV-NEXT: mv a4, a3 -; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz -; CHECK-NOV-NEXT: blt a5, a3, .LBB17_6 +; CHECK-NOV-NEXT: mv a3, a4 +; CHECK-NOV-NEXT: fcvt.l.s a6, fs4, rtz +; CHECK-NOV-NEXT: blt a5, a4, .LBB17_6 ; CHECK-NOV-NEXT: .LBB17_15: # %entry -; CHECK-NOV-NEXT: mv a5, a3 -; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz -; CHECK-NOV-NEXT: blt a6, a3, .LBB17_7 +; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: fcvt.l.s a7, fs5, rtz +; CHECK-NOV-NEXT: blt a6, a4, .LBB17_7 ; CHECK-NOV-NEXT: .LBB17_16: # %entry -; CHECK-NOV-NEXT: mv a6, a3 -; CHECK-NOV-NEXT: bge a7, a3, .LBB17_8 +; CHECK-NOV-NEXT: mv a6, a4 +; CHECK-NOV-NEXT: bge a7, a4, .LBB17_8 ; CHECK-NOV-NEXT: j .LBB17_9 ; ; CHECK-V-LABEL: ustest_f16i16: @@ -3695,10 +3695,10 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt @@ -3707,10 +3707,10 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: addiw a4, a1, -1 ; CHECK-NOV-NEXT: bge a0, a4, .LBB33_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz ; CHECK-NOV-NEXT: bge s1, a4, .LBB33_11 ; CHECK-NOV-NEXT: .LBB33_2: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs1, rtz ; CHECK-NOV-NEXT: bge a2, a4, .LBB33_12 ; CHECK-NOV-NEXT: .LBB33_3: # %entry ; CHECK-NOV-NEXT: bge a3, a4, .LBB33_13 @@ -3741,11 +3741,11 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB33_10: # %entry ; CHECK-NOV-NEXT: mv a0, a4 -; CHECK-NOV-NEXT: fcvt.l.s 
a2, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz ; CHECK-NOV-NEXT: blt s1, a4, .LBB33_2 ; CHECK-NOV-NEXT: .LBB33_11: # %entry ; CHECK-NOV-NEXT: mv s1, a4 -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs1, rtz ; CHECK-NOV-NEXT: blt a2, a4, .LBB33_3 ; CHECK-NOV-NEXT: .LBB33_12: # %entry ; CHECK-NOV-NEXT: mv a2, a4 @@ -3880,10 +3880,10 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.lu.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt @@ -3892,10 +3892,10 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: srli a1, a1, 32 ; CHECK-NOV-NEXT: bgeu a0, a1, .LBB34_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a2, fs1, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a2, fs0, rtz ; CHECK-NOV-NEXT: bgeu s1, a1, .LBB34_7 ; CHECK-NOV-NEXT: .LBB34_2: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a3, fs1, rtz ; CHECK-NOV-NEXT: bgeu a2, a1, .LBB34_8 ; CHECK-NOV-NEXT: .LBB34_3: # %entry ; CHECK-NOV-NEXT: bltu a3, a1, .LBB34_5 @@ -3918,11 +3918,11 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB34_6: # %entry ; CHECK-NOV-NEXT: mv a0, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a2, fs1, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a2, fs0, rtz ; CHECK-NOV-NEXT: bltu s1, a1, .LBB34_2 ; CHECK-NOV-NEXT: .LBB34_7: # %entry ; CHECK-NOV-NEXT: mv s1, a1 -; CHECK-NOV-NEXT: fcvt.lu.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a3, fs1, rtz ; CHECK-NOV-NEXT: bltu a2, a1, .LBB34_3 ; CHECK-NOV-NEXT: .LBB34_8: # %entry ; CHECK-NOV-NEXT: mv a2, a1 @@ -4043,10 +4043,10 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt @@ -4055,10 +4055,10 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: srli a2, a2, 32 ; CHECK-NOV-NEXT: bge a0, a2, .LBB35_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz ; CHECK-NOV-NEXT: bge s1, a2, .LBB35_7 ; CHECK-NOV-NEXT: .LBB35_2: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs1, rtz ; CHECK-NOV-NEXT: bge a1, a2, .LBB35_8 ; CHECK-NOV-NEXT: .LBB35_3: # %entry ; CHECK-NOV-NEXT: blt a3, a2, .LBB35_5 @@ -4093,11 +4093,11 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB35_6: # %entry ; CHECK-NOV-NEXT: mv a0, a2 -; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz ; CHECK-NOV-NEXT: blt s1, a2, .LBB35_2 ; CHECK-NOV-NEXT: .LBB35_7: # %entry ; CHECK-NOV-NEXT: mv s1, a2 -; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs1, rtz ; CHECK-NOV-NEXT: blt a1, a2, .LBB35_3 ; CHECK-NOV-NEXT: .LBB35_8: # %entry ; CHECK-NOV-NEXT: mv a1, a2 @@ -4572,22 +4572,22 @@ 
define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs5, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs4, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs3, fa0 +; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs2, fa0 +; CHECK-NOV-NEXT: fmv.s fs3, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs4, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs5, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt @@ -4596,22 +4596,22 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: addiw a7, a7, -1 ; CHECK-NOV-NEXT: bge a0, a7, .LBB42_18 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz ; CHECK-NOV-NEXT: bge s1, a7, .LBB42_19 ; CHECK-NOV-NEXT: .LBB42_2: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz ; CHECK-NOV-NEXT: bge a1, a7, .LBB42_20 ; CHECK-NOV-NEXT: .LBB42_3: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs2, rtz ; CHECK-NOV-NEXT: bge a2, a7, .LBB42_21 ; CHECK-NOV-NEXT: .LBB42_4: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz +; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz ; CHECK-NOV-NEXT: bge a3, a7, .LBB42_22 ; CHECK-NOV-NEXT: .LBB42_5: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a5, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a5, fs4, rtz ; CHECK-NOV-NEXT: bge a4, a7, .LBB42_23 ; CHECK-NOV-NEXT: .LBB42_6: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a6, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a6, fs5, rtz ; CHECK-NOV-NEXT: bge a5, a7, .LBB42_24 ; CHECK-NOV-NEXT: .LBB42_7: # %entry ; CHECK-NOV-NEXT: bge a6, a7, .LBB42_25 @@ -4663,27 +4663,27 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB42_18: # %entry ; CHECK-NOV-NEXT: mv a0, a7 -; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz ; CHECK-NOV-NEXT: blt s1, a7, .LBB42_2 ; CHECK-NOV-NEXT: .LBB42_19: # %entry ; CHECK-NOV-NEXT: mv s1, a7 -; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz ; CHECK-NOV-NEXT: blt a1, a7, .LBB42_3 ; CHECK-NOV-NEXT: .LBB42_20: # %entry ; CHECK-NOV-NEXT: mv a1, a7 -; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz +; CHECK-NOV-NEXT: fcvt.l.s a3, fs2, rtz ; CHECK-NOV-NEXT: blt a2, a7, .LBB42_4 ; CHECK-NOV-NEXT: .LBB42_21: # %entry ; CHECK-NOV-NEXT: mv a2, a7 -; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz +; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz ; CHECK-NOV-NEXT: blt a3, a7, .LBB42_5 ; CHECK-NOV-NEXT: .LBB42_22: # %entry ; CHECK-NOV-NEXT: mv a3, a7 -; CHECK-NOV-NEXT: fcvt.l.s a5, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a5, fs4, rtz ; CHECK-NOV-NEXT: blt a4, a7, .LBB42_6 ; CHECK-NOV-NEXT: .LBB42_23: # %entry ; CHECK-NOV-NEXT: mv a4, a7 -; CHECK-NOV-NEXT: fcvt.l.s a6, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a6, fs5, rtz ; CHECK-NOV-NEXT: blt a5, a7, .LBB42_7 ; CHECK-NOV-NEXT: .LBB42_24: # %entry ; CHECK-NOV-NEXT: mv a5, a7 @@ -4898,19 +4898,19 @@ define 
<8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs4, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs3, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs3, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs4, fa0 ; CHECK-NOV-NEXT: fcvt.lu.s s2, fs6, rtz ; CHECK-NOV-NEXT: fcvt.lu.s a0, fs5, rtz ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 @@ -4924,13 +4924,13 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: mv a0, a1 ; CHECK-NOV-NEXT: .LBB43_2: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a3, fs4, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a3, fs0, rtz ; CHECK-NOV-NEXT: sext.w a2, s2 ; CHECK-NOV-NEXT: bltu s1, a1, .LBB43_4 ; CHECK-NOV-NEXT: # %bb.3: # %entry ; CHECK-NOV-NEXT: mv s1, a1 ; CHECK-NOV-NEXT: .LBB43_4: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a4, fs3, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a4, fs1, rtz ; CHECK-NOV-NEXT: sext.w a3, a3 ; CHECK-NOV-NEXT: bltu a2, a1, .LBB43_6 ; CHECK-NOV-NEXT: # %bb.5: # %entry @@ -4942,13 +4942,13 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: # %bb.7: # %entry ; CHECK-NOV-NEXT: mv a3, a1 ; CHECK-NOV-NEXT: .LBB43_8: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a6, fs1, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a6, fs3, rtz ; CHECK-NOV-NEXT: sext.w a5, a5 ; CHECK-NOV-NEXT: bltu a4, a1, .LBB43_10 ; CHECK-NOV-NEXT: # %bb.9: # %entry ; CHECK-NOV-NEXT: mv a4, a1 ; CHECK-NOV-NEXT: .LBB43_10: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a7, fs0, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a7, fs4, rtz ; CHECK-NOV-NEXT: sext.w a6, a6 ; CHECK-NOV-NEXT: bgeu a5, a1, .LBB43_15 ; CHECK-NOV-NEXT: # %bb.11: # %entry @@ -5171,22 +5171,22 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs5, fa0 +; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs4, fa0 +; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs3, fa0 +; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs2, fa0 +; CHECK-NOV-NEXT: fmv.s fs3, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs1, fa0 +; CHECK-NOV-NEXT: fmv.s fs4, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt -; CHECK-NOV-NEXT: fmv.s fs0, fa0 +; CHECK-NOV-NEXT: fmv.s fs5, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt @@ -5195,22 +5195,22 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: addiw a3, a3, -1 ; CHECK-NOV-NEXT: bge a0, a3, .LBB44_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz ; CHECK-NOV-NEXT: bge s1, a3, .LBB44_11 ; CHECK-NOV-NEXT: .LBB44_2: # 
%entry -; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz ; CHECK-NOV-NEXT: bge a1, a3, .LBB44_12 ; CHECK-NOV-NEXT: .LBB44_3: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz +; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz ; CHECK-NOV-NEXT: bge a2, a3, .LBB44_13 ; CHECK-NOV-NEXT: .LBB44_4: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz +; CHECK-NOV-NEXT: fcvt.l.s a5, fs3, rtz ; CHECK-NOV-NEXT: bge a4, a3, .LBB44_14 ; CHECK-NOV-NEXT: .LBB44_5: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a6, fs4, rtz ; CHECK-NOV-NEXT: bge a5, a3, .LBB44_15 ; CHECK-NOV-NEXT: .LBB44_6: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a7, fs5, rtz ; CHECK-NOV-NEXT: bge a6, a3, .LBB44_16 ; CHECK-NOV-NEXT: .LBB44_7: # %entry ; CHECK-NOV-NEXT: blt a7, a3, .LBB44_9 @@ -5269,27 +5269,27 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB44_10: # %entry ; CHECK-NOV-NEXT: mv a0, a3 -; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fs0, rtz ; CHECK-NOV-NEXT: blt s1, a3, .LBB44_2 ; CHECK-NOV-NEXT: .LBB44_11: # %entry ; CHECK-NOV-NEXT: mv s1, a3 -; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fs1, rtz ; CHECK-NOV-NEXT: blt a1, a3, .LBB44_3 ; CHECK-NOV-NEXT: .LBB44_12: # %entry ; CHECK-NOV-NEXT: mv a1, a3 -; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz +; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz ; CHECK-NOV-NEXT: blt a2, a3, .LBB44_4 ; CHECK-NOV-NEXT: .LBB44_13: # %entry ; CHECK-NOV-NEXT: mv a2, a3 -; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz +; CHECK-NOV-NEXT: fcvt.l.s a5, fs3, rtz ; CHECK-NOV-NEXT: blt a4, a3, .LBB44_5 ; CHECK-NOV-NEXT: .LBB44_14: # %entry ; CHECK-NOV-NEXT: mv a4, a3 -; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz +; CHECK-NOV-NEXT: fcvt.l.s a6, fs4, rtz ; CHECK-NOV-NEXT: blt a5, a3, .LBB44_6 ; CHECK-NOV-NEXT: .LBB44_15: # %entry ; CHECK-NOV-NEXT: mv a5, a3 -; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz +; CHECK-NOV-NEXT: fcvt.l.s a7, fs5, rtz ; CHECK-NOV-NEXT: blt a6, a3, .LBB44_7 ; CHECK-NOV-NEXT: .LBB44_16: # %entry ; CHECK-NOV-NEXT: mv a6, a3 diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index d79d28d52e73c9..0eb69c89f2c442 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -1037,14 +1037,14 @@ define @fshr_v16i64( %a, @fshl_v16i64( %a, @llvm.vp.nearbyint.nxv16f64( @vp_nearbyint_nxv16f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv16f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v25, v0, a2 +; CHECK-NEXT: vslidedown.vx v2, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1095,62 +1087,35 @@ define @vp_nearbyint_nxv16f64( %va, ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmv8r.v v8, v16 
-; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vfabs.v v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: frflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f64( %va, %m, i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll index 04761d4e7bfc4a..778e7848178258 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll @@ -454,28 +454,32 @@ define @vp_rint_nxv32f16( %va, @vp_rint_nxv16f64( %va, @vp_rint_nxv16f64( %va, @vp_round_nxv32f16( %va, @vp_round_nxv16f64( %va, @vp_round_nxv16f64( %va, @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: .cfi_escape 
0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v25, v0, a2 +; CHECK-NEXT: vslidedown.vx v2, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1096,49 +1098,40 @@ define @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfabs.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll index f35d70d6d470b3..06de7d2d5e029c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll @@ -498,30 +498,34 @@ define @vp_roundtozero_nxv32f16( %va, < ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2 ; 
ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB10_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v0, v16 -; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t +; ZVFHMIN-NEXT: vmv1r.v v8, v16 +; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v16, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmv1r.v v0, v16 -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -1079,16 +1083,14 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v25, v0, a2 +; CHECK-NEXT: vslidedown.vx v2, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1096,49 +1098,40 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfabs.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; 
CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 734fb59e2d881f..d0caf5d57a96d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -2249,16 +2249,21 @@ define @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; CHECK-NEXT: addi a0, a0, %lo(.LCPI15_0) ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v20, (a0) -; CHECK-NEXT: vmv2r.v v16, v10 -; CHECK-NEXT: vmv2r.v v12, v8 -; CHECK-NEXT: vrgather.vv v8, v12, v20 -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vi v12, v12, 15 +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vmv2r.v v20, v10 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vrsub.vi v8, v8, 15 ; CHECK-NEXT: lui a0, 16 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vrgather.vv v8, v16, v12, v0.t +; CHECK-NEXT: vrgather.vv v12, v20, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret %v32i16 = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> ret <32 x i16> %v32i16 @@ -329,18 +329,18 @@ define <16 x i32> @v8i32_2(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: v8i32_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv2r.v v16, v10 -; CHECK-NEXT: vmv2r.v v12, v8 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vid.v v14 -; CHECK-NEXT: vrsub.vi v18, v14, 15 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrsub.vi v18, v10, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v18 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vrsub.vi v12, v14, 7 +; CHECK-NEXT: vrsub.vi v8, v10, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, 
a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t +; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret %v16i32 = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> ret <16 x i32> %v16i32 @@ -639,18 +639,18 @@ define <16 x float> @v8f32_2(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: v8f32_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv2r.v v16, v10 -; CHECK-NEXT: vmv2r.v v12, v8 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vid.v v14 -; CHECK-NEXT: vrsub.vi v18, v14, 15 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrsub.vi v18, v10, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v18 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vrsub.vi v12, v14, 7 +; CHECK-NEXT: vrsub.vi v8, v10, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t +; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret %v16f32 = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> ret <16 x float> %v16f32 diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index f08bfce409305c..59a14feaedfa4c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -244,42 +244,42 @@ define void @sink_splat_mul_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_mul_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB7_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB7_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB7_5 ; CHECK-NEXT: .LBB7_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB7_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vmul.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB7_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB7_7 ; CHECK-NEXT: .LBB7_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB7_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: mul a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: mul a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB7_6 +; CHECK-NEXT: bnez a2, .LBB7_6 ; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -335,42 +335,42 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 
signext %x) { ; CHECK-LABEL: sink_splat_add_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB8_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB8_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB8_5 ; CHECK-NEXT: .LBB8_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB8_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB8_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB8_7 ; CHECK-NEXT: .LBB8_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB8_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB8_6 +; CHECK-NEXT: bnez a2, .LBB8_6 ; CHECK-NEXT: .LBB8_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -426,42 +426,42 @@ define void @sink_splat_sub_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_sub_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB9_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB9_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB9_5 ; CHECK-NEXT: .LBB9_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB9_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB9_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB9_7 ; CHECK-NEXT: .LBB9_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB9_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; 
CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB9_6 +; CHECK-NEXT: bnez a2, .LBB9_6 ; CHECK-NEXT: .LBB9_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -517,42 +517,42 @@ define void @sink_splat_rsub_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_rsub_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB10_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB10_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB10_5 ; CHECK-NEXT: .LBB10_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB10_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB10_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB10_7 ; CHECK-NEXT: .LBB10_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB10_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: subw a2, a1, a2 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: subw a3, a1, a3 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB10_6 +; CHECK-NEXT: bnez a2, .LBB10_6 ; CHECK-NEXT: .LBB10_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -608,42 +608,42 @@ define void @sink_splat_and_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_and_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB11_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB11_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB11_5 ; CHECK-NEXT: .LBB11_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB11_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB11_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB11_7 ; CHECK-NEXT: .LBB11_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; 
CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB11_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: and a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: and a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB11_6 +; CHECK-NEXT: bnez a2, .LBB11_6 ; CHECK-NEXT: .LBB11_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -699,42 +699,42 @@ define void @sink_splat_or_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_or_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB12_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB12_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB12_5 ; CHECK-NEXT: .LBB12_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB12_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vor.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB12_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB12_7 ; CHECK-NEXT: .LBB12_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB12_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: or a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: or a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB12_6 +; CHECK-NEXT: bnez a2, .LBB12_6 ; CHECK-NEXT: .LBB12_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -790,42 +790,42 @@ define void @sink_splat_xor_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_xor_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB13_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB13_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB13_5 ; CHECK-NEXT: .LBB13_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB13_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vxor.vx v8, 
v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB13_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB13_7 ; CHECK-NEXT: .LBB13_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB13_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: xor a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: xor a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB13_6 +; CHECK-NEXT: bnez a2, .LBB13_6 ; CHECK-NEXT: .LBB13_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -983,42 +983,42 @@ define void @sink_splat_shl_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_shl_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB17_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB17_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB17_5 ; CHECK-NEXT: .LBB17_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB17_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vsll.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB17_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB17_7 ; CHECK-NEXT: .LBB17_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB17_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: sllw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: sllw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB17_6 +; CHECK-NEXT: bnez a2, .LBB17_6 ; CHECK-NEXT: .LBB17_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1074,42 +1074,42 @@ define void @sink_splat_lshr_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_lshr_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB18_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB18_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB18_5 ; CHECK-NEXT: .LBB18_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; 
CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB18_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vsrl.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB18_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB18_7 ; CHECK-NEXT: .LBB18_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB18_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: srlw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: srlw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB18_6 +; CHECK-NEXT: bnez a2, .LBB18_6 ; CHECK-NEXT: .LBB18_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1165,42 +1165,42 @@ define void @sink_splat_ashr_scalable(ptr nocapture %a) { ; CHECK-LABEL: sink_splat_ashr_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: srli a2, a4, 1 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a2, .LBB19_2 +; CHECK-NEXT: srli a1, a4, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a1, .LBB19_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB19_5 ; CHECK-NEXT: .LBB19_2: # %vector.ph -; CHECK-NEXT: addi a1, a2, -1 -; CHECK-NEXT: andi a3, a1, 1024 -; CHECK-NEXT: xori a1, a3, 1024 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: andi a3, a2, 1024 +; CHECK-NEXT: xori a2, a3, 1024 ; CHECK-NEXT: slli a4, a4, 1 ; CHECK-NEXT: vsetvli a5, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB19_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a5) ; CHECK-NEXT: vsra.vi v8, v8, 2 ; CHECK-NEXT: vs2r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a2 +; CHECK-NEXT: sub a6, a6, a1 ; CHECK-NEXT: add a5, a5, a4 ; CHECK-NEXT: bnez a6, .LBB19_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a3, .LBB19_7 ; CHECK-NEXT: .LBB19_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB19_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a1, 0(a0) -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: sw a1, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: lw a2, 0(a0) +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: sw a2, 0(a0) +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB19_6 +; CHECK-NEXT: bnez a1, .LBB19_6 ; CHECK-NEXT: .LBB19_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1460,33 +1460,33 @@ define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fmul_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB26_2 +; CHECK-NEXT: srli a2, 
a1, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB26_5 ; CHECK-NEXT: .LBB26_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a3 ; CHECK-NEXT: .LBB26_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfmul.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 +; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB26_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB26_7 ; CHECK-NEXT: .LBB26_5: # %for.body.preheader -; CHECK-NEXT: addi a1, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a1, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB26_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) @@ -1550,33 +1550,33 @@ define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fdiv_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB27_2 +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB27_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB27_5 ; CHECK-NEXT: .LBB27_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a3 ; CHECK-NEXT: .LBB27_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 +; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB27_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB27_7 ; CHECK-NEXT: .LBB27_5: # %for.body.preheader -; CHECK-NEXT: addi a1, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a1, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB27_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) @@ -1640,33 +1640,33 @@ define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frdiv_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB28_2 +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB28_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB28_5 ; CHECK-NEXT: .LBB28_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma 
; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a3 ; CHECK-NEXT: .LBB28_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfrdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 +; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB28_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB28_7 ; CHECK-NEXT: .LBB28_5: # %for.body.preheader -; CHECK-NEXT: addi a1, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a1, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB28_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) @@ -1730,33 +1730,33 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fadd_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB29_2 +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB29_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB29_5 ; CHECK-NEXT: .LBB29_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a3 ; CHECK-NEXT: .LBB29_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfadd.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 +; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB29_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB29_7 ; CHECK-NEXT: .LBB29_5: # %for.body.preheader -; CHECK-NEXT: addi a1, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a1, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB29_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) @@ -1820,33 +1820,33 @@ define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fsub_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB30_2 +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB30_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB30_5 ; CHECK-NEXT: .LBB30_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a3 ; CHECK-NEXT: .LBB30_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 +; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB30_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, 
.LBB30_7 ; CHECK-NEXT: .LBB30_5: # %for.body.preheader -; CHECK-NEXT: addi a1, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a1, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB30_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) @@ -1910,33 +1910,33 @@ define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frsub_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB31_2 +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB31_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB31_5 ; CHECK-NEXT: .LBB31_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a3 ; CHECK-NEXT: .LBB31_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfrsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 +; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB31_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB31_7 ; CHECK-NEXT: .LBB31_5: # %for.body.preheader -; CHECK-NEXT: addi a1, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a1, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB31_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) @@ -2489,42 +2489,42 @@ define void @sink_splat_udiv_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_udiv_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB42_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB42_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB42_5 ; CHECK-NEXT: .LBB42_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB42_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vdivu.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB42_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB42_7 ; CHECK-NEXT: .LBB42_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB42_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: divuw a2, a2, a1 -; CHECK-NEXT: sw 
a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: divuw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB42_6 +; CHECK-NEXT: bnez a2, .LBB42_6 ; CHECK-NEXT: .LBB42_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2580,42 +2580,42 @@ define void @sink_splat_sdiv_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_sdiv_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB43_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB43_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB43_5 ; CHECK-NEXT: .LBB43_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB43_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vdiv.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB43_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB43_7 ; CHECK-NEXT: .LBB43_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB43_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: divw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: divw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB43_6 +; CHECK-NEXT: bnez a2, .LBB43_6 ; CHECK-NEXT: .LBB43_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2671,42 +2671,42 @@ define void @sink_splat_urem_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_urem_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB44_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB44_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB44_5 ; CHECK-NEXT: .LBB44_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB44_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vremu.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB44_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB44_7 ; CHECK-NEXT: .LBB44_5: # 
%for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB44_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: remuw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: remuw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB44_6 +; CHECK-NEXT: bnez a2, .LBB44_6 ; CHECK-NEXT: .LBB44_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2762,42 +2762,42 @@ define void @sink_splat_srem_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_srem_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB45_2 +; CHECK-NEXT: srli a2, a5, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB45_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB45_5 ; CHECK-NEXT: .LBB45_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 +; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB45_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vrem.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 +; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB45_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB45_7 ; CHECK-NEXT: .LBB45_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB45_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: remw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: remw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB45_6 +; CHECK-NEXT: bnez a2, .LBB45_6 ; CHECK-NEXT: .LBB45_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll index 5d5a2a3b898bc6..0b803b36470718 100644 --- a/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll @@ -9,43 +9,43 @@ define i32 @splat_vector_split_i64() { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.i v10, 3 +; CHECK-NEXT: vmv.v.i v8, 3 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 3 +; CHECK-NEXT: vslideup.vi v10, v8, 3 ; CHECK-NEXT: sw zero, 12(sp) ; CHECK-NEXT: lui a0, 1044480 ; CHECK-NEXT: sw a0, 8(sp) ; CHECK-NEXT: li a0, 56 ; 
CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; CHECK-NEXT: vsrl.vx v10, v8, a0 +; CHECK-NEXT: vsrl.vx v8, v10, a0 ; CHECK-NEXT: li a1, 40 -; CHECK-NEXT: vsrl.vx v12, v8, a1 +; CHECK-NEXT: vsrl.vx v12, v10, a1 ; CHECK-NEXT: lui a2, 16 ; CHECK-NEXT: addi a2, a2, -256 ; CHECK-NEXT: vand.vx v12, v12, a2 -; CHECK-NEXT: vor.vv v10, v12, v10 -; CHECK-NEXT: vsrl.vi v12, v8, 24 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v10, 24 ; CHECK-NEXT: addi a3, sp, 8 ; CHECK-NEXT: vlse64.v v14, (a3), zero ; CHECK-NEXT: lui a3, 4080 ; CHECK-NEXT: vand.vx v12, v12, a3 -; CHECK-NEXT: vsrl.vi v16, v8, 8 +; CHECK-NEXT: vsrl.vi v16, v10, 8 ; CHECK-NEXT: vand.vv v16, v16, v14 ; CHECK-NEXT: vor.vv v12, v16, v12 -; CHECK-NEXT: vor.vv v10, v12, v10 -; CHECK-NEXT: vand.vv v12, v8, v14 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vand.vv v12, v10, v14 ; CHECK-NEXT: vsll.vi v12, v12, 8 -; CHECK-NEXT: vand.vx v14, v8, a3 +; CHECK-NEXT: vand.vx v14, v10, a3 ; CHECK-NEXT: vsll.vi v14, v14, 24 ; CHECK-NEXT: vor.vv v12, v14, v12 -; CHECK-NEXT: vsll.vx v14, v8, a0 -; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vsll.vx v8, v8, a1 -; CHECK-NEXT: vor.vv v8, v14, v8 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vsll.vx v14, v10, a0 +; CHECK-NEXT: vand.vx v10, v10, a2 +; CHECK-NEXT: vsll.vx v10, v10, a1 +; CHECK-NEXT: vor.vv v10, v14, v10 +; CHECK-NEXT: vor.vv v10, v10, v12 +; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll index 47074d612bb646..00db6fb82ef6a4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll @@ -598,21 +598,21 @@ define @strided_load_nxv16f64(ptr %ptr, i64 %stride, @strided_load_nxv16f64(ptr %ptr, i64 %stride, @strided_load_nxv16f64(ptr %ptr, i64 %stride, @strided_load_nxv16f64_allones_mask(ptr %ptr, i64 %stride, i32 zeroext %evl) { ; CHECK-RV32-LABEL: strided_load_nxv16f64_allones_mask: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: csrr a4, vlenb -; CHECK-RV32-NEXT: sub a2, a3, a4 -; CHECK-RV32-NEXT: sltu a5, a3, a2 +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: sub a4, a3, a2 +; CHECK-RV32-NEXT: sltu a5, a3, a4 ; CHECK-RV32-NEXT: addi a5, a5, -1 -; CHECK-RV32-NEXT: and a2, a5, a2 -; CHECK-RV32-NEXT: bltu a3, a4, .LBB50_2 +; CHECK-RV32-NEXT: and a4, a5, a4 +; CHECK-RV32-NEXT: bltu a3, a2, .LBB50_2 ; CHECK-RV32-NEXT: # %bb.1: -; CHECK-RV32-NEXT: mv a3, a4 +; CHECK-RV32-NEXT: mv a3, a2 ; CHECK-RV32-NEXT: .LBB50_2: -; CHECK-RV32-NEXT: mul a4, a3, a1 -; CHECK-RV32-NEXT: add a4, a0, a4 -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV32-NEXT: vlse64.v v16, (a4), a1 +; CHECK-RV32-NEXT: mul a2, a3, a1 +; CHECK-RV32-NEXT: add a2, a0, a2 +; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma +; CHECK-RV32-NEXT: vlse64.v v16, (a2), a1 ; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: strided_load_nxv16f64_allones_mask: ; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: csrr a4, vlenb -; CHECK-RV64-NEXT: sub a3, a2, a4 -; CHECK-RV64-NEXT: sltu a5, a2, a3 +; CHECK-RV64-NEXT: csrr a3, vlenb +; CHECK-RV64-NEXT: sub a4, a2, a3 +; CHECK-RV64-NEXT: sltu a5, a2, a4 ; CHECK-RV64-NEXT: addi a5, a5, -1 -; CHECK-RV64-NEXT: and a3, a5, a3 -; CHECK-RV64-NEXT: bltu a2, a4, .LBB50_2 +; CHECK-RV64-NEXT: and a4, a5, a4 +; CHECK-RV64-NEXT: 
bltu a2, a3, .LBB50_2 ; CHECK-RV64-NEXT: # %bb.1: -; CHECK-RV64-NEXT: mv a2, a4 +; CHECK-RV64-NEXT: mv a2, a3 ; CHECK-RV64-NEXT: .LBB50_2: -; CHECK-RV64-NEXT: mul a4, a2, a1 -; CHECK-RV64-NEXT: add a4, a0, a4 -; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-RV64-NEXT: vlse64.v v16, (a4), a1 +; CHECK-RV64-NEXT: mul a3, a2, a1 +; CHECK-RV64-NEXT: add a3, a0, a3 +; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma +; CHECK-RV64-NEXT: vlse64.v v16, (a3), a1 ; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1 ; CHECK-RV64-NEXT: ret @@ -702,45 +702,45 @@ define @strided_load_nxv17f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, , } @vector_deinterleave_load_nxv8i6 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: li a2, 40 +; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: vl8re64.v v8, (a1) +; CHECK-NEXT: vl8re64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: li a2, 24 ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vl8re64.v v0, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vadd.vv v16, v8, v8 ; CHECK-NEXT: vrgather.vv v8, v0, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vrgather.vv v0, v24, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vadd.vi v24, v16, 1 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v24, v8, v16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v8, v16, 1 -; CHECK-NEXT: vrgather.vv v16, v0, v8 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v16, v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vrgather.vv v24, v0, v8 -; 
CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v28, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v20, v8 -; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: vmv4r.v v12, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv4r.v v20, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index ef4baf34d23f03..7c064424548e05 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -8,18 +8,18 @@ define {, } @vector_deinterleave_nxv16i1_nxv ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v12, v8, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 -; CHECK-NEXT: vnsrl.wi v12, v8, 0 -; CHECK-NEXT: vmsne.vi v0, v12, 0 -; CHECK-NEXT: vnsrl.wi v12, v8, 8 -; CHECK-NEXT: vmsne.vi v8, v12, 0 +; CHECK-NEXT: vmerge.vim v14, v8, 1, v0 +; CHECK-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: vmsne.vi v8, v10, 0 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i1( %vec) ret {, } %retval @@ -90,24 +90,38 @@ declare {, } @llvm.experimental.vector.deint define {, } @vector_deinterleave_nxv64i1_nxv128i1( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v28, v8 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: vmv1r.v v12, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmerge.vim v16, v24, 1, v0 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v24, v16, 0 +; CHECK-NEXT: vnsrl.wi v8, v16, 0 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v28 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vim v24, v24, 1, v0 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v28, v8, 0 +; 
CHECK-NEXT: vnsrl.wi v12, v24, 0 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; CHECK-NEXT: vmsne.vi v0, v24, 0 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v24, v16, 8 -; CHECK-NEXT: vnsrl.wi v28, v8, 8 +; CHECK-NEXT: vnsrl.wi v0, v16, 8 +; CHECK-NEXT: vnsrl.wi v4, v24, 8 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; CHECK-NEXT: vmsne.vi v8, v24, 0 +; CHECK-NEXT: vmsne.vi v8, v0, 0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv128i1( %vec) ret {, } %retval @@ -116,12 +130,12 @@ ret {, } %retval define {, } @vector_deinterleave_nxv64i8_nxv128i8( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv64i8_nxv128i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v24, 0 -; CHECK-NEXT: vnsrl.wi v12, v16, 0 -; CHECK-NEXT: vnsrl.wi v0, v24, 8 +; CHECK-NEXT: vnsrl.wi v24, v8, 0 +; CHECK-NEXT: vnsrl.wi v28, v16, 0 +; CHECK-NEXT: vnsrl.wi v0, v8, 8 ; CHECK-NEXT: vnsrl.wi v4, v16, 8 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv128i8( %vec) @@ -131,12 +145,12 @@ ret {, } %retval define {, } @vector_deinterleave_nxv32i16_nxv64i16( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv32i16_nxv64i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v24, 0 -; CHECK-NEXT: vnsrl.wi v12, v16, 0 -; CHECK-NEXT: vnsrl.wi v0, v24, 16 +; CHECK-NEXT: vnsrl.wi v24, v8, 0 +; CHECK-NEXT: vnsrl.wi v28, v16, 0 +; CHECK-NEXT: vnsrl.wi v0, v8, 16 ; CHECK-NEXT: vnsrl.wi v4, v16, 16 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv64i16( %vec) @@ -146,14 +160,14 @@ ret {, } %retval define {, } @vector_deinterleave_nxv16i32_nxvv32i32( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv16i32_nxvv32i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; CHECK-NEXT: vnsrl.wx v20, v24, a0 -; CHECK-NEXT: vnsrl.wx v16, v8, a0 +; CHECK-NEXT: vnsrl.wx v28, v16, a0 +; CHECK-NEXT: vnsrl.wx v24, v8, a0 ; CHECK-NEXT: vnsrl.wi v0, v8, 0 -; CHECK-NEXT: vnsrl.wi v4, v24, 0 +; CHECK-NEXT: vnsrl.wi v4, v16, 0 ; CHECK-NEXT: vmv8r.v v8, v0 +; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i32( %vec) ret {, } %retval @@ -165,50 +179,52 @@ define {, } @vector_deinterleave_nxv8i64_nxv ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add 
a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v0, v8, v8 -; CHECK-NEXT: vrgather.vv v8, v24, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vid.v v24 +; CHECK-NEXT: vadd.vv v0, v24, v24 +; CHECK-NEXT: vrgather.vv v16, v8, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v8, v0 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vadd.vi v16, v0, 1 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v8, v0, 1 -; CHECK-NEXT: vrgather.vv v0, v24, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v24, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v0, v8, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -216,15 +232,33 @@ define {, } @vector_deinterleave_nxv8i64_nxv ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v16, v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v20, v8 -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v4, v8 ; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -332,12 +366,12 @@ declare {, } @llvm.experimental.vector define {, } 
@vector_deinterleave_nxv32f16_nxv64f16( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv32f16_nxv64f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v24, 0 -; CHECK-NEXT: vnsrl.wi v12, v16, 0 -; CHECK-NEXT: vnsrl.wi v0, v24, 16 +; CHECK-NEXT: vnsrl.wi v24, v8, 0 +; CHECK-NEXT: vnsrl.wi v28, v16, 0 +; CHECK-NEXT: vnsrl.wi v0, v8, 16 ; CHECK-NEXT: vnsrl.wi v4, v16, 16 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv64f16( %vec) @@ -347,14 +381,14 @@ ret {, } %retval define {, } @vector_deinterleave_nxv16f32_nxv32f32( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv16f32_nxv32f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; CHECK-NEXT: vnsrl.wx v20, v24, a0 -; CHECK-NEXT: vnsrl.wx v16, v8, a0 +; CHECK-NEXT: vnsrl.wx v28, v16, a0 +; CHECK-NEXT: vnsrl.wx v24, v8, a0 ; CHECK-NEXT: vnsrl.wi v0, v8, 0 -; CHECK-NEXT: vnsrl.wi v4, v24, 0 +; CHECK-NEXT: vnsrl.wi v4, v16, 0 ; CHECK-NEXT: vmv8r.v v8, v0 +; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32f32( %vec) ret {, } %retval @@ -366,50 +400,52 @@ define {, } @vector_deinterleave_nxv8f ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v0, v8, v8 -; CHECK-NEXT: vrgather.vv v8, v24, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vid.v v24 +; CHECK-NEXT: vadd.vv v0, v24, v24 +; CHECK-NEXT: vrgather.vv v16, v8, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v8, v0 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vadd.vi v16, v0, 1 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v8, v0, 1 -; CHECK-NEXT: vrgather.vv v0, v24, v8 ; CHECK-NEXT: csrr 
a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v24, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v0, v8, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -417,15 +453,33 @@ define {, } @vector_deinterleave_nxv8f ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v16, v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v20, v8 -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v4, v8 ; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index c23c10205e6e36..888ceb7127469a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -101,40 +101,36 @@ define void @vector_interleave_store_nxv16i64_nxv8i64( %a, @vector_interleave_nxv128i1_nxv64i1( @vector_interleave_nxv128i8_nxv64i8( %a, %b) { ; CHECK-LABEL: vector_interleave_nxv128i8_nxv64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v8, v24, v16 +; CHECK-NEXT: vwaddu.vv v24, v8, v16 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v8, a0, v16 -; CHECK-NEXT: vwaddu.vv v0, v28, v20 +; CHECK-NEXT: vwmaccu.vx v24, a0, v16 +; CHECK-NEXT: vwaddu.vv v0, v12, v20 ; CHECK-NEXT: vwmaccu.vx v0, a0, v20 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave_nxv128i8_nxv64i8: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vmv8r.v v24, v8 ; ZVBB-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVBB-NEXT: vwsll.vi v8, v16, 8 -; ZVBB-NEXT: vwaddu.wv v8, v8, v24 +; ZVBB-NEXT: vwsll.vi v24, v16, 8 +; ZVBB-NEXT: vwaddu.wv v24, v24, v8 ; ZVBB-NEXT: vwsll.vi v0, v20, 8 -; ZVBB-NEXT: vwaddu.wv v0, v0, v28 +; ZVBB-NEXT: vwaddu.wv v0, v0, v12 +; ZVBB-NEXT: vmv8r.v v8, v24 ; ZVBB-NEXT: vmv8r.v v16, v0 ; ZVBB-NEXT: ret %res = call @llvm.experimental.vector.interleave2.nxv128i8( %a, %b) @@ -228,24 +228,24 @@ define @vector_interleave_nxv128i8_nxv64i8( @vector_interleave_nxv64i16_nxv32i16( %a, %b) { ; CHECK-LABEL: 
vector_interleave_nxv64i16_nxv32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v8, v24, v16 +; CHECK-NEXT: vwaddu.vv v24, v8, v16 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v8, a0, v16 -; CHECK-NEXT: vwaddu.vv v0, v28, v20 +; CHECK-NEXT: vwmaccu.vx v24, a0, v16 +; CHECK-NEXT: vwaddu.vv v0, v12, v20 ; CHECK-NEXT: vwmaccu.vx v0, a0, v20 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave_nxv64i16_nxv32i16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vmv8r.v v24, v8 ; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVBB-NEXT: vwsll.vi v8, v16, 16 -; ZVBB-NEXT: vwaddu.wv v8, v8, v24 +; ZVBB-NEXT: vwsll.vi v24, v16, 16 +; ZVBB-NEXT: vwaddu.wv v24, v24, v8 ; ZVBB-NEXT: vwsll.vi v0, v20, 16 -; ZVBB-NEXT: vwaddu.wv v0, v0, v28 +; ZVBB-NEXT: vwaddu.wv v0, v0, v12 +; ZVBB-NEXT: vmv8r.v v8, v24 ; ZVBB-NEXT: vmv8r.v v16, v0 ; ZVBB-NEXT: ret %res = call @llvm.experimental.vector.interleave2.nxv64i16( %a, %b) @@ -255,13 +255,13 @@ define @vector_interleave_nxv64i16_nxv32i16( @vector_interleave_nxv32i32_nxv16i32( %a, %b) { ; CHECK-LABEL: vector_interleave_nxv32i32_nxv16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v8, v24, v16 +; CHECK-NEXT: vwaddu.vv v24, v8, v16 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v8, a0, v16 -; CHECK-NEXT: vwaddu.vv v0, v28, v20 +; CHECK-NEXT: vwmaccu.vx v24, a0, v16 +; CHECK-NEXT: vwaddu.vv v0, v12, v20 ; CHECK-NEXT: vwmaccu.vx v0, a0, v20 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret ; @@ -286,32 +286,44 @@ define @vector_interleave_nxv16i64_nxv8i64( @vector_interleave_nxv16i64_nxv8i64( @llvm.experimental.vector.interleave2.nxv4f64( @vector_interleave_nxv64f16_nxv32f16( %a, %b) { ; CHECK-LABEL: vector_interleave_nxv64f16_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v8, v24, v16 +; CHECK-NEXT: vwaddu.vv v24, v8, v16 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v8, a0, v16 -; CHECK-NEXT: vwaddu.vv v0, v28, v20 +; CHECK-NEXT: vwmaccu.vx v24, a0, v16 +; CHECK-NEXT: vwaddu.vv v0, v12, v20 ; CHECK-NEXT: vwmaccu.vx v0, a0, v20 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave_nxv64f16_nxv32f16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vmv8r.v v24, v8 ; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVBB-NEXT: vwsll.vi v8, v16, 16 -; ZVBB-NEXT: vwaddu.wv v8, v8, v24 +; ZVBB-NEXT: vwsll.vi v24, v16, 16 +; ZVBB-NEXT: vwaddu.wv v24, v24, v8 ; ZVBB-NEXT: vwsll.vi v0, v20, 16 -; ZVBB-NEXT: vwaddu.wv v0, v0, v28 +; ZVBB-NEXT: vwaddu.wv v0, v0, v12 +; ZVBB-NEXT: vmv8r.v v8, v24 ; ZVBB-NEXT: vmv8r.v v16, v0 ; ZVBB-NEXT: ret %res = call @llvm.experimental.vector.interleave2.nxv64f16( %a, %b) @@ -555,13 +579,13 @@ define @vector_interleave_nxv64f16_nxv32f16( @vector_interleave_nxv32f32_nxv16f32( %a, %b) { ; CHECK-LABEL: vector_interleave_nxv32f32_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v8, v24, v16 +; CHECK-NEXT: vwaddu.vv v24, v8, v16 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v8, a0, v16 -; CHECK-NEXT: vwaddu.vv v0, v28, v20 +; CHECK-NEXT: vwmaccu.vx v24, a0, v16 +; CHECK-NEXT: vwaddu.vv v0, v12, v20 ; CHECK-NEXT: vwmaccu.vx v0, a0, v20 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: vmv8r.v v16, v0 ; 
CHECK-NEXT: ret ; @@ -586,32 +610,44 @@ define @vector_interleave_nxv16f64_nxv8f64( @vector_interleave_nxv16f64_nxv8f64( @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a5, a2, a3 -; CHECK-NEXT: vl8re64.v v24, (a5) +; CHECK-NEXT: vl8re64.v v8, (a5) +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a5, a5, 3 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v16, (a3) ; CHECK-NEXT: sub a3, a4, a1 @@ -1334,10 +1351,15 @@ define @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: and a3, a5, a3 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmadd.vv v16, v8, v24 ; CHECK-NEXT: bltu a4, a1, .LBB93_2 ; CHECK-NEXT: # %bb.1: @@ -1345,17 +1367,17 @@ define @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: .LBB93_2: ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmadd.vv v0, v24, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmadd.vv v0, v8, v24 ; CHECK-NEXT: vmv.v.v v8, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll index 1f716a9abcc595..13b2ac516dce5b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll @@ -301,87 +301,78 @@ define @vfmadd_vv_nxv32f16( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: 
.cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a5, a2, a3 -; CHECK-NEXT: vl8re64.v v24, (a5) +; CHECK-NEXT: vl8re64.v v8, (a5) +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a5, a5, 3 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v16, (a3) ; CHECK-NEXT: sub a3, a4, a1 @@ -1334,10 +1351,15 @@ define @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: and a3, a5, a3 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmadd.vv v16, v8, v24 ; CHECK-NEXT: bltu a4, a1, .LBB93_2 ; CHECK-NEXT: # %bb.1: @@ -1345,17 +1367,17 @@ define @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: .LBB93_2: ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmadd.vv v0, v24, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmadd.vv v0, v8, v24 ; CHECK-NEXT: vmv.v.v v8, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll index c6554561be3395..ba0f453f89de52 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll @@ -96,15 +96,7 @@ declare @llvm.vp.fptrunc.nxv16f64.nxv16f32( @vfptrunc_nxv16f32_nxv16f64( %a, %m, i32 zeroext %vl) { ; CHECK-LABEL: vfptrunc_nxv16f32_nxv16f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v1, v0 -; CHECK-NEXT: addi a1, sp, 16 
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma @@ -114,21 +106,15 @@ define @vfptrunc_nxv16f32_nxv16f64( ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 ; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfncvt.f.f.w v20, v24, v0.t +; CHECK-NEXT: vfncvt.f.f.w v28, v16, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB7_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB7_2: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfncvt.f.f.w v16, v8, v0.t -; CHECK-NEXT: vmv8r.v v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfncvt.f.f.w v24, v8, v0.t +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fptrunc.nxv16f64.nxv16f32( %a, %m, i32 %vl) ret %v @@ -142,16 +128,20 @@ define @vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: vmv1r.v v1, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a3, a1, 3 @@ -161,6 +151,8 @@ define @vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: slli a4, a1, 3 ; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: vl8re64.v v8, (a4) +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: slli a4, a1, 1 ; CHECK-NEXT: sub a5, a2, a4 ; CHECK-NEXT: sltu a6, a2, a5 @@ -171,19 +163,21 @@ define @vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: addi a7, a7, -1 ; CHECK-NEXT: and a6, a7, a6 ; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, ma -; CHECK-NEXT: vl8re64.v v24, (a0) +; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: vslidedown.vx v0, v16, a3 ; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.f.f.w v20, v8, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfncvt.f.f.w v20, v24, v0.t ; CHECK-NEXT: bltu a5, a1, .LBB8_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a5, a1 ; CHECK-NEXT: .LBB8_2: ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v2, v1, a3 +; CHECK-NEXT: vslidedown.vx v25, v1, a3 ; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma ; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfncvt.f.f.w v16, v24, v0.t +; CHECK-NEXT: vfncvt.f.f.w v16, v8, v0.t ; CHECK-NEXT: bltu a2, a4, .LBB8_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a2, a4 @@ -193,8 +187,11 @@ define @vfptrunc_nxv32f32_nxv32f64( ; 
CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a0, a3, a0 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfncvt.f.f.w v28, v8, v0.t ; CHECK-NEXT: bltu a2, a1, .LBB8_6 @@ -204,14 +201,15 @@ define @vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma ; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfncvt.f.f.w v24, v8, v0.t ; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll index 3dc8340600fded..6f8425adf90cd5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll @@ -728,17 +728,17 @@ define @vfnmacc_vf_nxv16f32_commute( % ; ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32_commute: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vmv4r.v v24, v8 ; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v8, fa5 +; ZVFHMIN-NEXT: vfmv.v.f v24, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8 +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfnmadd.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vfnmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vmv.v.v v8, v24 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll index 6eb1f512f76af7..f6d085ef129a8a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll @@ -700,17 +700,17 @@ define @vfnmsac_vf_nxv16f32_commute( % ; ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32_commute: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vmv4r.v v24, v8 ; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v8, fa5 +; ZVFHMIN-NEXT: vfmv.v.f v24, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8 +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfnmsub.vv v8, v24, v16, v0.t +; ZVFHMIN-NEXT: vfnmsub.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vmv.v.v v8, v24 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll 
b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll index 2f8454983d0d6e..00ab08a2391fe8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll @@ -1348,24 +1348,24 @@ define @vpmerge_vf_nxv32f16(half %a, % ; RV32ZVFHMIN: # %bb.0: ; RV32ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 ; RV32ZVFHMIN-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32ZVFHMIN-NEXT: vfmv.v.f v24, fa5 +; RV32ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; RV32ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; RV32ZVFHMIN-NEXT: vmv.v.v v20, v16 +; RV32ZVFHMIN-NEXT: vfncvt.f.f.w v24, v16 +; RV32ZVFHMIN-NEXT: vmv.v.v v28, v24 ; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, tu, ma -; RV32ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV32ZVFHMIN-NEXT: ret ; ; RV64ZVFHMIN-LABEL: vpmerge_vf_nxv32f16: ; RV64ZVFHMIN: # %bb.0: ; RV64ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 ; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64ZVFHMIN-NEXT: vfmv.v.f v24, fa5 +; RV64ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; RV64ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; RV64ZVFHMIN-NEXT: vmv.v.v v20, v16 +; RV64ZVFHMIN-NEXT: vfncvt.f.f.w v24, v16 +; RV64ZVFHMIN-NEXT: vmv.v.v v28, v24 ; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, tu, ma -; RV64ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV64ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %a, i32 0 %va = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll index 3b1e19ec4b3d21..005493e9ec04b2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll @@ -204,36 +204,36 @@ define half @vpreduce_fadd_nxv64f16(half %s, %v, %v, %v, %v, %v, %v, %v, %v, % ; RV64-NEXT: srli a2, a3, 2 ; RV64-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; RV64-NEXT: vslidedown.vx v24, v0, a2 -; RV64-NEXT: andi a2, a0, -1 +; RV64-NEXT: andi a0, a0, -1 ; RV64-NEXT: slli a3, a3, 1 -; RV64-NEXT: sub a0, a1, a3 -; RV64-NEXT: sltu a4, a1, a0 +; RV64-NEXT: sub a2, a1, a3 +; RV64-NEXT: sltu a4, a1, a2 ; RV64-NEXT: addi a4, a4, -1 -; RV64-NEXT: and a0, a4, a0 +; RV64-NEXT: and a2, a4, a2 ; RV64-NEXT: bltu a1, a3, .LBB67_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a1, a3 ; RV64-NEXT: .LBB67_2: ; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vmv.s.x v25, a2 +; RV64-NEXT: vmv.s.x v25, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t -; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: vredmaxu.vs v25, v16, v25, v0.t ; RV64-NEXT: vmv.x.s a0, v25 diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll index 59280e2ec2a8af..22f1b47e80004c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll @@ -215,12 +215,12 @@ define @vfmerge_fv_nxv32f16( %va, half ; CHECK-ZVFHMIN: # %bb.0: ; CHECK-ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 ; CHECK-ZVFHMIN-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-ZVFHMIN-NEXT: vfmv.v.f v24, fa5 +; CHECK-ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; CHECK-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; CHECK-ZVFHMIN-NEXT: vmv.v.v v20, v16 +; CHECK-ZVFHMIN-NEXT: vfncvt.f.f.w v24, v16 +; 
CHECK-ZVFHMIN-NEXT: vmv.v.v v28, v24 ; CHECK-ZVFHMIN-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0 ; CHECK-ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll index 4d0f640408dd2a..2a6c9d03f84c18 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll @@ -102,23 +102,23 @@ declare @llvm.riscv.vmand.nxv1i1.i64(, This Inner Loop Header: Depth=1 -; CHECK-NEXT: slli a3, a4, 2 -; CHECK-NEXT: add a5, a0, a3 +; CHECK-NEXT: slli a6, a4, 2 +; CHECK-NEXT: add a5, a0, a6 ; CHECK-NEXT: vle32.v v8, (a5) ; CHECK-NEXT: vmsle.vi v9, v8, -3 ; CHECK-NEXT: vmsgt.vi v10, v8, 2 ; CHECK-NEXT: vmor.mm v0, v9, v10 -; CHECK-NEXT: add a3, a3, a1 -; CHECK-NEXT: vse32.v v8, (a3), v0.t -; CHECK-NEXT: add a4, a4, a6 -; CHECK-NEXT: vsetvli a6, a2, e32, m1, ta, ma -; CHECK-NEXT: bnez a6, .LBB5_2 +; CHECK-NEXT: add a6, a6, a1 +; CHECK-NEXT: vse32.v v8, (a6), v0.t +; CHECK-NEXT: add a4, a4, a3 +; CHECK-NEXT: vsetvli a3, a2, e32, m1, ta, ma +; CHECK-NEXT: bnez a3, .LBB5_2 ; CHECK-NEXT: .LBB5_3: # %for.cond.cleanup ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll index a624a42b3873bb..ef1c1808928396 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll @@ -285,16 +285,20 @@ define @vtrunc_nxv32i64_nxv32i32( %a, @vtrunc_nxv32i64_nxv32i32( %a, @vtrunc_nxv32i64_nxv32i32( %a, @vtrunc_nxv32i64_nxv32i32( %a, @vtrunc_nxv32i64_nxv32i32( %a, @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lh s1, 0(a1) -; RV32I-NEXT: lh s2, 4(a1) -; RV32I-NEXT: lh s3, 8(a1) -; RV32I-NEXT: lh s4, 12(a1) -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lh s0, 0(a1) +; RV32I-NEXT: lh s1, 4(a1) +; RV32I-NEXT: lh s2, 8(a1) +; RV32I-NEXT: lh s3, 12(a1) +; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: mv s5, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: mv s6, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: mv s8, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __divsi3@plt -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __divsi3@plt ; RV32I-NEXT: mv s3, a0 @@ -422,14 +418,18 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __divsi3@plt +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: li a1, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __divsi3@plt ; RV32I-NEXT: add a0, s8, a0 -; RV32I-NEXT: add s2, s7, s2 -; RV32I-NEXT: add s3, s6, s3 -; RV32I-NEXT: add s4, s5, s4 -; RV32I-NEXT: sh s4, 6(s0) -; RV32I-NEXT: sh s3, 4(s0) -; RV32I-NEXT: sh s2, 2(s0) -; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: add s1, s7, s1 +; RV32I-NEXT: add s2, s6, s2 +; RV32I-NEXT: add s3, s5, s3 
+; RV32I-NEXT: sh s3, 6(s4) +; RV32I-NEXT: sh s2, 4(s4) +; RV32I-NEXT: sh s1, 2(s4) +; RV32I-NEXT: sh a0, 0(s4) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -503,32 +503,28 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s6, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s7, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s8, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lh s1, 0(a1) -; RV64I-NEXT: lh s2, 8(a1) -; RV64I-NEXT: lh s3, 16(a1) -; RV64I-NEXT: lh s4, 24(a1) -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lh s0, 0(a1) +; RV64I-NEXT: lh s1, 8(a1) +; RV64I-NEXT: lh s2, 16(a1) +; RV64I-NEXT: lh s3, 24(a1) +; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s5, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s6, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s7, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s8, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __divdi3@plt -; RV64I-NEXT: mv s4, a0 -; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __divdi3@plt ; RV64I-NEXT: mv s3, a0 @@ -539,14 +535,18 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __divdi3@plt +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: li a1, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __divdi3@plt ; RV64I-NEXT: add a0, s8, a0 -; RV64I-NEXT: add s2, s7, s2 -; RV64I-NEXT: add s3, s6, s3 -; RV64I-NEXT: add s4, s5, s4 -; RV64I-NEXT: sh s4, 6(s0) -; RV64I-NEXT: sh s3, 4(s0) -; RV64I-NEXT: sh s2, 2(s0) -; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: add s1, s7, s1 +; RV64I-NEXT: add s2, s6, s2 +; RV64I-NEXT: add s3, s5, s3 +; RV64I-NEXT: sh s3, 6(s4) +; RV64I-NEXT: sh s2, 4(s4) +; RV64I-NEXT: sh s1, 2(s4) +; RV64I-NEXT: sh a0, 0(s4) ; RV64I-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 56(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll index 651df94bab496e..c802134c9813a3 100644 --- a/llvm/test/CodeGen/RISCV/stack-store-check.ll +++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll @@ -14,292 +14,293 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-LABEL: main: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -688 -; CHECK-NEXT: sw ra, 684(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s0, 680(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s1, 676(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s2, 672(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s3, 668(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s4, 664(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s5, 660(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s6, 656(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s7, 652(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s8, 648(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s9, 644(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s10, 640(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s11, 636(sp) # 4-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -704 +; CHECK-NEXT: sw ra, 700(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s0, 696(sp) # 
4-byte Folded Spill +; CHECK-NEXT: sw s1, 692(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s2, 688(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s3, 684(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s4, 680(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s5, 676(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s6, 672(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s7, 668(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s8, 664(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s9, 660(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s10, 656(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s11, 652(sp) # 4-byte Folded Spill ; CHECK-NEXT: lui a0, %hi(U) ; CHECK-NEXT: lw s6, %lo(U)(a0) ; CHECK-NEXT: lw s7, %lo(U+4)(a0) ; CHECK-NEXT: lw s8, %lo(U+8)(a0) ; CHECK-NEXT: lw s0, %lo(U+12)(a0) -; CHECK-NEXT: sw zero, 612(sp) -; CHECK-NEXT: sw zero, 608(sp) -; CHECK-NEXT: sw zero, 604(sp) -; CHECK-NEXT: sw zero, 600(sp) -; CHECK-NEXT: sw s0, 596(sp) -; CHECK-NEXT: sw s8, 592(sp) -; CHECK-NEXT: sw s7, 588(sp) -; CHECK-NEXT: addi a0, sp, 616 -; CHECK-NEXT: addi a1, sp, 600 -; CHECK-NEXT: addi a2, sp, 584 -; CHECK-NEXT: sw s6, 584(sp) +; CHECK-NEXT: sw zero, 628(sp) +; CHECK-NEXT: sw zero, 624(sp) +; CHECK-NEXT: sw zero, 620(sp) +; CHECK-NEXT: sw zero, 616(sp) +; CHECK-NEXT: sw s0, 612(sp) +; CHECK-NEXT: sw s8, 608(sp) +; CHECK-NEXT: sw s7, 604(sp) +; CHECK-NEXT: addi a0, sp, 632 +; CHECK-NEXT: addi a1, sp, 616 +; CHECK-NEXT: addi a2, sp, 600 +; CHECK-NEXT: sw s6, 600(sp) ; CHECK-NEXT: call __subtf3@plt -; CHECK-NEXT: lw s1, 616(sp) -; CHECK-NEXT: lw s2, 620(sp) -; CHECK-NEXT: lw s3, 624(sp) -; CHECK-NEXT: lw s4, 628(sp) -; CHECK-NEXT: sw s0, 548(sp) -; CHECK-NEXT: sw s8, 544(sp) -; CHECK-NEXT: sw s7, 540(sp) -; CHECK-NEXT: sw s6, 536(sp) -; CHECK-NEXT: sw s4, 564(sp) -; CHECK-NEXT: sw s3, 560(sp) -; CHECK-NEXT: sw s2, 556(sp) -; CHECK-NEXT: addi a0, sp, 568 -; CHECK-NEXT: addi a1, sp, 552 -; CHECK-NEXT: addi a2, sp, 536 -; CHECK-NEXT: sw s1, 552(sp) +; CHECK-NEXT: lw s1, 632(sp) +; CHECK-NEXT: lw s2, 636(sp) +; CHECK-NEXT: lw s3, 640(sp) +; CHECK-NEXT: lw s4, 644(sp) +; CHECK-NEXT: sw s0, 564(sp) +; CHECK-NEXT: sw s8, 560(sp) +; CHECK-NEXT: sw s7, 556(sp) +; CHECK-NEXT: sw s6, 552(sp) +; CHECK-NEXT: sw s4, 580(sp) +; CHECK-NEXT: sw s3, 576(sp) +; CHECK-NEXT: sw s2, 572(sp) +; CHECK-NEXT: addi a0, sp, 584 +; CHECK-NEXT: addi a1, sp, 568 +; CHECK-NEXT: addi a2, sp, 552 +; CHECK-NEXT: sw s1, 568(sp) ; CHECK-NEXT: call __subtf3@plt -; CHECK-NEXT: lw a0, 568(sp) -; CHECK-NEXT: sw a0, 40(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 572(sp) -; CHECK-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 576(sp) -; CHECK-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 580(sp) -; CHECK-NEXT: sw a0, 48(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw zero, 500(sp) -; CHECK-NEXT: sw zero, 496(sp) -; CHECK-NEXT: sw zero, 492(sp) -; CHECK-NEXT: sw zero, 488(sp) -; CHECK-NEXT: sw s0, 516(sp) -; CHECK-NEXT: sw s8, 512(sp) -; CHECK-NEXT: sw s7, 508(sp) -; CHECK-NEXT: addi a0, sp, 520 -; CHECK-NEXT: addi a1, sp, 504 -; CHECK-NEXT: addi a2, sp, 488 -; CHECK-NEXT: sw s6, 504(sp) +; CHECK-NEXT: lw a0, 584(sp) +; CHECK-NEXT: sw a0, 60(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw a0, 588(sp) +; CHECK-NEXT: sw a0, 52(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw a0, 592(sp) +; CHECK-NEXT: sw a0, 68(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw a0, 596(sp) +; CHECK-NEXT: sw a0, 44(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw zero, 516(sp) +; CHECK-NEXT: sw zero, 512(sp) +; CHECK-NEXT: sw zero, 508(sp) +; CHECK-NEXT: sw zero, 504(sp) +; 
CHECK-NEXT: sw s0, 532(sp) +; CHECK-NEXT: sw s8, 528(sp) +; CHECK-NEXT: sw s7, 524(sp) +; CHECK-NEXT: addi a0, sp, 536 +; CHECK-NEXT: addi a1, sp, 520 +; CHECK-NEXT: addi a2, sp, 504 +; CHECK-NEXT: sw s6, 520(sp) ; CHECK-NEXT: call __addtf3@plt -; CHECK-NEXT: lw s9, 520(sp) -; CHECK-NEXT: lw s11, 524(sp) -; CHECK-NEXT: lw s5, 528(sp) -; CHECK-NEXT: lw s10, 532(sp) -; CHECK-NEXT: sw s10, 16(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s11, 536(sp) +; CHECK-NEXT: lw s10, 540(sp) +; CHECK-NEXT: sw s10, 40(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s9, 544(sp) +; CHECK-NEXT: sw s9, 36(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s5, 548(sp) ; CHECK-NEXT: lui a0, %hi(Y1) ; CHECK-NEXT: lw a1, %lo(Y1)(a0) -; CHECK-NEXT: sw a1, 52(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a1, 32(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a2, %lo(Y1+4)(a0) -; CHECK-NEXT: sw a2, 12(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a2, 28(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a3, %lo(Y1+8)(a0) -; CHECK-NEXT: sw a3, 8(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a3, 24(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, %lo(Y1+12)(a0) -; CHECK-NEXT: sw a0, 4(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw a0, 308(sp) -; CHECK-NEXT: sw a3, 304(sp) -; CHECK-NEXT: sw a2, 300(sp) -; CHECK-NEXT: sw a1, 296(sp) -; CHECK-NEXT: sw s4, 324(sp) -; CHECK-NEXT: sw s3, 320(sp) -; CHECK-NEXT: sw s2, 316(sp) -; CHECK-NEXT: addi a0, sp, 328 -; CHECK-NEXT: addi a1, sp, 312 -; CHECK-NEXT: addi a2, sp, 296 -; CHECK-NEXT: sw s1, 312(sp) +; CHECK-NEXT: sw a0, 20(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a0, 324(sp) +; CHECK-NEXT: sw a3, 320(sp) +; CHECK-NEXT: sw a2, 316(sp) +; CHECK-NEXT: sw a1, 312(sp) +; CHECK-NEXT: sw s4, 340(sp) +; CHECK-NEXT: sw s3, 336(sp) +; CHECK-NEXT: sw s2, 332(sp) +; CHECK-NEXT: addi a0, sp, 344 +; CHECK-NEXT: addi a1, sp, 328 +; CHECK-NEXT: addi a2, sp, 312 +; CHECK-NEXT: sw s1, 328(sp) ; CHECK-NEXT: call __multf3@plt -; CHECK-NEXT: lw a0, 328(sp) -; CHECK-NEXT: sw a0, 44(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 332(sp) -; CHECK-NEXT: sw a0, 36(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 336(sp) -; CHECK-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 340(sp) -; CHECK-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s0, 468(sp) -; CHECK-NEXT: sw s8, 464(sp) -; CHECK-NEXT: sw s7, 460(sp) -; CHECK-NEXT: sw s6, 456(sp) -; CHECK-NEXT: sw s10, 452(sp) -; CHECK-NEXT: sw s5, 448(sp) -; CHECK-NEXT: sw s11, 444(sp) -; CHECK-NEXT: addi a0, sp, 472 -; CHECK-NEXT: addi a1, sp, 456 -; CHECK-NEXT: addi a2, sp, 440 -; CHECK-NEXT: sw s9, 440(sp) +; CHECK-NEXT: lw a0, 344(sp) +; CHECK-NEXT: sw a0, 64(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw a0, 348(sp) +; CHECK-NEXT: sw a0, 56(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s1, 352(sp) +; CHECK-NEXT: lw a0, 356(sp) +; CHECK-NEXT: sw a0, 48(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s0, 484(sp) +; CHECK-NEXT: sw s8, 480(sp) +; CHECK-NEXT: sw s7, 476(sp) +; CHECK-NEXT: sw s6, 472(sp) +; CHECK-NEXT: sw s5, 468(sp) +; CHECK-NEXT: sw s9, 464(sp) +; CHECK-NEXT: sw s10, 460(sp) +; CHECK-NEXT: addi a0, sp, 488 +; CHECK-NEXT: addi a1, sp, 472 +; CHECK-NEXT: addi a2, sp, 456 +; CHECK-NEXT: sw s11, 456(sp) ; CHECK-NEXT: call __addtf3@plt -; CHECK-NEXT: lw a3, 472(sp) -; CHECK-NEXT: lw a0, 476(sp) -; CHECK-NEXT: lw a1, 480(sp) -; CHECK-NEXT: lw a2, 484(sp) -; CHECK-NEXT: sw zero, 420(sp) -; CHECK-NEXT: sw zero, 416(sp) -; CHECK-NEXT: sw zero, 412(sp) -; CHECK-NEXT: sw zero, 408(sp) -; CHECK-NEXT: sw a2, 404(sp) -; CHECK-NEXT: sw a1, 400(sp) -; CHECK-NEXT: sw 
a0, 396(sp) -; CHECK-NEXT: addi a0, sp, 424 -; CHECK-NEXT: addi a1, sp, 408 -; CHECK-NEXT: addi a2, sp, 392 -; CHECK-NEXT: sw a3, 392(sp) +; CHECK-NEXT: lw a3, 488(sp) +; CHECK-NEXT: lw a0, 492(sp) +; CHECK-NEXT: lw a1, 496(sp) +; CHECK-NEXT: lw a2, 500(sp) +; CHECK-NEXT: sw zero, 436(sp) +; CHECK-NEXT: sw zero, 432(sp) +; CHECK-NEXT: sw zero, 428(sp) +; CHECK-NEXT: sw zero, 424(sp) +; CHECK-NEXT: sw a2, 420(sp) +; CHECK-NEXT: sw a1, 416(sp) +; CHECK-NEXT: sw a0, 412(sp) +; CHECK-NEXT: addi a0, sp, 440 +; CHECK-NEXT: addi a1, sp, 424 +; CHECK-NEXT: addi a2, sp, 408 +; CHECK-NEXT: sw a3, 408(sp) ; CHECK-NEXT: call __subtf3@plt -; CHECK-NEXT: lw a0, 424(sp) -; CHECK-NEXT: lw a1, 436(sp) -; CHECK-NEXT: lw a2, 432(sp) -; CHECK-NEXT: lw a3, 428(sp) +; CHECK-NEXT: lw a0, 440(sp) +; CHECK-NEXT: lw a1, 452(sp) +; CHECK-NEXT: lw a2, 448(sp) +; CHECK-NEXT: lw a3, 444(sp) ; CHECK-NEXT: lui a4, %hi(X) ; CHECK-NEXT: sw a1, %lo(X+12)(a4) ; CHECK-NEXT: sw a2, %lo(X+8)(a4) ; CHECK-NEXT: sw a3, %lo(X+4)(a4) ; CHECK-NEXT: sw a0, %lo(X)(a4) -; CHECK-NEXT: lw s8, 4(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s8, 212(sp) -; CHECK-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s4, 208(sp) -; CHECK-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s3, 204(sp) -; CHECK-NEXT: lw a0, 52(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 200(sp) -; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 228(sp) -; CHECK-NEXT: lw s10, 20(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s10, 224(sp) -; CHECK-NEXT: lw s2, 28(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s2, 220(sp) -; CHECK-NEXT: addi a0, sp, 232 -; CHECK-NEXT: addi a1, sp, 216 -; CHECK-NEXT: addi a2, sp, 200 -; CHECK-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s0, 216(sp) +; CHECK-NEXT: lw s8, 20(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s8, 228(sp) +; CHECK-NEXT: lw s7, 24(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s7, 224(sp) +; CHECK-NEXT: lw s6, 28(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s6, 220(sp) +; CHECK-NEXT: lw s4, 32(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s4, 216(sp) +; CHECK-NEXT: lw s2, 44(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s2, 244(sp) +; CHECK-NEXT: lw a0, 68(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 240(sp) +; CHECK-NEXT: lw s9, 52(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s9, 236(sp) +; CHECK-NEXT: addi a0, sp, 248 +; CHECK-NEXT: addi a1, sp, 232 +; CHECK-NEXT: addi a2, sp, 216 +; CHECK-NEXT: lw s0, 60(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s0, 232(sp) ; CHECK-NEXT: call __multf3@plt -; CHECK-NEXT: lw s1, 232(sp) -; CHECK-NEXT: lw a0, 236(sp) -; CHECK-NEXT: sw a0, 0(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw s6, 240(sp) -; CHECK-NEXT: lw s7, 244(sp) -; CHECK-NEXT: sw zero, 356(sp) -; CHECK-NEXT: sw zero, 352(sp) -; CHECK-NEXT: sw zero, 348(sp) -; CHECK-NEXT: sw zero, 344(sp) -; CHECK-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 372(sp) -; CHECK-NEXT: sw s5, 368(sp) -; CHECK-NEXT: sw s11, 364(sp) -; CHECK-NEXT: addi a0, sp, 376 -; CHECK-NEXT: addi a1, sp, 360 -; CHECK-NEXT: addi a2, sp, 344 -; CHECK-NEXT: sw s9, 360(sp) +; CHECK-NEXT: lw s10, 248(sp) +; CHECK-NEXT: lw a0, 252(sp) +; CHECK-NEXT: sw a0, 16(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw a0, 256(sp) +; CHECK-NEXT: sw a0, 12(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s3, 260(sp) +; CHECK-NEXT: sw zero, 372(sp) +; CHECK-NEXT: sw zero, 368(sp) +; CHECK-NEXT: sw zero, 364(sp) +; CHECK-NEXT: sw zero, 360(sp) +; CHECK-NEXT: sw s5, 388(sp) +; CHECK-NEXT: lw a0, 36(sp) # 
4-byte Folded Reload +; CHECK-NEXT: sw a0, 384(sp) +; CHECK-NEXT: lw a0, 40(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 380(sp) +; CHECK-NEXT: addi a0, sp, 392 +; CHECK-NEXT: addi a1, sp, 376 +; CHECK-NEXT: addi a2, sp, 360 +; CHECK-NEXT: sw s11, 376(sp) ; CHECK-NEXT: call __multf3@plt -; CHECK-NEXT: lw a0, 376(sp) -; CHECK-NEXT: lw a1, 388(sp) -; CHECK-NEXT: lw a2, 384(sp) -; CHECK-NEXT: lw a3, 380(sp) +; CHECK-NEXT: lw a0, 392(sp) +; CHECK-NEXT: lw a1, 404(sp) +; CHECK-NEXT: lw a2, 400(sp) +; CHECK-NEXT: lw a3, 396(sp) ; CHECK-NEXT: lui a4, %hi(S) ; CHECK-NEXT: sw a1, %lo(S+12)(a4) ; CHECK-NEXT: sw a2, %lo(S+8)(a4) ; CHECK-NEXT: sw a3, %lo(S+4)(a4) ; CHECK-NEXT: sw a0, %lo(S)(a4) -; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 260(sp) -; CHECK-NEXT: sw s10, 256(sp) -; CHECK-NEXT: sw s2, 252(sp) -; CHECK-NEXT: sw s0, 248(sp) -; CHECK-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 276(sp) -; CHECK-NEXT: lw a0, 32(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s2, 276(sp) +; CHECK-NEXT: lw a0, 68(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 272(sp) -; CHECK-NEXT: lw a0, 36(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 268(sp) -; CHECK-NEXT: addi a0, sp, 280 -; CHECK-NEXT: addi a1, sp, 264 -; CHECK-NEXT: addi a2, sp, 248 -; CHECK-NEXT: lw a3, 44(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 264(sp) +; CHECK-NEXT: sw s9, 268(sp) +; CHECK-NEXT: sw s0, 264(sp) +; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 292(sp) +; CHECK-NEXT: sw s1, 288(sp) +; CHECK-NEXT: lw a0, 56(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 284(sp) +; CHECK-NEXT: addi a0, sp, 296 +; CHECK-NEXT: addi a1, sp, 280 +; CHECK-NEXT: addi a2, sp, 264 +; CHECK-NEXT: lw a3, 64(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a3, 280(sp) ; CHECK-NEXT: call __subtf3@plt -; CHECK-NEXT: lw a0, 280(sp) -; CHECK-NEXT: lw a1, 292(sp) -; CHECK-NEXT: lw a2, 288(sp) -; CHECK-NEXT: lw a3, 284(sp) +; CHECK-NEXT: lw a0, 296(sp) +; CHECK-NEXT: lw a1, 308(sp) +; CHECK-NEXT: lw a2, 304(sp) +; CHECK-NEXT: lw a3, 300(sp) ; CHECK-NEXT: lui a4, %hi(T) ; CHECK-NEXT: sw a1, %lo(T+12)(a4) ; CHECK-NEXT: sw a2, %lo(T+8)(a4) ; CHECK-NEXT: sw a3, %lo(T+4)(a4) ; CHECK-NEXT: sw a0, %lo(T)(a4) -; CHECK-NEXT: sw zero, 164(sp) -; CHECK-NEXT: sw zero, 160(sp) -; CHECK-NEXT: sw zero, 156(sp) -; CHECK-NEXT: sw zero, 152(sp) -; CHECK-NEXT: sw s7, 180(sp) -; CHECK-NEXT: sw s6, 176(sp) -; CHECK-NEXT: lw a0, 0(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 172(sp) -; CHECK-NEXT: addi a0, sp, 184 -; CHECK-NEXT: addi a1, sp, 168 -; CHECK-NEXT: addi a2, sp, 152 -; CHECK-NEXT: sw s1, 168(sp) +; CHECK-NEXT: sw zero, 180(sp) +; CHECK-NEXT: sw zero, 176(sp) +; CHECK-NEXT: sw zero, 172(sp) +; CHECK-NEXT: sw zero, 168(sp) +; CHECK-NEXT: sw s3, 196(sp) +; CHECK-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 192(sp) +; CHECK-NEXT: lw a0, 16(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 188(sp) +; CHECK-NEXT: addi a0, sp, 200 +; CHECK-NEXT: addi a1, sp, 184 +; CHECK-NEXT: addi a2, sp, 168 +; CHECK-NEXT: sw s10, 184(sp) ; CHECK-NEXT: call __addtf3@plt -; CHECK-NEXT: lw a0, 184(sp) -; CHECK-NEXT: lw a1, 196(sp) -; CHECK-NEXT: lw a2, 192(sp) -; CHECK-NEXT: lw a3, 188(sp) +; CHECK-NEXT: lw a0, 200(sp) +; CHECK-NEXT: lw a1, 212(sp) +; CHECK-NEXT: lw a2, 208(sp) +; CHECK-NEXT: lw a3, 204(sp) ; CHECK-NEXT: lui a4, %hi(Y) ; CHECK-NEXT: sw a1, %lo(Y+12)(a4) ; CHECK-NEXT: sw a2, %lo(Y+8)(a4) ; CHECK-NEXT: sw a3, %lo(Y+4)(a4) ; CHECK-NEXT: sw a0, %lo(Y)(a4) -; CHECK-NEXT: sw zero, 116(sp) 
-; CHECK-NEXT: sw zero, 112(sp) -; CHECK-NEXT: sw zero, 108(sp) -; CHECK-NEXT: sw zero, 104(sp) -; CHECK-NEXT: sw s8, 132(sp) -; CHECK-NEXT: sw s4, 128(sp) -; CHECK-NEXT: sw s3, 124(sp) -; CHECK-NEXT: addi a0, sp, 136 -; CHECK-NEXT: addi a1, sp, 120 -; CHECK-NEXT: addi a2, sp, 104 -; CHECK-NEXT: lw a3, 52(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 120(sp) +; CHECK-NEXT: sw zero, 132(sp) +; CHECK-NEXT: sw zero, 128(sp) +; CHECK-NEXT: sw zero, 124(sp) +; CHECK-NEXT: sw zero, 120(sp) +; CHECK-NEXT: sw s8, 148(sp) +; CHECK-NEXT: sw s7, 144(sp) +; CHECK-NEXT: sw s6, 140(sp) +; CHECK-NEXT: addi a0, sp, 152 +; CHECK-NEXT: addi a1, sp, 136 +; CHECK-NEXT: addi a2, sp, 120 +; CHECK-NEXT: sw s4, 136(sp) ; CHECK-NEXT: call __multf3@plt -; CHECK-NEXT: lw a3, 136(sp) -; CHECK-NEXT: lw a0, 140(sp) -; CHECK-NEXT: lw a1, 144(sp) -; CHECK-NEXT: lw a2, 148(sp) +; CHECK-NEXT: lw a3, 152(sp) +; CHECK-NEXT: lw a0, 156(sp) +; CHECK-NEXT: lw a1, 160(sp) +; CHECK-NEXT: lw a2, 164(sp) ; CHECK-NEXT: lui a4, 786400 -; CHECK-NEXT: sw a4, 68(sp) -; CHECK-NEXT: sw zero, 64(sp) -; CHECK-NEXT: sw zero, 60(sp) -; CHECK-NEXT: sw zero, 56(sp) -; CHECK-NEXT: sw a2, 84(sp) -; CHECK-NEXT: sw a1, 80(sp) -; CHECK-NEXT: sw a0, 76(sp) -; CHECK-NEXT: addi a0, sp, 88 -; CHECK-NEXT: addi a1, sp, 72 -; CHECK-NEXT: addi a2, sp, 56 -; CHECK-NEXT: sw a3, 72(sp) +; CHECK-NEXT: sw a4, 84(sp) +; CHECK-NEXT: sw zero, 80(sp) +; CHECK-NEXT: sw zero, 76(sp) +; CHECK-NEXT: sw zero, 72(sp) +; CHECK-NEXT: sw a2, 100(sp) +; CHECK-NEXT: sw a1, 96(sp) +; CHECK-NEXT: sw a0, 92(sp) +; CHECK-NEXT: addi a0, sp, 104 +; CHECK-NEXT: addi a1, sp, 88 +; CHECK-NEXT: addi a2, sp, 72 +; CHECK-NEXT: sw a3, 88(sp) ; CHECK-NEXT: call __addtf3@plt -; CHECK-NEXT: lw a0, 96(sp) -; CHECK-NEXT: lw a1, 100(sp) -; CHECK-NEXT: lw a2, 88(sp) -; CHECK-NEXT: lw a3, 92(sp) +; CHECK-NEXT: lw a0, 112(sp) +; CHECK-NEXT: lw a1, 116(sp) +; CHECK-NEXT: lw a2, 104(sp) +; CHECK-NEXT: lw a3, 108(sp) ; CHECK-NEXT: lui a4, %hi(Y1) ; CHECK-NEXT: sw a0, %lo(Y1+8)(a4) ; CHECK-NEXT: sw a1, %lo(Y1+12)(a4) ; CHECK-NEXT: sw a2, %lo(Y1)(a4) ; CHECK-NEXT: sw a3, %lo(Y1+4)(a4) -; CHECK-NEXT: lw ra, 684(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s0, 680(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s1, 676(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s2, 672(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s3, 668(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s4, 664(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s5, 660(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s6, 656(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s7, 652(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s8, 648(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s9, 644(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s10, 640(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s11, 636(sp) # 4-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 688 +; CHECK-NEXT: lw ra, 700(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s0, 696(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s1, 692(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s2, 688(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s3, 684(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s4, 680(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s5, 676(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s6, 672(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s7, 668(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s8, 664(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s9, 660(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s10, 656(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s11, 652(sp) # 4-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 
704 ; CHECK-NEXT: ret %1 = load fp128, ptr @U, align 16 %2 = fsub fp128 0xL00000000000000000000000000000000, %1 diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll index 8c0d97afe6c21d..c0b0c010341ca8 100644 --- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll @@ -10,30 +10,30 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill -; RISCV32-NEXT: lw a3, 12(a1) +; RISCV32-NEXT: lw a5, 12(a1) ; RISCV32-NEXT: lw a7, 12(a2) ; RISCV32-NEXT: lw a6, 8(a1) -; RISCV32-NEXT: lw a4, 0(a2) -; RISCV32-NEXT: lw a5, 0(a1) +; RISCV32-NEXT: lw a3, 0(a2) +; RISCV32-NEXT: lw a4, 0(a1) ; RISCV32-NEXT: lw t2, 4(a1) ; RISCV32-NEXT: lw t0, 8(a2) ; RISCV32-NEXT: lw a2, 4(a2) -; RISCV32-NEXT: mulhu a1, a5, a4 -; RISCV32-NEXT: mul t1, t2, a4 +; RISCV32-NEXT: mulhu a1, a4, a3 +; RISCV32-NEXT: mul t1, t2, a3 ; RISCV32-NEXT: add a1, t1, a1 ; RISCV32-NEXT: sltu t1, a1, t1 -; RISCV32-NEXT: mulhu t3, t2, a4 +; RISCV32-NEXT: mulhu t3, t2, a3 ; RISCV32-NEXT: add t4, t3, t1 -; RISCV32-NEXT: mul t1, a5, a2 +; RISCV32-NEXT: mul t1, a4, a2 ; RISCV32-NEXT: add a1, t1, a1 ; RISCV32-NEXT: sltu t1, a1, t1 -; RISCV32-NEXT: mulhu t3, a5, a2 +; RISCV32-NEXT: mulhu t3, a4, a2 ; RISCV32-NEXT: add t1, t3, t1 ; RISCV32-NEXT: add t5, t4, t1 ; RISCV32-NEXT: mul t6, t2, a2 ; RISCV32-NEXT: add s0, t6, t5 -; RISCV32-NEXT: mul t1, t0, a5 -; RISCV32-NEXT: mul s3, a6, a4 +; RISCV32-NEXT: mul t1, t0, a4 +; RISCV32-NEXT: mul s3, a6, a3 ; RISCV32-NEXT: add s4, s3, t1 ; RISCV32-NEXT: add t1, s0, s4 ; RISCV32-NEXT: sltu t3, t1, s0 @@ -43,14 +43,14 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: add t4, t5, t4 ; RISCV32-NEXT: add s0, t4, s0 ; RISCV32-NEXT: mul t4, t2, t0 -; RISCV32-NEXT: mul t5, a7, a5 +; RISCV32-NEXT: mul t5, a7, a4 ; RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: mulhu s1, t0, a5 +; RISCV32-NEXT: mulhu s1, t0, a4 ; RISCV32-NEXT: add s2, s1, t4 ; RISCV32-NEXT: mul t4, a2, a6 -; RISCV32-NEXT: mul t5, a3, a4 +; RISCV32-NEXT: mul t5, a5, a3 ; RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: mulhu t5, a6, a4 +; RISCV32-NEXT: mulhu t5, a6, a3 ; RISCV32-NEXT: add t6, t5, t4 ; RISCV32-NEXT: add t4, t6, s2 ; RISCV32-NEXT: sltu s3, s4, s3 @@ -65,7 +65,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: snez s1, t2 ; RISCV32-NEXT: snez s2, a7 ; RISCV32-NEXT: and s1, s2, s1 -; RISCV32-NEXT: mulhu s2, a7, a5 +; RISCV32-NEXT: mulhu s2, a7, a4 ; RISCV32-NEXT: snez s2, s2 ; RISCV32-NEXT: or s1, s1, s2 ; RISCV32-NEXT: mulhu t2, t2, t0 @@ -74,9 +74,9 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: or t2, t2, s0 ; RISCV32-NEXT: sltu t5, t6, t5 ; RISCV32-NEXT: snez t6, a2 -; RISCV32-NEXT: snez s0, a3 +; RISCV32-NEXT: snez s0, a5 ; RISCV32-NEXT: and t6, s0, t6 -; RISCV32-NEXT: mulhu s0, a3, a4 +; RISCV32-NEXT: mulhu s0, a5, a3 ; RISCV32-NEXT: snez s0, s0 ; RISCV32-NEXT: or t6, t6, s0 ; RISCV32-NEXT: mulhu a2, a2, a6 @@ -85,13 +85,13 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: or a2, a2, t5 ; RISCV32-NEXT: or a7, t0, a7 ; RISCV32-NEXT: snez a7, a7 -; RISCV32-NEXT: or a3, a6, a3 -; RISCV32-NEXT: snez a3, a3 -; RISCV32-NEXT: and a3, a3, a7 -; RISCV32-NEXT: or a2, a3, a2 -; RISCV32-NEXT: or a3, t2, t3 -; RISCV32-NEXT: or a2, a2, a3 -; 
RISCV32-NEXT: mul a3, a5, a4 +; RISCV32-NEXT: or a5, a6, a5 +; RISCV32-NEXT: snez a5, a5 +; RISCV32-NEXT: and a5, a5, a7 +; RISCV32-NEXT: or a2, a5, a2 +; RISCV32-NEXT: or a5, t2, t3 +; RISCV32-NEXT: or a2, a2, a5 +; RISCV32-NEXT: mul a3, a4, a3 ; RISCV32-NEXT: andi a2, a2, 1 ; RISCV32-NEXT: sw a3, 0(a0) ; RISCV32-NEXT: sw a1, 4(a0) diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index 32aca29d16e9b9..b2c9a9aa407d7d 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -327,32 +327,28 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu s1, 0(a1) -; RV32I-NEXT: lhu s2, 4(a1) -; RV32I-NEXT: lhu s3, 8(a1) -; RV32I-NEXT: lhu s4, 12(a1) -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu s0, 0(a1) +; RV32I-NEXT: lhu s1, 4(a1) +; RV32I-NEXT: lhu s2, 8(a1) +; RV32I-NEXT: lhu s3, 12(a1) +; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: mv s5, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: mv s6, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: mv s8, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __udivsi3@plt -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __udivsi3@plt ; RV32I-NEXT: mv s3, a0 @@ -363,14 +359,18 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __udivsi3@plt +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: li a1, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __udivsi3@plt ; RV32I-NEXT: add a0, s8, a0 -; RV32I-NEXT: add s2, s7, s2 -; RV32I-NEXT: add s3, s6, s3 -; RV32I-NEXT: add s4, s5, s4 -; RV32I-NEXT: sh s4, 6(s0) -; RV32I-NEXT: sh s3, 4(s0) -; RV32I-NEXT: sh s2, 2(s0) -; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: add s1, s7, s1 +; RV32I-NEXT: add s2, s6, s2 +; RV32I-NEXT: add s3, s5, s3 +; RV32I-NEXT: sh s3, 6(s4) +; RV32I-NEXT: sh s2, 4(s4) +; RV32I-NEXT: sh s1, 2(s4) +; RV32I-NEXT: sh a0, 0(s4) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -428,32 +428,28 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s6, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s7, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s8, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu s1, 0(a1) -; RV64I-NEXT: lhu s2, 8(a1) -; RV64I-NEXT: lhu s3, 16(a1) -; RV64I-NEXT: lhu s4, 24(a1) -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu s0, 0(a1) +; RV64I-NEXT: lhu s1, 8(a1) +; RV64I-NEXT: lhu s2, 16(a1) +; RV64I-NEXT: lhu s3, 24(a1) +; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s5, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s6, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s1 ; 
RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s7, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s8, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __udivdi3@plt -; RV64I-NEXT: mv s4, a0 -; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __udivdi3@plt ; RV64I-NEXT: mv s3, a0 @@ -464,14 +460,18 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __udivdi3@plt +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: li a1, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __udivdi3@plt ; RV64I-NEXT: add a0, s8, a0 -; RV64I-NEXT: add s2, s7, s2 -; RV64I-NEXT: add s3, s6, s3 -; RV64I-NEXT: add s4, s5, s4 -; RV64I-NEXT: sh s4, 6(s0) -; RV64I-NEXT: sh s3, 4(s0) -; RV64I-NEXT: sh s2, 2(s0) -; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: add s1, s7, s1 +; RV64I-NEXT: add s2, s6, s2 +; RV64I-NEXT: add s3, s5, s3 +; RV64I-NEXT: sh s3, 6(s4) +; RV64I-NEXT: sh s2, 4(s4) +; RV64I-NEXT: sh s1, 2(s4) +; RV64I-NEXT: sh a0, 0(s4) ; RV64I-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 56(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index b0d435368e92bd..f724fab54738e8 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1286,18 +1286,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv a7, a1 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 2(a0) +; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 3(a0) +; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 4(a0) +; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 5(a0) +; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu t2, 7(a0) ; RV64I-NEXT: lbu t3, 8(a0) @@ -1318,19 +1319,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: lbu ra, 24(a0) ; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: lbu a6, 27(a0) -; RV64I-NEXT: lbu a5, 28(a0) -; RV64I-NEXT: lbu a3, 31(a0) -; RV64I-NEXT: lbu a4, 30(a0) +; RV64I-NEXT: lbu a6, 26(a0) +; RV64I-NEXT: lbu a5, 27(a0) +; RV64I-NEXT: lbu a4, 28(a0) +; RV64I-NEXT: lbu a1, 31(a0) +; RV64I-NEXT: lbu a3, 30(a0) ; RV64I-NEXT: lbu a0, 29(a0) -; RV64I-NEXT: lbu a1, 0(a1) -; RV64I-NEXT: sb a3, 87(sp) -; RV64I-NEXT: sb a4, 
86(sp) +; RV64I-NEXT: lbu a7, 0(a7) +; RV64I-NEXT: sb a1, 87(sp) +; RV64I-NEXT: sb a3, 86(sp) ; RV64I-NEXT: sb a0, 85(sp) -; RV64I-NEXT: sb a5, 84(sp) -; RV64I-NEXT: sb a6, 83(sp) -; RV64I-NEXT: sb a7, 82(sp) +; RV64I-NEXT: sb a4, 84(sp) +; RV64I-NEXT: sb a5, 83(sp) +; RV64I-NEXT: sb a6, 82(sp) ; RV64I-NEXT: sb zero, 119(sp) ; RV64I-NEXT: sb zero, 118(sp) ; RV64I-NEXT: sb zero, 117(sp) @@ -1395,52 +1396,52 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 57(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: andi a1, a1, 31 -; RV64I-NEXT: addi a0, sp, 56 -; RV64I-NEXT: add a6, a0, a1 -; RV64I-NEXT: lbu a0, 8(a6) +; RV64I-NEXT: andi a0, a7, 31 +; RV64I-NEXT: addi a1, sp, 56 +; RV64I-NEXT: add a5, a1, a0 +; RV64I-NEXT: lbu a0, 8(a5) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a6) +; RV64I-NEXT: lbu a0, 9(a5) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a6) +; RV64I-NEXT: lbu a0, 10(a5) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a6) +; RV64I-NEXT: lbu a0, 11(a5) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a6) +; RV64I-NEXT: lbu a0, 12(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a6) -; RV64I-NEXT: lbu t0, 14(a6) -; RV64I-NEXT: lbu t1, 15(a6) -; RV64I-NEXT: lbu t2, 0(a6) -; RV64I-NEXT: lbu t3, 1(a6) -; RV64I-NEXT: lbu t4, 2(a6) -; RV64I-NEXT: lbu t5, 3(a6) -; RV64I-NEXT: lbu t6, 4(a6) -; RV64I-NEXT: lbu s0, 5(a6) -; RV64I-NEXT: lbu s1, 6(a6) -; RV64I-NEXT: lbu s2, 7(a6) -; RV64I-NEXT: lbu s3, 24(a6) -; RV64I-NEXT: lbu s4, 25(a6) -; RV64I-NEXT: lbu s5, 26(a6) -; RV64I-NEXT: lbu s6, 27(a6) -; RV64I-NEXT: lbu s7, 28(a6) -; RV64I-NEXT: lbu s8, 29(a6) -; RV64I-NEXT: lbu s9, 30(a6) -; RV64I-NEXT: lbu s10, 31(a6) -; RV64I-NEXT: lbu s11, 16(a6) -; RV64I-NEXT: lbu ra, 17(a6) -; RV64I-NEXT: lbu a5, 18(a6) -; RV64I-NEXT: lbu a4, 19(a6) -; RV64I-NEXT: lbu a0, 23(a6) -; RV64I-NEXT: lbu a1, 22(a6) -; RV64I-NEXT: lbu a3, 21(a6) -; RV64I-NEXT: lbu a6, 20(a6) +; RV64I-NEXT: lbu a7, 13(a5) +; RV64I-NEXT: lbu t0, 14(a5) +; RV64I-NEXT: lbu t1, 15(a5) +; RV64I-NEXT: lbu t2, 0(a5) +; RV64I-NEXT: lbu t3, 1(a5) +; RV64I-NEXT: lbu t4, 2(a5) +; RV64I-NEXT: lbu t5, 3(a5) +; RV64I-NEXT: lbu t6, 4(a5) +; RV64I-NEXT: lbu s0, 5(a5) +; RV64I-NEXT: lbu s1, 6(a5) +; RV64I-NEXT: lbu s2, 7(a5) +; RV64I-NEXT: lbu s3, 24(a5) +; RV64I-NEXT: lbu s4, 25(a5) +; RV64I-NEXT: lbu s5, 26(a5) +; RV64I-NEXT: lbu s6, 27(a5) +; RV64I-NEXT: lbu s7, 28(a5) +; RV64I-NEXT: lbu s8, 29(a5) +; RV64I-NEXT: lbu s9, 30(a5) +; RV64I-NEXT: lbu s10, 31(a5) +; RV64I-NEXT: lbu s11, 16(a5) +; RV64I-NEXT: lbu ra, 17(a5) +; RV64I-NEXT: lbu a6, 18(a5) +; RV64I-NEXT: lbu a4, 19(a5) +; RV64I-NEXT: lbu a0, 23(a5) +; RV64I-NEXT: lbu a1, 22(a5) +; RV64I-NEXT: lbu a3, 21(a5) +; RV64I-NEXT: lbu a5, 20(a5) ; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a6, 20(a2) +; RV64I-NEXT: sb a5, 20(a2) ; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a5, 18(a2) +; RV64I-NEXT: sb a6, 18(a2) ; RV64I-NEXT: sb ra, 17(a2) ; RV64I-NEXT: sb s11, 16(a2) ; RV64I-NEXT: sb s10, 31(a2) @@ -1504,18 +1505,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw 
a3, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv a7, a1 +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 2(a0) +; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 3(a0) +; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 4(a0) +; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 5(a0) +; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: lbu t3, 8(a0) @@ -1536,19 +1538,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: lbu ra, 24(a0) ; RV32I-NEXT: lbu t0, 25(a0) -; RV32I-NEXT: lbu a7, 26(a0) -; RV32I-NEXT: lbu a6, 27(a0) -; RV32I-NEXT: lbu a5, 28(a0) -; RV32I-NEXT: lbu a3, 31(a0) -; RV32I-NEXT: lbu a4, 30(a0) +; RV32I-NEXT: lbu a6, 26(a0) +; RV32I-NEXT: lbu a5, 27(a0) +; RV32I-NEXT: lbu a4, 28(a0) +; RV32I-NEXT: lbu a1, 31(a0) +; RV32I-NEXT: lbu a3, 30(a0) ; RV32I-NEXT: lbu a0, 29(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sb a3, 59(sp) -; RV32I-NEXT: sb a4, 58(sp) +; RV32I-NEXT: lbu a7, 0(a7) +; RV32I-NEXT: sb a1, 59(sp) +; RV32I-NEXT: sb a3, 58(sp) ; RV32I-NEXT: sb a0, 57(sp) -; RV32I-NEXT: sb a5, 56(sp) -; RV32I-NEXT: sb a6, 55(sp) -; RV32I-NEXT: sb a7, 54(sp) +; RV32I-NEXT: sb a4, 56(sp) +; RV32I-NEXT: sb a5, 55(sp) +; RV32I-NEXT: sb a6, 54(sp) ; RV32I-NEXT: sb zero, 91(sp) ; RV32I-NEXT: sb zero, 90(sp) ; RV32I-NEXT: sb zero, 89(sp) @@ -1613,52 +1615,52 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 29(sp) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: andi a1, a1, 31 -; RV32I-NEXT: addi a0, sp, 28 -; RV32I-NEXT: add a6, a0, a1 -; RV32I-NEXT: lbu a0, 6(a6) +; RV32I-NEXT: andi a0, a7, 31 +; RV32I-NEXT: addi a1, sp, 28 +; RV32I-NEXT: add a5, a1, a0 +; RV32I-NEXT: lbu a0, 6(a5) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a6) +; RV32I-NEXT: lbu a0, 7(a5) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a6) +; RV32I-NEXT: lbu a0, 4(a5) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a6) +; RV32I-NEXT: lbu a0, 5(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a6) +; RV32I-NEXT: lbu a0, 0(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a6) -; RV32I-NEXT: lbu t0, 2(a6) -; RV32I-NEXT: lbu t1, 3(a6) -; RV32I-NEXT: lbu t2, 14(a6) -; RV32I-NEXT: lbu t3, 15(a6) -; RV32I-NEXT: lbu t4, 12(a6) -; RV32I-NEXT: lbu t5, 13(a6) -; RV32I-NEXT: lbu t6, 10(a6) -; RV32I-NEXT: lbu s0, 11(a6) -; RV32I-NEXT: lbu s1, 8(a6) -; RV32I-NEXT: lbu s2, 9(a6) -; RV32I-NEXT: lbu s3, 22(a6) -; RV32I-NEXT: lbu s4, 23(a6) -; RV32I-NEXT: lbu s5, 20(a6) -; RV32I-NEXT: lbu s6, 21(a6) -; RV32I-NEXT: lbu s7, 18(a6) -; RV32I-NEXT: lbu s8, 19(a6) -; RV32I-NEXT: lbu s9, 16(a6) -; RV32I-NEXT: lbu s10, 17(a6) -; RV32I-NEXT: lbu s11, 30(a6) -; RV32I-NEXT: lbu 
ra, 31(a6) -; RV32I-NEXT: lbu a5, 28(a6) -; RV32I-NEXT: lbu a4, 29(a6) -; RV32I-NEXT: lbu a0, 25(a6) -; RV32I-NEXT: lbu a1, 24(a6) -; RV32I-NEXT: lbu a3, 27(a6) -; RV32I-NEXT: lbu a6, 26(a6) +; RV32I-NEXT: lbu a7, 1(a5) +; RV32I-NEXT: lbu t0, 2(a5) +; RV32I-NEXT: lbu t1, 3(a5) +; RV32I-NEXT: lbu t2, 14(a5) +; RV32I-NEXT: lbu t3, 15(a5) +; RV32I-NEXT: lbu t4, 12(a5) +; RV32I-NEXT: lbu t5, 13(a5) +; RV32I-NEXT: lbu t6, 10(a5) +; RV32I-NEXT: lbu s0, 11(a5) +; RV32I-NEXT: lbu s1, 8(a5) +; RV32I-NEXT: lbu s2, 9(a5) +; RV32I-NEXT: lbu s3, 22(a5) +; RV32I-NEXT: lbu s4, 23(a5) +; RV32I-NEXT: lbu s5, 20(a5) +; RV32I-NEXT: lbu s6, 21(a5) +; RV32I-NEXT: lbu s7, 18(a5) +; RV32I-NEXT: lbu s8, 19(a5) +; RV32I-NEXT: lbu s9, 16(a5) +; RV32I-NEXT: lbu s10, 17(a5) +; RV32I-NEXT: lbu s11, 30(a5) +; RV32I-NEXT: lbu ra, 31(a5) +; RV32I-NEXT: lbu a6, 28(a5) +; RV32I-NEXT: lbu a4, 29(a5) +; RV32I-NEXT: lbu a0, 25(a5) +; RV32I-NEXT: lbu a1, 24(a5) +; RV32I-NEXT: lbu a3, 27(a5) +; RV32I-NEXT: lbu a5, 26(a5) ; RV32I-NEXT: sb a0, 25(a2) ; RV32I-NEXT: sb a1, 24(a2) ; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a6, 26(a2) +; RV32I-NEXT: sb a5, 26(a2) ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a5, 28(a2) +; RV32I-NEXT: sb a6, 28(a2) ; RV32I-NEXT: sb ra, 31(a2) ; RV32I-NEXT: sb s11, 30(a2) ; RV32I-NEXT: sb s10, 17(a2) @@ -1729,18 +1731,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv a7, a1 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 2(a0) +; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 3(a0) +; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 4(a0) +; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 5(a0) +; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu t2, 7(a0) ; RV64I-NEXT: lbu t3, 8(a0) @@ -1761,19 +1764,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: lbu ra, 24(a0) ; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: lbu a6, 27(a0) -; RV64I-NEXT: lbu a5, 28(a0) -; RV64I-NEXT: lbu a3, 31(a0) -; RV64I-NEXT: lbu a4, 30(a0) +; RV64I-NEXT: lbu a6, 26(a0) +; RV64I-NEXT: lbu a5, 27(a0) +; RV64I-NEXT: lbu a4, 28(a0) +; RV64I-NEXT: lbu a1, 31(a0) +; RV64I-NEXT: lbu a3, 30(a0) ; RV64I-NEXT: lbu a0, 29(a0) -; RV64I-NEXT: lbu a1, 0(a1) -; RV64I-NEXT: sb a3, 119(sp) -; RV64I-NEXT: sb a4, 118(sp) +; RV64I-NEXT: lbu a7, 0(a7) +; RV64I-NEXT: sb a1, 119(sp) +; RV64I-NEXT: sb a3, 118(sp) ; RV64I-NEXT: sb a0, 117(sp) -; RV64I-NEXT: sb a5, 116(sp) -; RV64I-NEXT: sb a6, 115(sp) -; RV64I-NEXT: sb a7, 114(sp) +; RV64I-NEXT: sb a4, 116(sp) +; RV64I-NEXT: sb a5, 115(sp) +; RV64I-NEXT: sb a6, 114(sp) ; 
RV64I-NEXT: sb zero, 87(sp) ; RV64I-NEXT: sb zero, 86(sp) ; RV64I-NEXT: sb zero, 85(sp) @@ -1838,52 +1841,52 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 89(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 88(sp) -; RV64I-NEXT: andi a1, a1, 31 -; RV64I-NEXT: addi a0, sp, 88 -; RV64I-NEXT: sub a6, a0, a1 -; RV64I-NEXT: lbu a0, 8(a6) +; RV64I-NEXT: andi a0, a7, 31 +; RV64I-NEXT: addi a1, sp, 88 +; RV64I-NEXT: sub a5, a1, a0 +; RV64I-NEXT: lbu a0, 8(a5) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a6) +; RV64I-NEXT: lbu a0, 9(a5) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a6) +; RV64I-NEXT: lbu a0, 10(a5) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a6) +; RV64I-NEXT: lbu a0, 11(a5) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a6) +; RV64I-NEXT: lbu a0, 12(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a6) -; RV64I-NEXT: lbu t0, 14(a6) -; RV64I-NEXT: lbu t1, 15(a6) -; RV64I-NEXT: lbu t2, 0(a6) -; RV64I-NEXT: lbu t3, 1(a6) -; RV64I-NEXT: lbu t4, 2(a6) -; RV64I-NEXT: lbu t5, 3(a6) -; RV64I-NEXT: lbu t6, 4(a6) -; RV64I-NEXT: lbu s0, 5(a6) -; RV64I-NEXT: lbu s1, 6(a6) -; RV64I-NEXT: lbu s2, 7(a6) -; RV64I-NEXT: lbu s3, 24(a6) -; RV64I-NEXT: lbu s4, 25(a6) -; RV64I-NEXT: lbu s5, 26(a6) -; RV64I-NEXT: lbu s6, 27(a6) -; RV64I-NEXT: lbu s7, 28(a6) -; RV64I-NEXT: lbu s8, 29(a6) -; RV64I-NEXT: lbu s9, 30(a6) -; RV64I-NEXT: lbu s10, 31(a6) -; RV64I-NEXT: lbu s11, 16(a6) -; RV64I-NEXT: lbu ra, 17(a6) -; RV64I-NEXT: lbu a5, 18(a6) -; RV64I-NEXT: lbu a4, 19(a6) -; RV64I-NEXT: lbu a0, 23(a6) -; RV64I-NEXT: lbu a1, 22(a6) -; RV64I-NEXT: lbu a3, 21(a6) -; RV64I-NEXT: lbu a6, 20(a6) +; RV64I-NEXT: lbu a7, 13(a5) +; RV64I-NEXT: lbu t0, 14(a5) +; RV64I-NEXT: lbu t1, 15(a5) +; RV64I-NEXT: lbu t2, 0(a5) +; RV64I-NEXT: lbu t3, 1(a5) +; RV64I-NEXT: lbu t4, 2(a5) +; RV64I-NEXT: lbu t5, 3(a5) +; RV64I-NEXT: lbu t6, 4(a5) +; RV64I-NEXT: lbu s0, 5(a5) +; RV64I-NEXT: lbu s1, 6(a5) +; RV64I-NEXT: lbu s2, 7(a5) +; RV64I-NEXT: lbu s3, 24(a5) +; RV64I-NEXT: lbu s4, 25(a5) +; RV64I-NEXT: lbu s5, 26(a5) +; RV64I-NEXT: lbu s6, 27(a5) +; RV64I-NEXT: lbu s7, 28(a5) +; RV64I-NEXT: lbu s8, 29(a5) +; RV64I-NEXT: lbu s9, 30(a5) +; RV64I-NEXT: lbu s10, 31(a5) +; RV64I-NEXT: lbu s11, 16(a5) +; RV64I-NEXT: lbu ra, 17(a5) +; RV64I-NEXT: lbu a6, 18(a5) +; RV64I-NEXT: lbu a4, 19(a5) +; RV64I-NEXT: lbu a0, 23(a5) +; RV64I-NEXT: lbu a1, 22(a5) +; RV64I-NEXT: lbu a3, 21(a5) +; RV64I-NEXT: lbu a5, 20(a5) ; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a6, 20(a2) +; RV64I-NEXT: sb a5, 20(a2) ; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a5, 18(a2) +; RV64I-NEXT: sb a6, 18(a2) ; RV64I-NEXT: sb ra, 17(a2) ; RV64I-NEXT: sb s11, 16(a2) ; RV64I-NEXT: sb s10, 31(a2) @@ -1947,18 +1950,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) 
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv a7, a1 +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 2(a0) +; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 3(a0) +; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 4(a0) +; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 5(a0) +; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: lbu t3, 8(a0) @@ -1979,19 +1983,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: lbu ra, 24(a0) ; RV32I-NEXT: lbu t0, 25(a0) -; RV32I-NEXT: lbu a7, 26(a0) -; RV32I-NEXT: lbu a6, 27(a0) -; RV32I-NEXT: lbu a5, 28(a0) -; RV32I-NEXT: lbu a3, 31(a0) -; RV32I-NEXT: lbu a4, 30(a0) +; RV32I-NEXT: lbu a6, 26(a0) +; RV32I-NEXT: lbu a5, 27(a0) +; RV32I-NEXT: lbu a4, 28(a0) +; RV32I-NEXT: lbu a1, 31(a0) +; RV32I-NEXT: lbu a3, 30(a0) ; RV32I-NEXT: lbu a0, 29(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sb a3, 91(sp) -; RV32I-NEXT: sb a4, 90(sp) +; RV32I-NEXT: lbu a7, 0(a7) +; RV32I-NEXT: sb a1, 91(sp) +; RV32I-NEXT: sb a3, 90(sp) ; RV32I-NEXT: sb a0, 89(sp) -; RV32I-NEXT: sb a5, 88(sp) -; RV32I-NEXT: sb a6, 87(sp) -; RV32I-NEXT: sb a7, 86(sp) +; RV32I-NEXT: sb a4, 88(sp) +; RV32I-NEXT: sb a5, 87(sp) +; RV32I-NEXT: sb a6, 86(sp) ; RV32I-NEXT: sb zero, 59(sp) ; RV32I-NEXT: sb zero, 58(sp) ; RV32I-NEXT: sb zero, 57(sp) @@ -2056,52 +2060,52 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 61(sp) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 60(sp) -; RV32I-NEXT: andi a1, a1, 31 -; RV32I-NEXT: addi a0, sp, 60 -; RV32I-NEXT: sub a6, a0, a1 -; RV32I-NEXT: lbu a0, 6(a6) +; RV32I-NEXT: andi a0, a7, 31 +; RV32I-NEXT: addi a1, sp, 60 +; RV32I-NEXT: sub a5, a1, a0 +; RV32I-NEXT: lbu a0, 6(a5) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a6) +; RV32I-NEXT: lbu a0, 7(a5) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a6) +; RV32I-NEXT: lbu a0, 4(a5) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a6) +; RV32I-NEXT: lbu a0, 5(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a6) +; RV32I-NEXT: lbu a0, 0(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a6) -; RV32I-NEXT: lbu t0, 2(a6) -; RV32I-NEXT: lbu t1, 3(a6) -; RV32I-NEXT: lbu t2, 14(a6) -; RV32I-NEXT: lbu t3, 15(a6) -; RV32I-NEXT: lbu t4, 12(a6) -; RV32I-NEXT: lbu t5, 13(a6) -; RV32I-NEXT: lbu t6, 10(a6) -; RV32I-NEXT: lbu s0, 11(a6) -; RV32I-NEXT: lbu s1, 8(a6) -; RV32I-NEXT: lbu s2, 9(a6) -; RV32I-NEXT: lbu s3, 22(a6) -; RV32I-NEXT: lbu s4, 23(a6) -; RV32I-NEXT: lbu s5, 20(a6) -; RV32I-NEXT: lbu s6, 21(a6) -; RV32I-NEXT: lbu s7, 18(a6) -; RV32I-NEXT: lbu s8, 19(a6) -; RV32I-NEXT: lbu s9, 16(a6) -; RV32I-NEXT: lbu s10, 17(a6) -; RV32I-NEXT: lbu s11, 30(a6) -; RV32I-NEXT: lbu ra, 31(a6) -; RV32I-NEXT: lbu a5, 28(a6) -; RV32I-NEXT: lbu a4, 29(a6) -; RV32I-NEXT: lbu a0, 25(a6) -; RV32I-NEXT: lbu a1, 24(a6) -; RV32I-NEXT: lbu a3, 27(a6) -; RV32I-NEXT: lbu a6, 26(a6) +; RV32I-NEXT: lbu a7, 1(a5) +; RV32I-NEXT: lbu t0, 2(a5) +; RV32I-NEXT: lbu t1, 3(a5) +; RV32I-NEXT: lbu t2, 
14(a5) +; RV32I-NEXT: lbu t3, 15(a5) +; RV32I-NEXT: lbu t4, 12(a5) +; RV32I-NEXT: lbu t5, 13(a5) +; RV32I-NEXT: lbu t6, 10(a5) +; RV32I-NEXT: lbu s0, 11(a5) +; RV32I-NEXT: lbu s1, 8(a5) +; RV32I-NEXT: lbu s2, 9(a5) +; RV32I-NEXT: lbu s3, 22(a5) +; RV32I-NEXT: lbu s4, 23(a5) +; RV32I-NEXT: lbu s5, 20(a5) +; RV32I-NEXT: lbu s6, 21(a5) +; RV32I-NEXT: lbu s7, 18(a5) +; RV32I-NEXT: lbu s8, 19(a5) +; RV32I-NEXT: lbu s9, 16(a5) +; RV32I-NEXT: lbu s10, 17(a5) +; RV32I-NEXT: lbu s11, 30(a5) +; RV32I-NEXT: lbu ra, 31(a5) +; RV32I-NEXT: lbu a6, 28(a5) +; RV32I-NEXT: lbu a4, 29(a5) +; RV32I-NEXT: lbu a0, 25(a5) +; RV32I-NEXT: lbu a1, 24(a5) +; RV32I-NEXT: lbu a3, 27(a5) +; RV32I-NEXT: lbu a5, 26(a5) ; RV32I-NEXT: sb a0, 25(a2) ; RV32I-NEXT: sb a1, 24(a2) ; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a6, 26(a2) +; RV32I-NEXT: sb a5, 26(a2) ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a5, 28(a2) +; RV32I-NEXT: sb a6, 28(a2) ; RV32I-NEXT: sb ra, 31(a2) ; RV32I-NEXT: sb s11, 30(a2) ; RV32I-NEXT: sb s10, 17(a2) @@ -2172,8 +2176,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv t0, a1 -; RV64I-NEXT: lbu t1, 31(a0) +; RV64I-NEXT: mv t1, a1 +; RV64I-NEXT: lbu t0, 31(a0) ; RV64I-NEXT: lbu a1, 0(a0) ; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a1, 1(a0) @@ -2211,15 +2215,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a1, 30(a0) ; RV64I-NEXT: lbu a3, 29(a0) ; RV64I-NEXT: lbu a0, 28(a0) -; RV64I-NEXT: lbu t0, 0(t0) +; RV64I-NEXT: lbu t1, 0(t1) ; RV64I-NEXT: sb a1, 86(sp) ; RV64I-NEXT: sb a3, 85(sp) ; RV64I-NEXT: sb a0, 84(sp) ; RV64I-NEXT: sb a4, 83(sp) ; RV64I-NEXT: sb a5, 82(sp) ; RV64I-NEXT: sb a6, 81(sp) -; RV64I-NEXT: sb t1, 87(sp) -; RV64I-NEXT: slli t1, t1, 56 +; RV64I-NEXT: sb t0, 87(sp) +; RV64I-NEXT: slli t0, t0, 56 ; RV64I-NEXT: sb a7, 80(sp) ; RV64I-NEXT: sb ra, 79(sp) ; RV64I-NEXT: sb s11, 78(sp) @@ -2251,7 +2255,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 57(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t1, 63 +; RV64I-NEXT: srai a0, t0, 63 ; RV64I-NEXT: sb a0, 112(sp) ; RV64I-NEXT: sb a0, 104(sp) ; RV64I-NEXT: sb a0, 96(sp) @@ -2291,52 +2295,52 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a6, 91(sp) ; RV64I-NEXT: sb a7, 90(sp) ; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: andi a0, t0, 31 +; RV64I-NEXT: andi a0, t1, 31 ; RV64I-NEXT: addi a1, sp, 56 -; RV64I-NEXT: add a6, a1, a0 -; RV64I-NEXT: lbu a0, 8(a6) +; RV64I-NEXT: add a5, a1, a0 +; RV64I-NEXT: lbu a0, 8(a5) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a6) +; RV64I-NEXT: lbu a0, 9(a5) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a6) +; RV64I-NEXT: lbu a0, 10(a5) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a6) +; RV64I-NEXT: lbu a0, 11(a5) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a6) +; RV64I-NEXT: lbu a0, 12(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a6) -; RV64I-NEXT: lbu t0, 14(a6) -; RV64I-NEXT: lbu t1, 15(a6) -; RV64I-NEXT: lbu t2, 0(a6) -; RV64I-NEXT: lbu t3, 1(a6) -; RV64I-NEXT: lbu t4, 2(a6) -; RV64I-NEXT: lbu t5, 3(a6) -; 
RV64I-NEXT: lbu t6, 4(a6) -; RV64I-NEXT: lbu s0, 5(a6) -; RV64I-NEXT: lbu s1, 6(a6) -; RV64I-NEXT: lbu s2, 7(a6) -; RV64I-NEXT: lbu s3, 24(a6) -; RV64I-NEXT: lbu s4, 25(a6) -; RV64I-NEXT: lbu s5, 26(a6) -; RV64I-NEXT: lbu s6, 27(a6) -; RV64I-NEXT: lbu s7, 28(a6) -; RV64I-NEXT: lbu s8, 29(a6) -; RV64I-NEXT: lbu s9, 30(a6) -; RV64I-NEXT: lbu s10, 31(a6) -; RV64I-NEXT: lbu s11, 16(a6) -; RV64I-NEXT: lbu ra, 17(a6) -; RV64I-NEXT: lbu a5, 18(a6) -; RV64I-NEXT: lbu a4, 19(a6) -; RV64I-NEXT: lbu a0, 23(a6) -; RV64I-NEXT: lbu a1, 22(a6) -; RV64I-NEXT: lbu a3, 21(a6) -; RV64I-NEXT: lbu a6, 20(a6) +; RV64I-NEXT: lbu a7, 13(a5) +; RV64I-NEXT: lbu t0, 14(a5) +; RV64I-NEXT: lbu t1, 15(a5) +; RV64I-NEXT: lbu t2, 0(a5) +; RV64I-NEXT: lbu t3, 1(a5) +; RV64I-NEXT: lbu t4, 2(a5) +; RV64I-NEXT: lbu t5, 3(a5) +; RV64I-NEXT: lbu t6, 4(a5) +; RV64I-NEXT: lbu s0, 5(a5) +; RV64I-NEXT: lbu s1, 6(a5) +; RV64I-NEXT: lbu s2, 7(a5) +; RV64I-NEXT: lbu s3, 24(a5) +; RV64I-NEXT: lbu s4, 25(a5) +; RV64I-NEXT: lbu s5, 26(a5) +; RV64I-NEXT: lbu s6, 27(a5) +; RV64I-NEXT: lbu s7, 28(a5) +; RV64I-NEXT: lbu s8, 29(a5) +; RV64I-NEXT: lbu s9, 30(a5) +; RV64I-NEXT: lbu s10, 31(a5) +; RV64I-NEXT: lbu s11, 16(a5) +; RV64I-NEXT: lbu ra, 17(a5) +; RV64I-NEXT: lbu a6, 18(a5) +; RV64I-NEXT: lbu a4, 19(a5) +; RV64I-NEXT: lbu a0, 23(a5) +; RV64I-NEXT: lbu a1, 22(a5) +; RV64I-NEXT: lbu a3, 21(a5) +; RV64I-NEXT: lbu a5, 20(a5) ; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a6, 20(a2) +; RV64I-NEXT: sb a5, 20(a2) ; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a5, 18(a2) +; RV64I-NEXT: sb a6, 18(a2) ; RV64I-NEXT: sb ra, 17(a2) ; RV64I-NEXT: sb s11, 16(a2) ; RV64I-NEXT: sb s10, 31(a2) @@ -2400,8 +2404,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv t0, a1 -; RV32I-NEXT: lbu t1, 31(a0) +; RV32I-NEXT: mv t1, a1 +; RV32I-NEXT: lbu t0, 31(a0) ; RV32I-NEXT: lbu a1, 0(a0) ; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a1, 1(a0) @@ -2439,15 +2443,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a1, 30(a0) ; RV32I-NEXT: lbu a3, 29(a0) ; RV32I-NEXT: lbu a0, 28(a0) -; RV32I-NEXT: lbu t0, 0(t0) +; RV32I-NEXT: lbu t1, 0(t1) ; RV32I-NEXT: sb a1, 58(sp) ; RV32I-NEXT: sb a3, 57(sp) ; RV32I-NEXT: sb a0, 56(sp) ; RV32I-NEXT: sb a4, 55(sp) ; RV32I-NEXT: sb a5, 54(sp) ; RV32I-NEXT: sb a6, 53(sp) -; RV32I-NEXT: sb t1, 59(sp) -; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: sb t0, 59(sp) +; RV32I-NEXT: slli t0, t0, 24 ; RV32I-NEXT: sb a7, 52(sp) ; RV32I-NEXT: sb ra, 51(sp) ; RV32I-NEXT: sb s11, 50(sp) @@ -2479,7 +2483,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 29(sp) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: srai a0, t1, 31 +; RV32I-NEXT: srai a0, t0, 31 ; RV32I-NEXT: sb a0, 88(sp) ; RV32I-NEXT: sb a0, 84(sp) ; RV32I-NEXT: sb a0, 80(sp) @@ -2515,52 +2519,52 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a1, 63(sp) ; RV32I-NEXT: sb a3, 62(sp) ; RV32I-NEXT: sb a0, 61(sp) -; RV32I-NEXT: andi a0, t0, 31 +; RV32I-NEXT: andi a0, t1, 31 ; RV32I-NEXT: addi a1, sp, 28 -; RV32I-NEXT: add a6, a1, a0 -; RV32I-NEXT: lbu a0, 6(a6) +; RV32I-NEXT: add a5, a1, a0 +; RV32I-NEXT: lbu a0, 6(a5) 
; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a6) +; RV32I-NEXT: lbu a0, 7(a5) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a6) +; RV32I-NEXT: lbu a0, 4(a5) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a6) +; RV32I-NEXT: lbu a0, 5(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a6) +; RV32I-NEXT: lbu a0, 0(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a6) -; RV32I-NEXT: lbu t0, 2(a6) -; RV32I-NEXT: lbu t1, 3(a6) -; RV32I-NEXT: lbu t2, 14(a6) -; RV32I-NEXT: lbu t3, 15(a6) -; RV32I-NEXT: lbu t4, 12(a6) -; RV32I-NEXT: lbu t5, 13(a6) -; RV32I-NEXT: lbu t6, 10(a6) -; RV32I-NEXT: lbu s0, 11(a6) -; RV32I-NEXT: lbu s1, 8(a6) -; RV32I-NEXT: lbu s2, 9(a6) -; RV32I-NEXT: lbu s3, 22(a6) -; RV32I-NEXT: lbu s4, 23(a6) -; RV32I-NEXT: lbu s5, 20(a6) -; RV32I-NEXT: lbu s6, 21(a6) -; RV32I-NEXT: lbu s7, 18(a6) -; RV32I-NEXT: lbu s8, 19(a6) -; RV32I-NEXT: lbu s9, 16(a6) -; RV32I-NEXT: lbu s10, 17(a6) -; RV32I-NEXT: lbu s11, 30(a6) -; RV32I-NEXT: lbu ra, 31(a6) -; RV32I-NEXT: lbu a5, 28(a6) -; RV32I-NEXT: lbu a4, 29(a6) -; RV32I-NEXT: lbu a0, 25(a6) -; RV32I-NEXT: lbu a1, 24(a6) -; RV32I-NEXT: lbu a3, 27(a6) -; RV32I-NEXT: lbu a6, 26(a6) +; RV32I-NEXT: lbu a7, 1(a5) +; RV32I-NEXT: lbu t0, 2(a5) +; RV32I-NEXT: lbu t1, 3(a5) +; RV32I-NEXT: lbu t2, 14(a5) +; RV32I-NEXT: lbu t3, 15(a5) +; RV32I-NEXT: lbu t4, 12(a5) +; RV32I-NEXT: lbu t5, 13(a5) +; RV32I-NEXT: lbu t6, 10(a5) +; RV32I-NEXT: lbu s0, 11(a5) +; RV32I-NEXT: lbu s1, 8(a5) +; RV32I-NEXT: lbu s2, 9(a5) +; RV32I-NEXT: lbu s3, 22(a5) +; RV32I-NEXT: lbu s4, 23(a5) +; RV32I-NEXT: lbu s5, 20(a5) +; RV32I-NEXT: lbu s6, 21(a5) +; RV32I-NEXT: lbu s7, 18(a5) +; RV32I-NEXT: lbu s8, 19(a5) +; RV32I-NEXT: lbu s9, 16(a5) +; RV32I-NEXT: lbu s10, 17(a5) +; RV32I-NEXT: lbu s11, 30(a5) +; RV32I-NEXT: lbu ra, 31(a5) +; RV32I-NEXT: lbu a6, 28(a5) +; RV32I-NEXT: lbu a4, 29(a5) +; RV32I-NEXT: lbu a0, 25(a5) +; RV32I-NEXT: lbu a1, 24(a5) +; RV32I-NEXT: lbu a3, 27(a5) +; RV32I-NEXT: lbu a5, 26(a5) ; RV32I-NEXT: sb a0, 25(a2) ; RV32I-NEXT: sb a1, 24(a2) ; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a6, 26(a2) +; RV32I-NEXT: sb a5, 26(a2) ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a5, 28(a2) +; RV32I-NEXT: sb a6, 28(a2) ; RV32I-NEXT: sb ra, 31(a2) ; RV32I-NEXT: sb s11, 30(a2) ; RV32I-NEXT: sb s10, 17(a2) diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index a601256bc2afaa..6f996f1a3a406f 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -781,9 +781,9 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, a4, a0 -; RV32I-NEXT: andi a4, a1, 7 -; RV32I-NEXT: srl a0, a5, a4 +; RV32I-NEXT: or a4, a4, a0 +; RV32I-NEXT: andi a5, a1, 7 +; RV32I-NEXT: srl a0, a4, a5 ; RV32I-NEXT: lbu a1, 9(a3) ; RV32I-NEXT: lbu a6, 8(a3) ; RV32I-NEXT: lbu a7, 10(a3) @@ -795,7 +795,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a6, t0, a7 ; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a4 +; RV32I-NEXT: not a7, a5 ; RV32I-NEXT: sll a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: lbu a7, 1(a3) @@ -808,12 +808,12 @@ define void @lshr_16bytes(ptr %src.ptr, ptr 
%bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a4 -; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: xori t0, a4, 31 -; RV32I-NEXT: sll a5, a5, t0 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: srl a6, a6, a4 +; RV32I-NEXT: srl a7, a7, a5 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: xori t0, a5, 31 +; RV32I-NEXT: sll a4, a4, t0 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: srl a6, a6, a5 ; RV32I-NEXT: lbu t1, 13(a3) ; RV32I-NEXT: lbu t2, 12(a3) ; RV32I-NEXT: lbu t3, 14(a3) @@ -827,19 +827,19 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t1, a3, 1 ; RV32I-NEXT: sll t0, t1, t0 ; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: srl a3, a3, a4 +; RV32I-NEXT: srl a3, a3, a5 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb a7, 0(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a4, a6, 16 -; RV32I-NEXT: sb a4, 10(a2) -; RV32I-NEXT: srli a4, a6, 8 -; RV32I-NEXT: sb a4, 9(a2) -; RV32I-NEXT: srli a4, a3, 16 -; RV32I-NEXT: sb a4, 14(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: srli a5, a6, 16 +; RV32I-NEXT: sb a5, 10(a2) +; RV32I-NEXT: srli a5, a6, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a5, a3, 16 +; RV32I-NEXT: sb a5, 14(a2) +; RV32I-NEXT: srli a5, a3, 24 +; RV32I-NEXT: sb a5, 15(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 13(a2) ; RV32I-NEXT: srli a3, a7, 16 @@ -852,8 +852,8 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: srli a0, t0, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: sb a4, 3(a2) ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: sb a1, 7(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload @@ -1064,9 +1064,9 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, a4, a0 -; RV32I-NEXT: andi a4, a1, 7 -; RV32I-NEXT: sll a0, a5, a4 +; RV32I-NEXT: or a4, a4, a0 +; RV32I-NEXT: andi a5, a1, 7 +; RV32I-NEXT: sll a0, a4, a5 ; RV32I-NEXT: lbu a1, 1(a3) ; RV32I-NEXT: lbu a6, 0(a3) ; RV32I-NEXT: lbu a7, 2(a3) @@ -1078,7 +1078,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a6, t0, a7 ; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: srli a1, a6, 1 -; RV32I-NEXT: xori a7, a4, 31 +; RV32I-NEXT: xori a7, a5, 31 ; RV32I-NEXT: srl a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: lbu t0, 13(a3) @@ -1091,7 +1091,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t3, t3, 24 ; RV32I-NEXT: or t1, t3, t2 ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: sll t0, t0, a4 +; RV32I-NEXT: sll t0, t0, a5 ; RV32I-NEXT: lbu t1, 9(a3) ; RV32I-NEXT: lbu t2, 8(a3) ; RV32I-NEXT: lbu t3, 10(a3) @@ -1105,13 +1105,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli t1, a3, 1 ; RV32I-NEXT: srl a7, t1, a7 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: sll a3, a3, a4 -; RV32I-NEXT: srli a5, a5, 1 -; RV32I-NEXT: not t1, a4 -; RV32I-NEXT: srl a5, a5, t1 -; RV32I-NEXT: or a5, a3, a5 -; RV32I-NEXT: sll a4, a6, a4 -; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sll a3, a3, a5 +; RV32I-NEXT: srli a4, a4, 1 +; RV32I-NEXT: not t1, a5 +; RV32I-NEXT: srl a4, a4, t1 +; RV32I-NEXT: or a4, a3, a4 +; RV32I-NEXT: 
sll a5, a6, a5 +; RV32I-NEXT: sb a5, 0(a2) ; RV32I-NEXT: srli a6, a3, 16 ; RV32I-NEXT: sb a6, 10(a2) ; RV32I-NEXT: srli a6, a3, 24 @@ -1124,19 +1124,19 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a3, 15(a2) ; RV32I-NEXT: srli a3, t0, 8 ; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: srli a3, a5, 16 ; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: srli a3, a5, 24 ; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 1(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 1(a2) ; RV32I-NEXT: srli a3, a0, 16 ; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: sb a3, 7(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a4, 8(a2) ; RV32I-NEXT: sb a7, 12(a2) ; RV32I-NEXT: sb a1, 4(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload @@ -1353,9 +1353,9 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, a4, a0 -; RV32I-NEXT: andi a4, a1, 7 -; RV32I-NEXT: srl a0, a5, a4 +; RV32I-NEXT: or a4, a4, a0 +; RV32I-NEXT: andi a5, a1, 7 +; RV32I-NEXT: srl a0, a4, a5 ; RV32I-NEXT: lbu a1, 9(a3) ; RV32I-NEXT: lbu a6, 8(a3) ; RV32I-NEXT: lbu a7, 10(a3) @@ -1367,7 +1367,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a6, t0, a7 ; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a4 +; RV32I-NEXT: not a7, a5 ; RV32I-NEXT: sll a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: lbu a7, 1(a3) @@ -1380,12 +1380,12 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a4 -; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: xori t0, a4, 31 -; RV32I-NEXT: sll a5, a5, t0 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: srl a6, a6, a4 +; RV32I-NEXT: srl a7, a7, a5 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: xori t0, a5, 31 +; RV32I-NEXT: sll a4, a4, t0 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: srl a6, a6, a5 ; RV32I-NEXT: lbu t1, 13(a3) ; RV32I-NEXT: lbu t2, 12(a3) ; RV32I-NEXT: lbu t3, 14(a3) @@ -1399,19 +1399,19 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t1, a3, 1 ; RV32I-NEXT: sll t0, t1, t0 ; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: sra a3, a3, a4 +; RV32I-NEXT: sra a3, a3, a5 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb a7, 0(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a4, a6, 16 -; RV32I-NEXT: sb a4, 10(a2) -; RV32I-NEXT: srli a4, a6, 8 -; RV32I-NEXT: sb a4, 9(a2) -; RV32I-NEXT: srli a4, a3, 16 -; RV32I-NEXT: sb a4, 14(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: srli a5, a6, 16 +; RV32I-NEXT: sb a5, 10(a2) +; RV32I-NEXT: srli a5, a6, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a5, a3, 16 +; RV32I-NEXT: sb a5, 14(a2) +; RV32I-NEXT: srli a5, a3, 24 +; RV32I-NEXT: sb a5, 15(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 13(a2) ; RV32I-NEXT: srli a3, a7, 16 @@ -1424,8 +1424,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: srli a0, t0, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: srli a4, a4, 24 +; 
RV32I-NEXT: sb a4, 3(a2) ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: sb a1, 7(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload @@ -1497,13 +1497,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) ; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s11, s11, s9 -; RV64I-NEXT: lbu s9, 4(a1) +; RV64I-NEXT: or s9, s11, s9 +; RV64I-NEXT: lbu s11, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 ; RV64I-NEXT: lbu ra, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s9 -; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: or s10, s10, s11 +; RV64I-NEXT: lbu s11, 21(a0) ; RV64I-NEXT: slli ra, ra, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, ra @@ -1511,8 +1511,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 23(a0) ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, s11 -; RV64I-NEXT: lbu s11, 24(a0) +; RV64I-NEXT: or t0, a1, s9 +; RV64I-NEXT: lbu s9, 24(a0) ; RV64I-NEXT: lbu a7, 25(a0) ; RV64I-NEXT: lbu a6, 26(a0) ; RV64I-NEXT: lbu a5, 27(a0) @@ -1527,10 +1527,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 83(sp) ; RV64I-NEXT: sb a6, 82(sp) ; RV64I-NEXT: sb a7, 81(sp) -; RV64I-NEXT: sb s11, 80(sp) +; RV64I-NEXT: sb s9, 80(sp) ; RV64I-NEXT: sb s10, 79(sp) ; RV64I-NEXT: sb ra, 78(sp) -; RV64I-NEXT: sb s9, 77(sp) +; RV64I-NEXT: sb s11, 77(sp) ; RV64I-NEXT: sb s8, 76(sp) ; RV64I-NEXT: sb s7, 75(sp) ; RV64I-NEXT: sb s6, 74(sp) @@ -1816,21 +1816,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s5, 17(a0) ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s10, 1(a1) -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s8, 1(a1) +; RV32I-NEXT: lbu s9, 20(a0) +; RV32I-NEXT: lbu s10, 21(a0) ; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s8, s8, 8 ; RV32I-NEXT: lbu ra, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s10, s10, s11 +; RV32I-NEXT: or s8, s8, s11 ; RV32I-NEXT: lbu s11, 22(a0) ; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, ra ; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s10 -; RV32I-NEXT: lbu s10, 24(a0) +; RV32I-NEXT: or t0, a1, s8 +; RV32I-NEXT: lbu s8, 24(a0) ; RV32I-NEXT: lbu a7, 25(a0) ; RV32I-NEXT: lbu a6, 26(a0) ; RV32I-NEXT: lbu a5, 27(a0) @@ -1845,11 +1845,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a5, 55(sp) ; RV32I-NEXT: sb a6, 54(sp) ; RV32I-NEXT: sb a7, 53(sp) -; RV32I-NEXT: sb s10, 52(sp) +; RV32I-NEXT: sb s8, 52(sp) ; RV32I-NEXT: sb ra, 51(sp) ; RV32I-NEXT: sb s11, 50(sp) -; RV32I-NEXT: sb s9, 49(sp) -; RV32I-NEXT: sb s8, 48(sp) +; RV32I-NEXT: sb s10, 49(sp) +; RV32I-NEXT: sb s9, 48(sp) ; RV32I-NEXT: sb s7, 47(sp) ; RV32I-NEXT: sb s6, 46(sp) ; RV32I-NEXT: sb s5, 45(sp) @@ -1910,123 +1910,123 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 28(sp) ; RV32I-NEXT: slli a0, t0, 24 ; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a4, sp, 28 -; RV32I-NEXT: add a4, a4, a0 -; RV32I-NEXT: lbu a0, 5(a4) -; RV32I-NEXT: lbu a1, 4(a4) -; RV32I-NEXT: lbu a3, 6(a4) -; RV32I-NEXT: lbu a5, 7(a4) +; RV32I-NEXT: addi a5, sp, 28 +; RV32I-NEXT: add a5, a5, a0 +; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a1, 4(a5) +; RV32I-NEXT: lbu a3, 6(a5) +; RV32I-NEXT: lbu 
a4, 7(a5) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t5, a3, a0 +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or t3, a3, a0 ; RV32I-NEXT: andi a3, t0, 7 -; RV32I-NEXT: lbu a0, 9(a4) -; RV32I-NEXT: lbu a1, 8(a4) -; RV32I-NEXT: lbu a5, 10(a4) -; RV32I-NEXT: lbu a6, 11(a4) +; RV32I-NEXT: lbu a0, 9(a5) +; RV32I-NEXT: lbu a1, 8(a5) +; RV32I-NEXT: lbu a4, 10(a5) +; RV32I-NEXT: lbu a6, 11(a5) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a1, a6, a5 +; RV32I-NEXT: or a1, a6, a4 ; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t1, a3 -; RV32I-NEXT: sll a0, a0, t1 -; RV32I-NEXT: lbu a1, 1(a4) -; RV32I-NEXT: lbu a5, 0(a4) -; RV32I-NEXT: lbu a7, 2(a4) -; RV32I-NEXT: lbu t0, 3(a4) +; RV32I-NEXT: not t2, a3 +; RV32I-NEXT: sll a0, a0, t2 +; RV32I-NEXT: lbu a1, 1(a5) +; RV32I-NEXT: lbu a4, 0(a5) +; RV32I-NEXT: lbu a7, 2(a5) +; RV32I-NEXT: lbu t0, 3(a5) ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or t0, a5, a1 -; RV32I-NEXT: slli a1, t5, 1 -; RV32I-NEXT: xori t2, a3, 31 -; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: lbu a5, 13(a4) -; RV32I-NEXT: lbu a7, 12(a4) -; RV32I-NEXT: lbu t3, 14(a4) -; RV32I-NEXT: lbu t4, 15(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t3, a7, a5 -; RV32I-NEXT: lbu a5, 17(a4) -; RV32I-NEXT: lbu a7, 16(a4) -; RV32I-NEXT: lbu t4, 18(a4) -; RV32I-NEXT: lbu t6, 19(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or a4, t0, a7 +; RV32I-NEXT: or t0, a4, a1 +; RV32I-NEXT: slli a1, t3, 1 +; RV32I-NEXT: xori t1, a3, 31 +; RV32I-NEXT: sll a1, a1, t1 +; RV32I-NEXT: lbu a4, 13(a5) +; RV32I-NEXT: lbu a7, 12(a5) +; RV32I-NEXT: lbu t4, 14(a5) +; RV32I-NEXT: lbu t5, 15(a5) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a7 ; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: lbu a7, 17(a5) +; RV32I-NEXT: lbu t4, 16(a5) +; RV32I-NEXT: lbu t5, 18(a5) +; RV32I-NEXT: lbu t6, 19(a5) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t4 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t4 -; RV32I-NEXT: or t4, a7, a5 -; RV32I-NEXT: slli a5, t4, 1 -; RV32I-NEXT: sll a7, a5, t1 -; RV32I-NEXT: lbu a5, 21(a4) -; RV32I-NEXT: lbu t6, 20(a4) -; RV32I-NEXT: lbu s0, 22(a4) -; RV32I-NEXT: lbu s1, 23(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 +; RV32I-NEXT: or t4, t6, t5 +; RV32I-NEXT: or t4, t4, a7 +; RV32I-NEXT: slli a7, t4, 1 +; RV32I-NEXT: sll a7, a7, t2 +; RV32I-NEXT: lbu t5, 21(a5) +; RV32I-NEXT: lbu t6, 20(a5) +; RV32I-NEXT: lbu s0, 22(a5) +; RV32I-NEXT: lbu s1, 23(a5) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: or t5, t5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or s0, s0, a5 -; RV32I-NEXT: lbu a5, 25(a4) -; RV32I-NEXT: lbu t6, 24(a4) -; RV32I-NEXT: lbu s1, 26(a4) -; RV32I-NEXT: lbu s2, 27(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 +; RV32I-NEXT: or t5, s0, t5 +; RV32I-NEXT: lbu t6, 
25(a5) +; RV32I-NEXT: lbu s0, 24(a5) +; RV32I-NEXT: lbu s1, 26(a5) +; RV32I-NEXT: lbu s2, 27(a5) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t6, t6, s0 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or t6, s2, s1 -; RV32I-NEXT: or t6, t6, a5 -; RV32I-NEXT: lbu a5, 29(a4) -; RV32I-NEXT: lbu s1, 28(a4) +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: lbu s0, 29(a5) +; RV32I-NEXT: lbu s1, 28(a5) ; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t1, s2, t1 -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, s1 -; RV32I-NEXT: lbu s1, 30(a4) -; RV32I-NEXT: lbu a4, 31(a4) -; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: sll s2, s2, t2 +; RV32I-NEXT: sll t2, s2, t2 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: or s0, s0, s1 +; RV32I-NEXT: lbu s1, 30(a5) +; RV32I-NEXT: lbu a5, 31(a5) +; RV32I-NEXT: slli s2, a4, 1 +; RV32I-NEXT: sll s2, s2, t1 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: slli s1, s0, 1 -; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or s3, a4, a5 -; RV32I-NEXT: slli a4, s3, 1 -; RV32I-NEXT: sll t2, a4, t2 -; RV32I-NEXT: srl a4, t5, a3 -; RV32I-NEXT: srl a5, t0, a3 -; RV32I-NEXT: srl t0, t3, a3 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a5, a5, s1 +; RV32I-NEXT: slli s1, t5, 1 +; RV32I-NEXT: sll s1, s1, t1 +; RV32I-NEXT: or a5, a5, s0 +; RV32I-NEXT: slli s0, a5, 1 +; RV32I-NEXT: sll t1, s0, t1 +; RV32I-NEXT: srl t3, t3, a3 +; RV32I-NEXT: srl t0, t0, a3 +; RV32I-NEXT: srl a4, a4, a3 ; RV32I-NEXT: srl a6, a6, a3 -; RV32I-NEXT: srl t3, s0, a3 +; RV32I-NEXT: srl t5, t5, a3 ; RV32I-NEXT: srl t4, t4, a3 -; RV32I-NEXT: srl t5, t6, a3 -; RV32I-NEXT: srl a3, s3, a3 -; RV32I-NEXT: srli t6, t5, 16 -; RV32I-NEXT: sb t6, 26(a2) -; RV32I-NEXT: or t2, t5, t2 -; RV32I-NEXT: sb t5, 24(a2) -; RV32I-NEXT: srli t5, t5, 8 -; RV32I-NEXT: sb t5, 25(a2) -; RV32I-NEXT: srli t5, a3, 24 -; RV32I-NEXT: sb t5, 31(a2) -; RV32I-NEXT: srli t5, a3, 16 -; RV32I-NEXT: sb t5, 30(a2) +; RV32I-NEXT: srl t6, t6, a3 +; RV32I-NEXT: srl a3, a5, a3 +; RV32I-NEXT: srli a5, t6, 16 +; RV32I-NEXT: sb a5, 26(a2) +; RV32I-NEXT: or a5, t6, t1 +; RV32I-NEXT: sb t6, 24(a2) +; RV32I-NEXT: srli t1, t6, 8 +; RV32I-NEXT: sb t1, 25(a2) +; RV32I-NEXT: srli t1, a3, 24 +; RV32I-NEXT: sb t1, 31(a2) +; RV32I-NEXT: srli t1, a3, 16 +; RV32I-NEXT: sb t1, 30(a2) ; RV32I-NEXT: sb a3, 28(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 29(a2) @@ -2034,45 +2034,45 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a3, 18(a2) ; RV32I-NEXT: or a3, t4, s1 ; RV32I-NEXT: sb t4, 16(a2) -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t4, 17(a2) -; RV32I-NEXT: srli t4, t3, 16 -; RV32I-NEXT: sb t4, 22(a2) -; RV32I-NEXT: or t1, t3, t1 -; RV32I-NEXT: sb t3, 20(a2) -; RV32I-NEXT: srli t3, t3, 8 -; RV32I-NEXT: sb t3, 21(a2) -; RV32I-NEXT: srli t3, a6, 16 -; RV32I-NEXT: sb t3, 10(a2) -; RV32I-NEXT: or t3, a6, s2 +; RV32I-NEXT: srli t1, t4, 8 +; RV32I-NEXT: sb t1, 17(a2) +; RV32I-NEXT: srli t1, t5, 16 +; RV32I-NEXT: sb t1, 22(a2) +; RV32I-NEXT: or t1, t5, t2 +; RV32I-NEXT: sb t5, 20(a2) +; RV32I-NEXT: srli t2, t5, 8 +; RV32I-NEXT: sb t2, 21(a2) +; RV32I-NEXT: srli t2, a6, 16 +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: or t2, a6, s2 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: srli a6, a6, 8 ; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: srli a6, a4, 16 ; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: sb t0, 12(a2) -; RV32I-NEXT: srli a7, t0, 8 -; 
RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, a5, a1 -; RV32I-NEXT: sb a5, 0(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: srli a5, a4, 16 -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: or a6, a4, a7 +; RV32I-NEXT: sb a4, 12(a2) ; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: srli a4, t0, 16 +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: or a1, t0, a1 +; RV32I-NEXT: sb t0, 0(a2) +; RV32I-NEXT: srli a4, t0, 8 +; RV32I-NEXT: sb a4, 1(a2) +; RV32I-NEXT: srli a4, t3, 16 +; RV32I-NEXT: sb a4, 6(a2) +; RV32I-NEXT: or a0, t3, a0 +; RV32I-NEXT: sb t3, 4(a2) +; RV32I-NEXT: srli a4, t3, 8 ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: srli a4, t2, 24 -; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: srli a5, a5, 24 +; RV32I-NEXT: sb a5, 27(a2) ; RV32I-NEXT: srli a3, a3, 24 ; RV32I-NEXT: sb a3, 19(a2) ; RV32I-NEXT: srli a3, t1, 24 ; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: srli a3, t3, 24 +; RV32I-NEXT: srli a3, t2, 24 ; RV32I-NEXT: sb a3, 11(a2) ; RV32I-NEXT: srli a3, a6, 24 ; RV32I-NEXT: sb a3, 15(a2) @@ -2155,13 +2155,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) ; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s11, s11, s9 -; RV64I-NEXT: lbu s9, 4(a1) +; RV64I-NEXT: or s9, s11, s9 +; RV64I-NEXT: lbu s11, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 ; RV64I-NEXT: lbu ra, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s9 -; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: or s10, s10, s11 +; RV64I-NEXT: lbu s11, 21(a0) ; RV64I-NEXT: slli ra, ra, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, ra @@ -2169,8 +2169,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 23(a0) ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, s11 -; RV64I-NEXT: lbu s11, 24(a0) +; RV64I-NEXT: or t0, a1, s9 +; RV64I-NEXT: lbu s9, 24(a0) ; RV64I-NEXT: lbu a7, 25(a0) ; RV64I-NEXT: lbu a6, 26(a0) ; RV64I-NEXT: lbu a5, 27(a0) @@ -2185,10 +2185,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 115(sp) ; RV64I-NEXT: sb a6, 114(sp) ; RV64I-NEXT: sb a7, 113(sp) -; RV64I-NEXT: sb s11, 112(sp) +; RV64I-NEXT: sb s9, 112(sp) ; RV64I-NEXT: sb s10, 111(sp) ; RV64I-NEXT: sb ra, 110(sp) -; RV64I-NEXT: sb s9, 109(sp) +; RV64I-NEXT: sb s11, 109(sp) ; RV64I-NEXT: sb s8, 108(sp) ; RV64I-NEXT: sb s7, 107(sp) ; RV64I-NEXT: sb s6, 106(sp) @@ -2474,21 +2474,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s5, 17(a0) ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s10, 1(a1) -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s8, 1(a1) +; RV32I-NEXT: lbu s9, 20(a0) +; RV32I-NEXT: lbu s10, 21(a0) ; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s8, s8, 8 ; RV32I-NEXT: lbu ra, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s10, s10, s11 +; RV32I-NEXT: or s8, s8, s11 ; RV32I-NEXT: lbu s11, 22(a0) ; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, ra ; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s10 -; RV32I-NEXT: lbu s10, 24(a0) +; RV32I-NEXT: or t0, a1, s8 +; RV32I-NEXT: lbu s8, 24(a0) ; RV32I-NEXT: lbu a7, 25(a0) ; RV32I-NEXT: lbu a6, 26(a0) ; RV32I-NEXT: lbu a5, 27(a0) 
@@ -2503,11 +2503,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a5, 87(sp) ; RV32I-NEXT: sb a6, 86(sp) ; RV32I-NEXT: sb a7, 85(sp) -; RV32I-NEXT: sb s10, 84(sp) +; RV32I-NEXT: sb s8, 84(sp) ; RV32I-NEXT: sb ra, 83(sp) ; RV32I-NEXT: sb s11, 82(sp) -; RV32I-NEXT: sb s9, 81(sp) -; RV32I-NEXT: sb s8, 80(sp) +; RV32I-NEXT: sb s10, 81(sp) +; RV32I-NEXT: sb s9, 80(sp) ; RV32I-NEXT: sb s7, 79(sp) ; RV32I-NEXT: sb s6, 78(sp) ; RV32I-NEXT: sb s5, 77(sp) @@ -2568,125 +2568,125 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 60(sp) ; RV32I-NEXT: slli a0, t0, 24 ; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a4, sp, 60 -; RV32I-NEXT: sub a4, a4, a0 -; RV32I-NEXT: lbu a0, 5(a4) -; RV32I-NEXT: lbu a1, 4(a4) -; RV32I-NEXT: lbu a3, 6(a4) -; RV32I-NEXT: lbu a5, 7(a4) +; RV32I-NEXT: addi a5, sp, 60 +; RV32I-NEXT: sub a5, a5, a0 +; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a1, 4(a5) +; RV32I-NEXT: lbu a3, 6(a5) +; RV32I-NEXT: lbu a4, 7(a5) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t5, a3, a0 -; RV32I-NEXT: andi a1, t0, 7 -; RV32I-NEXT: lbu a0, 1(a4) -; RV32I-NEXT: lbu a3, 0(a4) -; RV32I-NEXT: lbu a5, 2(a4) -; RV32I-NEXT: lbu a6, 3(a4) +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or t2, a3, a0 +; RV32I-NEXT: andi a3, t0, 7 +; RV32I-NEXT: lbu a0, 1(a5) +; RV32I-NEXT: lbu a1, 0(a5) +; RV32I-NEXT: lbu a4, 2(a5) +; RV32I-NEXT: lbu a6, 3(a5) ; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a6, a5 -; RV32I-NEXT: or a6, a3, a0 +; RV32I-NEXT: or a1, a6, a4 +; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: srli a0, a6, 1 -; RV32I-NEXT: xori a7, a1, 31 +; RV32I-NEXT: xori a7, a3, 31 ; RV32I-NEXT: srl a0, a0, a7 -; RV32I-NEXT: lbu a3, 13(a4) -; RV32I-NEXT: lbu a5, 12(a4) -; RV32I-NEXT: lbu t0, 14(a4) -; RV32I-NEXT: lbu t1, 15(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: lbu a1, 13(a5) +; RV32I-NEXT: lbu a4, 12(a5) +; RV32I-NEXT: lbu t0, 14(a5) +; RV32I-NEXT: lbu t1, 15(a5) +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t1, t0 -; RV32I-NEXT: or t0, a5, a3 -; RV32I-NEXT: lbu a3, 9(a4) -; RV32I-NEXT: lbu a5, 8(a4) -; RV32I-NEXT: lbu t1, 10(a4) -; RV32I-NEXT: lbu t2, 11(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: or a4, t1, t0 +; RV32I-NEXT: or t0, a4, a1 +; RV32I-NEXT: lbu a1, 9(a5) +; RV32I-NEXT: lbu a4, 8(a5) +; RV32I-NEXT: lbu t1, 10(a5) +; RV32I-NEXT: lbu t3, 11(a5) +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a5, t2, t1 -; RV32I-NEXT: or t1, a5, a3 -; RV32I-NEXT: srli a3, t1, 1 -; RV32I-NEXT: srl a5, a3, a7 -; RV32I-NEXT: srli t4, t5, 1 -; RV32I-NEXT: not t2, a1 -; RV32I-NEXT: lbu a3, 21(a4) -; RV32I-NEXT: lbu t3, 20(a4) -; RV32I-NEXT: lbu t6, 22(a4) -; RV32I-NEXT: lbu s0, 23(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, t3 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or a4, t3, t1 +; RV32I-NEXT: or t3, a4, a1 +; RV32I-NEXT: srli a1, t3, 1 +; RV32I-NEXT: srl a1, a1, a7 +; RV32I-NEXT: srli a4, t2, 1 +; RV32I-NEXT: not t4, a3 +; RV32I-NEXT: lbu t1, 
21(a5) +; RV32I-NEXT: lbu t5, 20(a5) +; RV32I-NEXT: lbu t6, 22(a5) +; RV32I-NEXT: lbu s0, 23(a5) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t5 ; RV32I-NEXT: slli t6, t6, 16 ; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or t3, s0, t6 -; RV32I-NEXT: or t3, t3, a3 -; RV32I-NEXT: lbu a3, 17(a4) -; RV32I-NEXT: lbu t6, 16(a4) -; RV32I-NEXT: lbu s0, 18(a4) -; RV32I-NEXT: lbu s1, 19(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, t6 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or t5, t5, t1 +; RV32I-NEXT: lbu t1, 17(a5) +; RV32I-NEXT: lbu t6, 16(a5) +; RV32I-NEXT: lbu s0, 18(a5) +; RV32I-NEXT: lbu s1, 19(a5) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or s0, s0, a3 -; RV32I-NEXT: lbu a3, 29(a4) -; RV32I-NEXT: lbu t6, 28(a4) -; RV32I-NEXT: lbu s1, 30(a4) -; RV32I-NEXT: lbu s2, 31(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, t6 +; RV32I-NEXT: or t6, s0, t1 +; RV32I-NEXT: lbu t1, 29(a5) +; RV32I-NEXT: lbu s0, 28(a5) +; RV32I-NEXT: lbu s1, 30(a5) +; RV32I-NEXT: lbu s2, 31(a5) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or s0, t1, s0 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or t6, s2, s1 -; RV32I-NEXT: lbu s1, 25(a4) -; RV32I-NEXT: lbu s2, 24(a4) -; RV32I-NEXT: srl t4, t4, t2 -; RV32I-NEXT: or t6, t6, a3 -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or a3, s1, s2 -; RV32I-NEXT: lbu s1, 26(a4) -; RV32I-NEXT: lbu a4, 27(a4) -; RV32I-NEXT: srli s2, s0, 1 +; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: lbu s2, 25(a5) +; RV32I-NEXT: lbu s3, 24(a5) +; RV32I-NEXT: srl t1, a4, t4 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: or a4, s2, s3 +; RV32I-NEXT: lbu s1, 26(a5) +; RV32I-NEXT: lbu a5, 27(a5) +; RV32I-NEXT: srli s2, t6, 1 ; RV32I-NEXT: srl s2, s2, a7 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a5, a5, s1 ; RV32I-NEXT: srli s1, t0, 1 -; RV32I-NEXT: srl s1, s1, t2 -; RV32I-NEXT: or a4, a4, a3 -; RV32I-NEXT: srli a3, a4, 1 -; RV32I-NEXT: srl a7, a3, a7 -; RV32I-NEXT: srli a3, t3, 1 -; RV32I-NEXT: srl t2, a3, t2 -; RV32I-NEXT: sll a3, t5, a1 -; RV32I-NEXT: sll t0, t0, a1 -; RV32I-NEXT: sll t1, t1, a1 -; RV32I-NEXT: sll t3, t3, a1 -; RV32I-NEXT: sll t5, s0, a1 -; RV32I-NEXT: sll t6, t6, a1 -; RV32I-NEXT: sll a4, a4, a1 -; RV32I-NEXT: sll a1, a6, a1 -; RV32I-NEXT: srli a6, a4, 24 +; RV32I-NEXT: srl s1, s1, t4 +; RV32I-NEXT: or a5, a5, a4 +; RV32I-NEXT: srli a4, a5, 1 +; RV32I-NEXT: srl a7, a4, a7 +; RV32I-NEXT: srli a4, t5, 1 +; RV32I-NEXT: srl t4, a4, t4 +; RV32I-NEXT: sll a4, t2, a3 +; RV32I-NEXT: sll t0, t0, a3 +; RV32I-NEXT: sll t2, t3, a3 +; RV32I-NEXT: sll t3, t5, a3 +; RV32I-NEXT: sll t5, t6, a3 +; RV32I-NEXT: sll t6, s0, a3 +; RV32I-NEXT: sll a5, a5, a3 +; RV32I-NEXT: sll a3, a6, a3 +; RV32I-NEXT: srli a6, a5, 24 ; RV32I-NEXT: sb a6, 27(a2) -; RV32I-NEXT: srli a6, a4, 16 +; RV32I-NEXT: srli a6, a5, 16 ; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: or a6, a4, t2 -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 25(a2) -; RV32I-NEXT: srli a4, t6, 24 -; RV32I-NEXT: sb a4, 31(a2) -; RV32I-NEXT: srli a4, t6, 16 -; RV32I-NEXT: sb a4, 30(a2) -; RV32I-NEXT: or a4, t6, a7 +; RV32I-NEXT: or a6, a5, t4 +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 25(a2) +; RV32I-NEXT: srli a5, t6, 24 +; RV32I-NEXT: sb a5, 31(a2) +; RV32I-NEXT: srli a5, t6, 16 +; RV32I-NEXT: sb a5, 30(a2) +; RV32I-NEXT: or a5, t6, a7 ; 
RV32I-NEXT: srli a7, t6, 8 ; RV32I-NEXT: sb a7, 29(a2) ; RV32I-NEXT: srli a7, t5, 24 @@ -2694,49 +2694,49 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli a7, t5, 16 ; RV32I-NEXT: sb a7, 18(a2) ; RV32I-NEXT: or a7, t5, s1 -; RV32I-NEXT: srli t2, t5, 8 -; RV32I-NEXT: sb t2, 17(a2) -; RV32I-NEXT: srli t2, t3, 24 -; RV32I-NEXT: sb t2, 23(a2) -; RV32I-NEXT: srli t2, t3, 16 -; RV32I-NEXT: sb t2, 22(a2) -; RV32I-NEXT: or t2, t3, s2 +; RV32I-NEXT: srli t4, t5, 8 +; RV32I-NEXT: sb t4, 17(a2) +; RV32I-NEXT: srli t4, t3, 24 +; RV32I-NEXT: sb t4, 23(a2) +; RV32I-NEXT: srli t4, t3, 16 +; RV32I-NEXT: sb t4, 22(a2) +; RV32I-NEXT: or t4, t3, s2 ; RV32I-NEXT: srli t3, t3, 8 ; RV32I-NEXT: sb t3, 21(a2) -; RV32I-NEXT: srli t3, t1, 24 +; RV32I-NEXT: srli t3, t2, 24 ; RV32I-NEXT: sb t3, 11(a2) -; RV32I-NEXT: srli t3, t1, 16 +; RV32I-NEXT: srli t3, t2, 16 ; RV32I-NEXT: sb t3, 10(a2) -; RV32I-NEXT: or t3, t1, t4 -; RV32I-NEXT: srli t1, t1, 8 -; RV32I-NEXT: sb t1, 9(a2) -; RV32I-NEXT: srli t1, t0, 24 -; RV32I-NEXT: sb t1, 15(a2) -; RV32I-NEXT: srli t1, t0, 16 -; RV32I-NEXT: sb t1, 14(a2) -; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: srli t2, t2, 8 +; RV32I-NEXT: sb t2, 9(a2) +; RV32I-NEXT: srli t2, t0, 24 +; RV32I-NEXT: sb t2, 15(a2) +; RV32I-NEXT: srli t2, t0, 16 +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: or a1, t0, a1 ; RV32I-NEXT: srli t0, t0, 8 ; RV32I-NEXT: sb t0, 13(a2) -; RV32I-NEXT: srli t0, a1, 24 +; RV32I-NEXT: srli t0, a3, 24 ; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: srli t0, a1, 16 +; RV32I-NEXT: srli t0, a3, 16 ; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 1(a2) -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: srli a1, a3, 16 -; RV32I-NEXT: sb a1, 6(a2) -; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: sb a3, 0(a2) ; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 5(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: sb a3, 7(a2) +; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: sb a3, 6(a2) +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 5(a2) ; RV32I-NEXT: sb a6, 24(a2) -; RV32I-NEXT: sb a4, 28(a2) +; RV32I-NEXT: sb a5, 28(a2) ; RV32I-NEXT: sb a7, 16(a2) -; RV32I-NEXT: sb t2, 20(a2) -; RV32I-NEXT: sb t3, 8(a2) -; RV32I-NEXT: sb a5, 12(a2) +; RV32I-NEXT: sb t4, 20(a2) +; RV32I-NEXT: sb t1, 8(a2) +; RV32I-NEXT: sb a1, 12(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload @@ -2776,7 +2776,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t1, 31(a0) +; RV64I-NEXT: lbu t0, 31(a0) ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 1(a0) @@ -2789,31 +2789,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 5(a0) ; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t3, 6(a0) -; RV64I-NEXT: lbu t4, 7(a0) -; RV64I-NEXT: lbu t5, 8(a0) -; RV64I-NEXT: lbu t6, 9(a0) -; RV64I-NEXT: lbu s0, 10(a0) -; RV64I-NEXT: lbu s1, 11(a0) -; RV64I-NEXT: lbu s2, 12(a0) -; RV64I-NEXT: lbu s3, 13(a0) -; RV64I-NEXT: lbu s4, 14(a0) -; RV64I-NEXT: lbu s5, 15(a0) -; 
RV64I-NEXT: lbu s6, 16(a0) -; RV64I-NEXT: lbu s7, 17(a0) -; RV64I-NEXT: lbu s8, 18(a0) -; RV64I-NEXT: lbu s9, 19(a0) -; RV64I-NEXT: lbu a3, 1(a1) +; RV64I-NEXT: lbu t2, 6(a0) +; RV64I-NEXT: lbu t3, 7(a0) +; RV64I-NEXT: lbu t4, 8(a0) +; RV64I-NEXT: lbu t5, 9(a0) +; RV64I-NEXT: lbu t6, 10(a0) +; RV64I-NEXT: lbu s0, 11(a0) +; RV64I-NEXT: lbu s1, 12(a0) +; RV64I-NEXT: lbu s2, 13(a0) +; RV64I-NEXT: lbu s3, 14(a0) +; RV64I-NEXT: lbu s4, 15(a0) +; RV64I-NEXT: lbu s5, 16(a0) +; RV64I-NEXT: lbu s6, 17(a0) +; RV64I-NEXT: lbu s7, 18(a0) +; RV64I-NEXT: lbu s8, 19(a0) +; RV64I-NEXT: lbu s9, 1(a1) ; RV64I-NEXT: lbu s10, 0(a1) ; RV64I-NEXT: lbu s11, 2(a1) ; RV64I-NEXT: lbu ra, 3(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, s10 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s9, s9, s10 ; RV64I-NEXT: slli s11, s11, 16 ; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) ; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or a3, s11, a3 +; RV64I-NEXT: or s9, s11, s9 ; RV64I-NEXT: lbu s11, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 ; RV64I-NEXT: lbu ra, 6(a1) @@ -2827,8 +2827,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 22(a0) ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t2, a1, a3 -; RV64I-NEXT: lbu t0, 23(a0) +; RV64I-NEXT: or t1, a1, s9 +; RV64I-NEXT: lbu s9, 23(a0) ; RV64I-NEXT: lbu a7, 24(a0) ; RV64I-NEXT: lbu a6, 25(a0) ; RV64I-NEXT: lbu a5, 26(a0) @@ -2843,26 +2843,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 82(sp) ; RV64I-NEXT: sb a6, 81(sp) ; RV64I-NEXT: sb a7, 80(sp) -; RV64I-NEXT: sb t0, 79(sp) +; RV64I-NEXT: sb s9, 79(sp) ; RV64I-NEXT: sb s10, 78(sp) ; RV64I-NEXT: sb ra, 77(sp) ; RV64I-NEXT: sb s11, 76(sp) -; RV64I-NEXT: sb s9, 75(sp) -; RV64I-NEXT: sb s8, 74(sp) -; RV64I-NEXT: sb s7, 73(sp) -; RV64I-NEXT: sb s6, 72(sp) -; RV64I-NEXT: sb s5, 71(sp) -; RV64I-NEXT: sb s4, 70(sp) -; RV64I-NEXT: sb s3, 69(sp) -; RV64I-NEXT: sb s2, 68(sp) -; RV64I-NEXT: sb s1, 67(sp) -; RV64I-NEXT: sb s0, 66(sp) -; RV64I-NEXT: sb t6, 65(sp) -; RV64I-NEXT: sb t5, 64(sp) -; RV64I-NEXT: sb t1, 87(sp) -; RV64I-NEXT: slli t1, t1, 56 -; RV64I-NEXT: sb t4, 63(sp) -; RV64I-NEXT: sb t3, 62(sp) +; RV64I-NEXT: sb s8, 75(sp) +; RV64I-NEXT: sb s7, 74(sp) +; RV64I-NEXT: sb s6, 73(sp) +; RV64I-NEXT: sb s5, 72(sp) +; RV64I-NEXT: sb s4, 71(sp) +; RV64I-NEXT: sb s3, 70(sp) +; RV64I-NEXT: sb s2, 69(sp) +; RV64I-NEXT: sb s1, 68(sp) +; RV64I-NEXT: sb s0, 67(sp) +; RV64I-NEXT: sb t6, 66(sp) +; RV64I-NEXT: sb t5, 65(sp) +; RV64I-NEXT: sb t4, 64(sp) +; RV64I-NEXT: sb t0, 87(sp) +; RV64I-NEXT: slli t0, t0, 56 +; RV64I-NEXT: sb t3, 63(sp) +; RV64I-NEXT: sb t2, 62(sp) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 61(sp) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload @@ -2875,7 +2875,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 57(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t1, 63 +; RV64I-NEXT: srai a0, t0, 63 ; RV64I-NEXT: sb a0, 112(sp) ; RV64I-NEXT: sb a0, 104(sp) ; RV64I-NEXT: sb a0, 96(sp) @@ -2915,47 +2915,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a6, 91(sp) ; RV64I-NEXT: sb a7, 90(sp) ; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: slli a0, t2, 56 +; RV64I-NEXT: slli a0, t1, 56 ; RV64I-NEXT: srli a0, a0, 59 -; RV64I-NEXT: addi a1, sp, 56 -; RV64I-NEXT: add a1, a1, a0 -; 
RV64I-NEXT: lbu a0, 9(a1) -; RV64I-NEXT: lbu a3, 8(a1) -; RV64I-NEXT: lbu a4, 10(a1) -; RV64I-NEXT: lbu a5, 11(a1) +; RV64I-NEXT: addi a3, sp, 56 +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: lbu a0, 9(a3) +; RV64I-NEXT: lbu a1, 8(a3) +; RV64I-NEXT: lbu a4, 10(a3) +; RV64I-NEXT: lbu a5, 11(a3) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a5, a5, 24 ; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: lbu a3, 13(a1) -; RV64I-NEXT: lbu a4, 12(a1) -; RV64I-NEXT: lbu a5, 14(a1) -; RV64I-NEXT: lbu a6, 15(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a1, 13(a3) +; RV64I-NEXT: lbu a4, 12(a3) +; RV64I-NEXT: lbu a5, 14(a3) +; RV64I-NEXT: lbu a6, 15(a3) +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a4, a3, a0 -; RV64I-NEXT: andi a3, t2, 7 -; RV64I-NEXT: lbu a0, 17(a1) -; RV64I-NEXT: lbu a5, 16(a1) -; RV64I-NEXT: lbu a6, 18(a1) -; RV64I-NEXT: lbu a7, 19(a1) +; RV64I-NEXT: or a1, a4, a1 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a4, a1, a0 +; RV64I-NEXT: andi a1, t1, 7 +; RV64I-NEXT: lbu a0, 17(a3) +; RV64I-NEXT: lbu a5, 16(a3) +; RV64I-NEXT: lbu a6, 18(a3) +; RV64I-NEXT: lbu a7, 19(a3) ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 ; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: lbu a5, 21(a1) -; RV64I-NEXT: lbu a6, 20(a1) -; RV64I-NEXT: lbu a7, 22(a1) -; RV64I-NEXT: lbu t0, 23(a1) +; RV64I-NEXT: lbu a5, 21(a3) +; RV64I-NEXT: lbu a6, 20(a3) +; RV64I-NEXT: lbu a7, 22(a3) +; RV64I-NEXT: lbu t0, 23(a3) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 @@ -2965,22 +2965,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli a5, a5, 32 ; RV64I-NEXT: or a5, a5, a0 ; RV64I-NEXT: slli a0, a5, 1 -; RV64I-NEXT: not a6, a3 +; RV64I-NEXT: not a6, a1 ; RV64I-NEXT: sll a0, a0, a6 -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 2(a1) -; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: lbu a6, 1(a3) +; RV64I-NEXT: lbu a7, 0(a3) +; RV64I-NEXT: lbu t0, 2(a3) +; RV64I-NEXT: lbu t1, 3(a3) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a1) -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: lbu t1, 6(a1) -; RV64I-NEXT: lbu t2, 7(a1) +; RV64I-NEXT: lbu a7, 5(a3) +; RV64I-NEXT: lbu t0, 4(a3) +; RV64I-NEXT: lbu t1, 6(a3) +; RV64I-NEXT: lbu t2, 7(a3) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 @@ -2989,37 +2989,37 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a7, t0, a7 ; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 25(a1) -; RV64I-NEXT: lbu t0, 24(a1) -; RV64I-NEXT: lbu t1, 26(a1) -; RV64I-NEXT: lbu t2, 27(a1) +; RV64I-NEXT: lbu a7, 25(a3) +; RV64I-NEXT: lbu t0, 24(a3) +; RV64I-NEXT: lbu t1, 26(a3) +; RV64I-NEXT: lbu t2, 27(a3) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 29(a1) -; 
RV64I-NEXT: lbu t1, 28(a1) -; RV64I-NEXT: lbu t2, 30(a1) -; RV64I-NEXT: lbu a1, 31(a1) +; RV64I-NEXT: lbu t0, 29(a3) +; RV64I-NEXT: lbu t1, 28(a3) +; RV64I-NEXT: lbu t2, 30(a3) +; RV64I-NEXT: lbu a3, 31(a3) ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: or a3, a3, t2 ; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: xori t0, a3, 63 +; RV64I-NEXT: or a3, a3, t0 +; RV64I-NEXT: xori t0, a1, 63 ; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a7, a1, a7 -; RV64I-NEXT: slli a1, a7, 1 -; RV64I-NEXT: sll t0, a1, t0 -; RV64I-NEXT: srl a1, a4, a3 -; RV64I-NEXT: srl a4, a6, a3 -; RV64I-NEXT: srl a5, a5, a3 -; RV64I-NEXT: sra a3, a7, a3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: or a7, a3, a7 +; RV64I-NEXT: slli a3, a7, 1 +; RV64I-NEXT: sll t0, a3, t0 +; RV64I-NEXT: srl a3, a4, a1 +; RV64I-NEXT: srl a4, a6, a1 +; RV64I-NEXT: srl a5, a5, a1 +; RV64I-NEXT: sra a1, a7, a1 ; RV64I-NEXT: srli a6, a5, 48 ; RV64I-NEXT: sb a6, 22(a2) ; RV64I-NEXT: srli a6, a5, 40 @@ -3034,53 +3034,53 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 16(a2) ; RV64I-NEXT: srli a5, a5, 8 ; RV64I-NEXT: sb a5, 17(a2) -; RV64I-NEXT: srli a5, a3, 56 +; RV64I-NEXT: srli a5, a1, 56 ; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a3, 48 +; RV64I-NEXT: srli a5, a1, 48 ; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: srli a5, a3, 40 +; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: sb a5, 29(a2) -; RV64I-NEXT: srli a5, a3, 32 +; RV64I-NEXT: srli a5, a1, 32 ; RV64I-NEXT: sb a5, 28(a2) -; RV64I-NEXT: srli a5, a3, 24 +; RV64I-NEXT: srli a5, a1, 24 ; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, a3, 16 +; RV64I-NEXT: srli a5, a1, 16 ; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a3, 24(a2) -; RV64I-NEXT: srli a3, a3, 8 -; RV64I-NEXT: sb a3, 25(a2) -; RV64I-NEXT: srli a3, a4, 48 -; RV64I-NEXT: sb a3, 6(a2) -; RV64I-NEXT: srli a3, a4, 40 -; RV64I-NEXT: sb a3, 5(a2) -; RV64I-NEXT: srli a3, a4, 32 -; RV64I-NEXT: sb a3, 4(a2) -; RV64I-NEXT: srli a3, a4, 24 -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a3, a4, 16 -; RV64I-NEXT: sb a3, 2(a2) -; RV64I-NEXT: or a3, a4, t1 +; RV64I-NEXT: sb a1, 24(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a1, a4, 32 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a4, 24 +; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a4, 16 +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: or a1, a4, t1 ; RV64I-NEXT: sb a4, 0(a2) ; RV64I-NEXT: srli a4, a4, 8 ; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: srli a4, a1, 48 +; RV64I-NEXT: srli a4, a3, 48 ; RV64I-NEXT: sb a4, 14(a2) -; RV64I-NEXT: srli a4, a1, 40 +; RV64I-NEXT: srli a4, a3, 40 ; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: srli a4, a1, 32 +; RV64I-NEXT: srli a4, a3, 32 ; RV64I-NEXT: sb a4, 12(a2) -; RV64I-NEXT: srli a4, a1, 24 +; RV64I-NEXT: srli a4, a3, 24 ; RV64I-NEXT: sb a4, 11(a2) -; RV64I-NEXT: srli a4, a1, 16 +; RV64I-NEXT: srli a4, a3, 16 ; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: sb a1, 8(a2) -; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: srli a1, a6, 56 -; RV64I-NEXT: sb a1, 23(a2) -; RV64I-NEXT: srli a3, a3, 56 -; RV64I-NEXT: sb a3, 7(a2) +; RV64I-NEXT: or a0, a3, a0 +; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: srli 
a3, a3, 8 +; RV64I-NEXT: sb a3, 9(a2) +; RV64I-NEXT: srli a3, a6, 56 +; RV64I-NEXT: sb a3, 23(a2) +; RV64I-NEXT: srli a1, a1, 56 +; RV64I-NEXT: sb a1, 7(a2) ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: sb a0, 15(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload @@ -3240,123 +3240,123 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 61(sp) ; RV32I-NEXT: slli a0, t1, 24 ; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a4, sp, 28 -; RV32I-NEXT: add a4, a4, a0 -; RV32I-NEXT: lbu a0, 5(a4) -; RV32I-NEXT: lbu a1, 4(a4) -; RV32I-NEXT: lbu a3, 6(a4) -; RV32I-NEXT: lbu a5, 7(a4) +; RV32I-NEXT: addi a5, sp, 28 +; RV32I-NEXT: add a5, a5, a0 +; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a1, 4(a5) +; RV32I-NEXT: lbu a3, 6(a5) +; RV32I-NEXT: lbu a4, 7(a5) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t5, a3, a0 +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a3, a0 ; RV32I-NEXT: andi a3, t1, 7 -; RV32I-NEXT: lbu a0, 9(a4) -; RV32I-NEXT: lbu a1, 8(a4) -; RV32I-NEXT: lbu a5, 10(a4) -; RV32I-NEXT: lbu a6, 11(a4) +; RV32I-NEXT: lbu a0, 9(a5) +; RV32I-NEXT: lbu a1, 8(a5) +; RV32I-NEXT: lbu a6, 10(a5) +; RV32I-NEXT: lbu a7, 11(a5) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a1, a6, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a1, a7, a6 ; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t1, a3 -; RV32I-NEXT: sll a0, a0, t1 -; RV32I-NEXT: lbu a1, 1(a4) -; RV32I-NEXT: lbu a5, 0(a4) -; RV32I-NEXT: lbu a7, 2(a4) -; RV32I-NEXT: lbu t0, 3(a4) +; RV32I-NEXT: not t2, a3 +; RV32I-NEXT: sll a0, a0, t2 +; RV32I-NEXT: lbu a1, 1(a5) +; RV32I-NEXT: lbu a7, 0(a5) +; RV32I-NEXT: lbu t0, 2(a5) +; RV32I-NEXT: lbu t1, 3(a5) ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or t0, a5, a1 -; RV32I-NEXT: slli a1, t5, 1 -; RV32I-NEXT: xori t2, a3, 31 -; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: lbu a5, 13(a4) -; RV32I-NEXT: lbu a7, 12(a4) -; RV32I-NEXT: lbu t3, 14(a4) -; RV32I-NEXT: lbu t4, 15(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t3, a7, a5 -; RV32I-NEXT: lbu a5, 17(a4) -; RV32I-NEXT: lbu a7, 16(a4) -; RV32I-NEXT: lbu t4, 18(a4) -; RV32I-NEXT: lbu t6, 19(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or t0, a7, a1 +; RV32I-NEXT: slli a1, a4, 1 +; RV32I-NEXT: xori t1, a3, 31 +; RV32I-NEXT: sll a1, a1, t1 +; RV32I-NEXT: lbu a7, 13(a5) +; RV32I-NEXT: lbu t3, 12(a5) +; RV32I-NEXT: lbu t4, 14(a5) +; RV32I-NEXT: lbu t5, 15(a5) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t3 ; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: or t3, t5, t4 +; RV32I-NEXT: or t3, t3, a7 +; RV32I-NEXT: lbu a7, 17(a5) +; RV32I-NEXT: lbu t4, 16(a5) +; RV32I-NEXT: lbu t5, 18(a5) +; RV32I-NEXT: lbu t6, 19(a5) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t4 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t4 -; RV32I-NEXT: or 
t4, a7, a5 -; RV32I-NEXT: slli a5, t4, 1 -; RV32I-NEXT: sll a7, a5, t1 -; RV32I-NEXT: lbu a5, 21(a4) -; RV32I-NEXT: lbu t6, 20(a4) -; RV32I-NEXT: lbu s0, 22(a4) -; RV32I-NEXT: lbu s1, 23(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 +; RV32I-NEXT: or t4, t6, t5 +; RV32I-NEXT: or t4, t4, a7 +; RV32I-NEXT: slli a7, t4, 1 +; RV32I-NEXT: sll a7, a7, t2 +; RV32I-NEXT: lbu t5, 21(a5) +; RV32I-NEXT: lbu t6, 20(a5) +; RV32I-NEXT: lbu s0, 22(a5) +; RV32I-NEXT: lbu s1, 23(a5) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: or t5, t5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or s0, s0, a5 -; RV32I-NEXT: lbu a5, 25(a4) -; RV32I-NEXT: lbu t6, 24(a4) -; RV32I-NEXT: lbu s1, 26(a4) -; RV32I-NEXT: lbu s2, 27(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 +; RV32I-NEXT: or t5, s0, t5 +; RV32I-NEXT: lbu t6, 25(a5) +; RV32I-NEXT: lbu s0, 24(a5) +; RV32I-NEXT: lbu s1, 26(a5) +; RV32I-NEXT: lbu s2, 27(a5) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t6, t6, s0 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or t6, s2, s1 -; RV32I-NEXT: or t6, t6, a5 -; RV32I-NEXT: lbu a5, 29(a4) -; RV32I-NEXT: lbu s1, 28(a4) +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: lbu s0, 29(a5) +; RV32I-NEXT: lbu s1, 28(a5) ; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t1, s2, t1 -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, s1 -; RV32I-NEXT: lbu s1, 30(a4) -; RV32I-NEXT: lbu a4, 31(a4) +; RV32I-NEXT: sll t2, s2, t2 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: or s0, s0, s1 +; RV32I-NEXT: lbu s1, 30(a5) +; RV32I-NEXT: lbu a5, 31(a5) ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: sll s2, s2, t2 +; RV32I-NEXT: sll s2, s2, t1 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: slli s1, s0, 1 -; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or s3, a4, a5 -; RV32I-NEXT: slli a4, s3, 1 -; RV32I-NEXT: sll t2, a4, t2 -; RV32I-NEXT: srl a4, t5, a3 -; RV32I-NEXT: srl a5, t0, a3 -; RV32I-NEXT: srl t0, t3, a3 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a5, a5, s1 +; RV32I-NEXT: slli s1, t5, 1 +; RV32I-NEXT: sll s1, s1, t1 +; RV32I-NEXT: or a5, a5, s0 +; RV32I-NEXT: slli s0, a5, 1 +; RV32I-NEXT: sll t1, s0, t1 +; RV32I-NEXT: srl a4, a4, a3 +; RV32I-NEXT: srl t0, t0, a3 +; RV32I-NEXT: srl t3, t3, a3 ; RV32I-NEXT: srl a6, a6, a3 -; RV32I-NEXT: srl t3, s0, a3 +; RV32I-NEXT: srl t5, t5, a3 ; RV32I-NEXT: srl t4, t4, a3 -; RV32I-NEXT: srl t5, t6, a3 -; RV32I-NEXT: sra a3, s3, a3 -; RV32I-NEXT: srli t6, t5, 16 -; RV32I-NEXT: sb t6, 26(a2) -; RV32I-NEXT: or t2, t5, t2 -; RV32I-NEXT: sb t5, 24(a2) -; RV32I-NEXT: srli t5, t5, 8 -; RV32I-NEXT: sb t5, 25(a2) -; RV32I-NEXT: srli t5, a3, 24 -; RV32I-NEXT: sb t5, 31(a2) -; RV32I-NEXT: srli t5, a3, 16 -; RV32I-NEXT: sb t5, 30(a2) +; RV32I-NEXT: srl t6, t6, a3 +; RV32I-NEXT: sra a3, a5, a3 +; RV32I-NEXT: srli a5, t6, 16 +; RV32I-NEXT: sb a5, 26(a2) +; RV32I-NEXT: or a5, t6, t1 +; RV32I-NEXT: sb t6, 24(a2) +; RV32I-NEXT: srli t1, t6, 8 +; RV32I-NEXT: sb t1, 25(a2) +; RV32I-NEXT: srli t1, a3, 24 +; RV32I-NEXT: sb t1, 31(a2) +; RV32I-NEXT: srli t1, a3, 16 +; RV32I-NEXT: sb t1, 30(a2) ; RV32I-NEXT: sb a3, 28(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 29(a2) @@ -3364,45 +3364,45 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a3, 18(a2) ; RV32I-NEXT: or a3, t4, s1 ; RV32I-NEXT: sb t4, 16(a2) -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t4, 17(a2) 
-; RV32I-NEXT: srli t4, t3, 16 -; RV32I-NEXT: sb t4, 22(a2) -; RV32I-NEXT: or t1, t3, t1 -; RV32I-NEXT: sb t3, 20(a2) -; RV32I-NEXT: srli t3, t3, 8 -; RV32I-NEXT: sb t3, 21(a2) -; RV32I-NEXT: srli t3, a6, 16 -; RV32I-NEXT: sb t3, 10(a2) -; RV32I-NEXT: or t3, a6, s2 +; RV32I-NEXT: srli t1, t4, 8 +; RV32I-NEXT: sb t1, 17(a2) +; RV32I-NEXT: srli t1, t5, 16 +; RV32I-NEXT: sb t1, 22(a2) +; RV32I-NEXT: or t1, t5, t2 +; RV32I-NEXT: sb t5, 20(a2) +; RV32I-NEXT: srli t2, t5, 8 +; RV32I-NEXT: sb t2, 21(a2) +; RV32I-NEXT: srli t2, a6, 16 +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: or t2, a6, s2 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: srli a6, a6, 8 ; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: srli a6, t3, 16 ; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: sb t0, 12(a2) -; RV32I-NEXT: srli a7, t0, 8 +; RV32I-NEXT: or a6, t3, a7 +; RV32I-NEXT: sb t3, 12(a2) +; RV32I-NEXT: srli a7, t3, 8 ; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, a5, 16 +; RV32I-NEXT: srli a7, t0, 16 ; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, a5, a1 -; RV32I-NEXT: sb a5, 0(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: srli a5, a4, 16 -; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: or a1, t0, a1 +; RV32I-NEXT: sb t0, 0(a2) +; RV32I-NEXT: srli a7, t0, 8 +; RV32I-NEXT: sb a7, 1(a2) +; RV32I-NEXT: srli a7, a4, 16 +; RV32I-NEXT: sb a7, 6(a2) ; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: sb a4, 4(a2) ; RV32I-NEXT: srli a4, a4, 8 ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: srli a4, t2, 24 -; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: srli a5, a5, 24 +; RV32I-NEXT: sb a5, 27(a2) ; RV32I-NEXT: srli a3, a3, 24 ; RV32I-NEXT: sb a3, 19(a2) ; RV32I-NEXT: srli a3, t1, 24 ; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: srli a3, t3, 24 +; RV32I-NEXT: srli a3, t2, 24 ; RV32I-NEXT: sb a3, 11(a2) ; RV32I-NEXT: srli a3, a6, 24 ; RV32I-NEXT: sb a3, 15(a2) diff --git a/llvm/test/CodeGen/SPARC/atomics.ll b/llvm/test/CodeGen/SPARC/atomics.ll index 6a9abcc32545a9..234c6a8f3e5b65 100644 --- a/llvm/test/CodeGen/SPARC/atomics.ll +++ b/llvm/test/CodeGen/SPARC/atomics.ll @@ -87,24 +87,24 @@ entry: ; SPARC: sll %o4, %o1, %o4 ; SPARC: and %o0, 255, %o0 ; SPARC: sll %o0, %o1, %o0 -; SPARC: andn %g2, %o5, %o5 +; SPARC: andn %g2, %o5, %g2 ; SPARC: [[LABEL1:\.L.*]]: -; SPARC: or %o5, %o4, %g2 -; SPARC: or %o5, %o0, %g3 -; SPARC: cas [%o2], %g3, %g2 +; SPARC: or %g2, %o4, %o5 +; SPARC: or %g2, %o0, %g3 +; SPARC: cas [%o2], %g3, %o5 ; SPARC: mov %g0, %g4 -; SPARC: cmp %g2, %g3 +; SPARC: cmp %o5, %g3 ; SPARC: move %icc, 1, %g4 ; SPARC: cmp %g4, 0 ; SPARC: bne %icc, [[LABEL2:\.L.*]] ; SPARC: nop -; SPARC: and %g2, %o3, %g3 -; SPARC: cmp %o5, %g3 +; SPARC: and %o5, %o3, %g3 +; SPARC: cmp %g2, %g3 ; SPARC: bne %icc, [[LABEL1]] -; SPARC: mov %g3, %o5 +; SPARC: mov %g3, %g2 ; SPARC: [[LABEL2]]: ; SPARC: retl -; SPARC: srl %g2, %o1, %o0 +; SPARC: srl %o5, %o1, %o0 ; SPARC64-LABEL: test_cmpxchg_i8 ; SPARC64: and %o1, -4, %o2 ; SPARC64: mov 3, %o3 @@ -118,24 +118,24 @@ entry: ; SPARC64: sll %o4, %o1, %o4 ; SPARC64: and %o0, 255, %o0 ; SPARC64: sll %o0, %o1, %o0 -; SPARC64: andn %g2, %o5, %o5 +; SPARC64: andn %g2, %o5, %g2 ; SPARC64: [[LABEL1:\.L.*]]: -; SPARC64: or %o5, %o4, %g2 -; SPARC64: or %o5, %o0, %g3 -; SPARC64: cas [%o2], %g3, %g2 +; SPARC64: or %g2, %o4, %o5 +; SPARC64: or %g2, %o0, %g3 +; SPARC64: cas [%o2], %g3, %o5 ; SPARC64: mov %g0, %g4 -; SPARC64: cmp %g2, %g3 +; SPARC64: cmp %o5, %g3 ; SPARC64: move %icc, 1, %g4 ; SPARC64: cmp %g4, 0 ; SPARC64: bne %icc, [[LABEL2:\.L.*]] ; SPARC64: nop 
-; SPARC64: and %g2, %o3, %g3 -; SPARC64: cmp %o5, %g3 +; SPARC64: and %o5, %o3, %g3 +; SPARC64: cmp %g2, %g3 ; SPARC64: bne %icc, [[LABEL1]] -; SPARC64: mov %g3, %o5 +; SPARC64: mov %g3, %g2 ; SPARC64: [[LABEL2]]: ; SPARC64: retl -; SPARC64: srl %g2, %o1, %o0 +; SPARC64: srl %o5, %o1, %o0 define i8 @test_cmpxchg_i8(i8 %a, i8* %ptr) { entry: %pair = cmpxchg i8* %ptr, i8 %a, i8 123 monotonic monotonic @@ -157,24 +157,24 @@ entry: ; SPARC: mov 123, %o0 ; SPARC: sll %o0, %o1, %o0 ; SPARC: sll %o4, %o1, %o4 -; SPARC: andn %g2, %o5, %o5 +; SPARC: andn %g2, %o5, %g2 ; SPARC: [[LABEL1:\.L.*]]: -; SPARC: or %o5, %o0, %g2 -; SPARC: or %o5, %o4, %g3 -; SPARC: cas [%o2], %g3, %g2 +; SPARC: or %g2, %o0, %o5 +; SPARC: or %g2, %o4, %g3 +; SPARC: cas [%o2], %g3, %o5 ; SPARC: mov %g0, %g4 -; SPARC: cmp %g2, %g3 +; SPARC: cmp %o5, %g3 ; SPARC: move %icc, 1, %g4 ; SPARC: cmp %g4, 0 ; SPARC: bne %icc, [[LABEL2:\.L.*]] ; SPARC: nop -; SPARC: and %g2, %o3, %g3 -; SPARC: cmp %o5, %g3 +; SPARC: and %o5, %o3, %g3 +; SPARC: cmp %g2, %g3 ; SPARC: bne %icc, [[LABEL1]] -; SPARC: mov %g3, %o5 +; SPARC: mov %g3, %g2 ; SPARC: [[LABEL2]]: ; SPARC: retl -; SPARC: srl %g2, %o1, %o0 +; SPARC: srl %o5, %o1, %o0 ; SPARC64-LABEL: test_cmpxchg_i16 ; SPARC64: and %o1, -4, %o2 ; SPARC64: and %o1, 3, %o1 @@ -189,24 +189,24 @@ entry: ; SPARC64: mov 123, %o0 ; SPARC64: sll %o0, %o1, %o0 ; SPARC64: sll %o4, %o1, %o4 -; SPARC64: andn %g2, %o5, %o5 +; SPARC64: andn %g2, %o5, %g2 ; SPARC64: [[LABEL1:\.L.*]]: -; SPARC64: or %o5, %o0, %g2 -; SPARC64: or %o5, %o4, %g3 -; SPARC64: cas [%o2], %g3, %g2 +; SPARC64: or %g2, %o0, %o5 +; SPARC64: or %g2, %o4, %g3 +; SPARC64: cas [%o2], %g3, %o5 ; SPARC64: mov %g0, %g4 -; SPARC64: cmp %g2, %g3 +; SPARC64: cmp %o5, %g3 ; SPARC64: move %icc, 1, %g4 ; SPARC64: cmp %g4, 0 ; SPARC64: bne %icc, [[LABEL2:\.L.*]] ; SPARC64: nop -; SPARC64: and %g2, %o3, %g3 -; SPARC64: cmp %o5, %g3 +; SPARC64: and %o5, %o3, %g3 +; SPARC64: cmp %g2, %g3 ; SPARC64: bne %icc, [[LABEL1]] -; SPARC64: mov %g3, %o5 +; SPARC64: mov %g3, %g2 ; SPARC64: [[LABEL2]]: ; SPARC64: retl -; SPARC64: srl %g2, %o1, %o0 +; SPARC64: srl %o5, %o1, %o0 define i16 @test_cmpxchg_i16(i16 %a, i16* %ptr) { entry: %pair = cmpxchg i16* %ptr, i16 %a, i16 123 monotonic monotonic diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll index ae1de443bce05f..e274671f88757b 100644 --- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll @@ -11,27 +11,27 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: .cfi_window_save ; SPARC-NEXT: .cfi_register %o7, %i7 ; SPARC-NEXT: ld [%fp+96], %l1 -; SPARC-NEXT: mov %i3, %g4 +; SPARC-NEXT: mov %i3, %l0 ; SPARC-NEXT: mov %i2, %g2 ; SPARC-NEXT: umul %i3, %l1, %i3 ; SPARC-NEXT: rd %y, %i2 ; SPARC-NEXT: ld [%fp+92], %l2 ; SPARC-NEXT: umul %g2, %l1, %g3 -; SPARC-NEXT: rd %y, %l0 +; SPARC-NEXT: rd %y, %g4 ; SPARC-NEXT: addcc %g3, %i2, %i2 -; SPARC-NEXT: addxcc %l0, 0, %g3 -; SPARC-NEXT: umul %g4, %l2, %l0 +; SPARC-NEXT: addxcc %g4, 0, %g3 +; SPARC-NEXT: umul %l0, %l2, %g4 ; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addcc %l0, %i2, %i2 -; SPARC-NEXT: addxcc %l3, 0, %l0 -; SPARC-NEXT: addcc %g3, %l0, %g3 -; SPARC-NEXT: addxcc %g0, 0, %l0 +; SPARC-NEXT: addcc %g4, %i2, %i2 +; SPARC-NEXT: addxcc %l3, 0, %g4 +; SPARC-NEXT: addcc %g3, %g4, %g3 +; SPARC-NEXT: addxcc %g0, 0, %g4 ; SPARC-NEXT: umul %g2, %l2, %l3 ; SPARC-NEXT: rd %y, %l4 ; SPARC-NEXT: 
addcc %l3, %g3, %g3 ; SPARC-NEXT: umul %i1, %l1, %l3 ; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addxcc %l4, %l0, %l0 +; SPARC-NEXT: addxcc %l4, %g4, %g4 ; SPARC-NEXT: umul %i0, %l1, %l4 ; SPARC-NEXT: rd %y, %l6 ; SPARC-NEXT: addcc %l4, %l5, %l4 @@ -47,16 +47,16 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addcc %l7, %l5, %l5 ; SPARC-NEXT: addxcc %o0, %l6, %l6 ; SPARC-NEXT: addcc %l3, %g3, %g3 -; SPARC-NEXT: addxcc %l4, %l0, %l0 +; SPARC-NEXT: addxcc %l4, %g4, %g4 ; SPARC-NEXT: addxcc %l5, 0, %l3 -; SPARC-NEXT: umul %g4, %i5, %l4 +; SPARC-NEXT: umul %l0, %i5, %l4 ; SPARC-NEXT: rd %y, %l5 ; SPARC-NEXT: addxcc %l6, 0, %l6 ; SPARC-NEXT: umul %g2, %i5, %l7 ; SPARC-NEXT: rd %y, %o0 ; SPARC-NEXT: addcc %l7, %l5, %l5 ; SPARC-NEXT: addxcc %o0, 0, %l7 -; SPARC-NEXT: umul %g4, %i4, %o0 +; SPARC-NEXT: umul %l0, %i4, %o0 ; SPARC-NEXT: rd %y, %o1 ; SPARC-NEXT: addcc %o0, %l5, %l5 ; SPARC-NEXT: addxcc %o1, 0, %o0 @@ -67,7 +67,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addcc %o1, %l7, %l7 ; SPARC-NEXT: addxcc %o2, %o0, %o0 ; SPARC-NEXT: addcc %l4, %g3, %g3 -; SPARC-NEXT: addxcc %l5, %l0, %l0 +; SPARC-NEXT: addxcc %l5, %g4, %g4 ; SPARC-NEXT: addxcc %l7, 0, %l4 ; SPARC-NEXT: addxcc %o0, 0, %l5 ; SPARC-NEXT: addcc %l3, %l4, %l3 @@ -118,21 +118,21 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addxcc %o0, %o3, %l6 ; SPARC-NEXT: addcc %l2, %o1, %l2 ; SPARC-NEXT: sra %i4, 31, %i4 -; SPARC-NEXT: umul %g4, %i4, %g4 +; SPARC-NEXT: umul %l0, %i4, %l0 ; SPARC-NEXT: rd %y, %o0 ; SPARC-NEXT: addxcc %l6, %l7, %l6 ; SPARC-NEXT: umul %i4, %g2, %g2 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: add %o0, %g4, %o1 +; SPARC-NEXT: add %o0, %l0, %o1 ; SPARC-NEXT: smul %i0, %i4, %i0 ; SPARC-NEXT: umul %i1, %i4, %i1 ; SPARC-NEXT: rd %y, %i4 ; SPARC-NEXT: add %o1, %g2, %o1 ; SPARC-NEXT: add %i4, %i1, %i4 ; SPARC-NEXT: add %i4, %i0, %i0 -; SPARC-NEXT: addcc %i1, %g4, %i1 +; SPARC-NEXT: addcc %i1, %l0, %i1 ; SPARC-NEXT: addxcc %i0, %o1, %i0 -; SPARC-NEXT: addcc %g4, %o0, %i4 +; SPARC-NEXT: addcc %l0, %o0, %i4 ; SPARC-NEXT: addxcc %o0, 0, %o0 ; SPARC-NEXT: addcc %g2, %i4, %i4 ; SPARC-NEXT: addxcc %l7, 0, %o1 @@ -142,7 +142,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addxcc %l7, %o1, %l7 ; SPARC-NEXT: addcc %g2, %i1, %i1 ; SPARC-NEXT: addxcc %l7, %i0, %i0 -; SPARC-NEXT: addcc %g4, %l1, %g2 +; SPARC-NEXT: addcc %l0, %l1, %g2 ; SPARC-NEXT: addxcc %i4, %o2, %i4 ; SPARC-NEXT: addxcc %i1, %l2, %i1 ; SPARC-NEXT: addxcc %i0, %l6, %i0 @@ -150,7 +150,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addxcc %l4, %i4, %i4 ; SPARC-NEXT: addxcc %l5, %i1, %i1 ; SPARC-NEXT: addxcc %i5, %i0, %i0 -; SPARC-NEXT: sra %l0, 31, %i5 +; SPARC-NEXT: sra %g4, 31, %i5 ; SPARC-NEXT: xor %i0, %i5, %i0 ; SPARC-NEXT: xor %i4, %i5, %i4 ; SPARC-NEXT: or %i4, %i0, %i0 @@ -167,7 +167,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: .LBB0_2: ; SPARC-NEXT: mov 1, %i4 ; SPARC-NEXT: .LBB0_3: ! 
%start -; SPARC-NEXT: mov %l0, %i0 +; SPARC-NEXT: mov %g4, %i0 ; SPARC-NEXT: ret ; SPARC-NEXT: restore %g0, %g3, %o1 ; @@ -226,13 +226,13 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i2, %o3 -; SPARC64-NEXT: srlx %o1, 32, %g2 -; SPARC64-NEXT: srlx %o0, 32, %g3 -; SPARC64-NEXT: addcc %o1, %i5, %i3 -; SPARC64-NEXT: addxcc %g2, %i4, %i4 -; SPARC64-NEXT: addxcc %o0, 0, %i5 -; SPARC64-NEXT: addxcc %g3, 0, %g2 -; SPARC64-NEXT: addcc %l4, %i5, %i5 +; SPARC64-NEXT: srlx %o1, 32, %i3 +; SPARC64-NEXT: srlx %o0, 32, %g2 +; SPARC64-NEXT: addcc %o1, %i5, %i5 +; SPARC64-NEXT: addxcc %i3, %i4, %i3 +; SPARC64-NEXT: addxcc %o0, 0, %i4 +; SPARC64-NEXT: addxcc %g2, 0, %g2 +; SPARC64-NEXT: addcc %l4, %i4, %i4 ; SPARC64-NEXT: addxcc %l5, %g2, %l4 ; SPARC64-NEXT: addxcc %g0, 0, %l5 ; SPARC64-NEXT: addxcc %g0, 0, %l6 @@ -243,29 +243,29 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC64-NEXT: mov %i2, %o3 ; SPARC64-NEXT: mov %g0, %i2 ; SPARC64-NEXT: srlx %o1, 32, %i0 -; SPARC64-NEXT: addcc %o1, %i5, %i5 +; SPARC64-NEXT: addcc %o1, %i4, %i4 ; SPARC64-NEXT: srlx %o0, 32, %g2 ; SPARC64-NEXT: addxcc %i0, %l4, %i0 ; SPARC64-NEXT: addxcc %o0, %l5, %g3 ; SPARC64-NEXT: addxcc %g2, %l6, %g2 -; SPARC64-NEXT: addcc %i5, %l0, %i5 +; SPARC64-NEXT: addcc %i4, %l0, %i4 ; SPARC64-NEXT: addxcc %i0, %l1, %i0 ; SPARC64-NEXT: addxcc %g3, %l2, %g3 ; SPARC64-NEXT: addxcc %g2, %l3, %g2 ; SPARC64-NEXT: srl %g3, 0, %g3 ; SPARC64-NEXT: sllx %g2, 32, %g2 ; SPARC64-NEXT: or %g2, %g3, %g2 -; SPARC64-NEXT: sllx %i4, 32, %i4 -; SPARC64-NEXT: srax %i4, 63, %g3 +; SPARC64-NEXT: sllx %i3, 32, %i3 +; SPARC64-NEXT: srax %i3, 63, %g3 ; SPARC64-NEXT: xor %g2, %g3, %g2 -; SPARC64-NEXT: srl %i5, 0, %i5 +; SPARC64-NEXT: srl %i4, 0, %i4 ; SPARC64-NEXT: sllx %i0, 32, %i0 -; SPARC64-NEXT: or %i0, %i5, %i0 +; SPARC64-NEXT: or %i0, %i4, %i0 ; SPARC64-NEXT: xor %i0, %g3, %i0 ; SPARC64-NEXT: or %i0, %g2, %i0 ; SPARC64-NEXT: movrnz %i0, 1, %i2 -; SPARC64-NEXT: srl %i3, 0, %i0 -; SPARC64-NEXT: or %i4, %i0, %i0 +; SPARC64-NEXT: srl %i5, 0, %i0 +; SPARC64-NEXT: or %i3, %i0, %i0 ; SPARC64-NEXT: srl %i2, 0, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll index 9ca895fe78073d..d602648a34f4e2 100644 --- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll @@ -14,146 +14,146 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: mov %i2, %g4 ; SPARC-NEXT: umul %i2, %i5, %i2 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: ld [%fp+92], %l4 +; SPARC-NEXT: ld [%fp+92], %l5 ; SPARC-NEXT: umul %i4, %i3, %i3 -; SPARC-NEXT: rd %y, %o1 +; SPARC-NEXT: rd %y, %o2 ; SPARC-NEXT: ld [%fp+96], %g3 -; SPARC-NEXT: umul %i5, %g2, %l3 +; SPARC-NEXT: umul %i5, %g2, %l1 ; SPARC-NEXT: rd %y, %o0 -; SPARC-NEXT: umul %l4, %i1, %l2 -; SPARC-NEXT: rd %y, %l1 +; SPARC-NEXT: umul %l5, %i1, %l3 +; SPARC-NEXT: rd %y, %l2 ; SPARC-NEXT: add %i3, %i2, %i2 ; SPARC-NEXT: umul %i0, %g3, %i3 ; SPARC-NEXT: rd %y, %l6 -; SPARC-NEXT: add %o0, %i2, %o2 +; SPARC-NEXT: add %o0, %i2, %o1 ; SPARC-NEXT: umul %i1, %g3, %i2 ; SPARC-NEXT: rd %y, %l0 -; SPARC-NEXT: add %i3, %l2, %i3 -; SPARC-NEXT: add %l0, %i3, %l2 -; SPARC-NEXT: addcc %i2, %l3, %l3 +; SPARC-NEXT: add %i3, %l3, %i3 +; SPARC-NEXT: add %l0, %i3, %l3 +; SPARC-NEXT: addcc %i2, %l1, %l1 ; 
SPARC-NEXT: umul %g2, %g3, %i3 ; SPARC-NEXT: rd %y, %i2 -; SPARC-NEXT: addxcc %l2, %o2, %o4 +; SPARC-NEXT: addxcc %l3, %o1, %o4 ; SPARC-NEXT: umul %g4, %g3, %g3 -; SPARC-NEXT: rd %y, %l5 +; SPARC-NEXT: rd %y, %l4 ; SPARC-NEXT: addcc %g3, %i2, %i2 -; SPARC-NEXT: addxcc %l5, 0, %g3 -; SPARC-NEXT: umul %g2, %l4, %g2 -; SPARC-NEXT: rd %y, %l5 +; SPARC-NEXT: addxcc %l4, 0, %g3 +; SPARC-NEXT: umul %g2, %l5, %g2 +; SPARC-NEXT: rd %y, %l4 ; SPARC-NEXT: addcc %g2, %i2, %i2 -; SPARC-NEXT: addxcc %l5, 0, %g2 +; SPARC-NEXT: addxcc %l4, 0, %g2 ; SPARC-NEXT: addcc %g3, %g2, %g2 ; SPARC-NEXT: addxcc %g0, 0, %g3 -; SPARC-NEXT: umul %g4, %l4, %l5 +; SPARC-NEXT: umul %g4, %l5, %l4 ; SPARC-NEXT: rd %y, %o3 -; SPARC-NEXT: addcc %l5, %g2, %l5 +; SPARC-NEXT: addcc %l4, %g2, %l4 ; SPARC-NEXT: addxcc %o3, %g3, %o3 -; SPARC-NEXT: addcc %l5, %l3, %g2 +; SPARC-NEXT: addcc %l4, %l1, %g2 ; SPARC-NEXT: addxcc %o3, %o4, %g3 -; SPARC-NEXT: mov 1, %l3 +; SPARC-NEXT: mov 1, %l1 ; SPARC-NEXT: cmp %g3, %o3 ; SPARC-NEXT: bcs .LBB0_2 -; SPARC-NEXT: mov %l3, %o4 +; SPARC-NEXT: mov %l1, %o4 ; SPARC-NEXT: ! %bb.1: ! %start ; SPARC-NEXT: mov %g0, %o4 ; SPARC-NEXT: .LBB0_2: ! %start -; SPARC-NEXT: cmp %g2, %l5 +; SPARC-NEXT: cmp %g2, %l4 ; SPARC-NEXT: bcs .LBB0_4 -; SPARC-NEXT: mov %l3, %l5 +; SPARC-NEXT: mov %l1, %l4 ; SPARC-NEXT: ! %bb.3: ! %start -; SPARC-NEXT: mov %g0, %l5 +; SPARC-NEXT: mov %g0, %l4 ; SPARC-NEXT: .LBB0_4: ! %start ; SPARC-NEXT: cmp %g3, %o3 ; SPARC-NEXT: be .LBB0_6 ; SPARC-NEXT: nop ; SPARC-NEXT: ! %bb.5: ! %start -; SPARC-NEXT: mov %o4, %l5 +; SPARC-NEXT: mov %o4, %l4 ; SPARC-NEXT: .LBB0_6: ! %start ; SPARC-NEXT: cmp %g4, 0 ; SPARC-NEXT: bne .LBB0_8 -; SPARC-NEXT: mov %l3, %o3 +; SPARC-NEXT: mov %l1, %o3 ; SPARC-NEXT: ! %bb.7: ! %start ; SPARC-NEXT: mov %g0, %o3 ; SPARC-NEXT: .LBB0_8: ! %start ; SPARC-NEXT: cmp %i4, 0 ; SPARC-NEXT: bne .LBB0_10 -; SPARC-NEXT: mov %l3, %o4 +; SPARC-NEXT: mov %l1, %o4 ; SPARC-NEXT: ! %bb.9: ! %start ; SPARC-NEXT: mov %g0, %o4 ; SPARC-NEXT: .LBB0_10: ! %start -; SPARC-NEXT: cmp %o1, 0 +; SPARC-NEXT: cmp %o2, 0 ; SPARC-NEXT: bne .LBB0_12 -; SPARC-NEXT: mov %l3, %o1 +; SPARC-NEXT: mov %l1, %o2 ; SPARC-NEXT: ! %bb.11: ! %start -; SPARC-NEXT: mov %g0, %o1 +; SPARC-NEXT: mov %g0, %o2 ; SPARC-NEXT: .LBB0_12: ! %start ; SPARC-NEXT: cmp %l7, 0 ; SPARC-NEXT: bne .LBB0_14 -; SPARC-NEXT: mov %l3, %l7 +; SPARC-NEXT: mov %l1, %l7 ; SPARC-NEXT: ! %bb.13: ! %start ; SPARC-NEXT: mov %g0, %l7 ; SPARC-NEXT: .LBB0_14: ! %start -; SPARC-NEXT: cmp %o2, %o0 +; SPARC-NEXT: cmp %o1, %o0 ; SPARC-NEXT: bcs .LBB0_16 -; SPARC-NEXT: mov %l3, %g4 +; SPARC-NEXT: mov %l1, %g4 ; SPARC-NEXT: ! %bb.15: ! %start ; SPARC-NEXT: mov %g0, %g4 ; SPARC-NEXT: .LBB0_16: ! %start -; SPARC-NEXT: cmp %l4, 0 +; SPARC-NEXT: cmp %l5, 0 ; SPARC-NEXT: bne .LBB0_18 -; SPARC-NEXT: mov %l3, %l4 +; SPARC-NEXT: mov %l1, %l5 ; SPARC-NEXT: ! %bb.17: ! %start -; SPARC-NEXT: mov %g0, %l4 +; SPARC-NEXT: mov %g0, %l5 ; SPARC-NEXT: .LBB0_18: ! %start ; SPARC-NEXT: cmp %i0, 0 ; SPARC-NEXT: bne .LBB0_20 -; SPARC-NEXT: mov %l3, %o0 +; SPARC-NEXT: mov %l1, %o0 ; SPARC-NEXT: ! %bb.19: ! %start ; SPARC-NEXT: mov %g0, %o0 ; SPARC-NEXT: .LBB0_20: ! %start ; SPARC-NEXT: cmp %l6, 0 ; SPARC-NEXT: bne .LBB0_22 -; SPARC-NEXT: mov %l3, %l6 +; SPARC-NEXT: mov %l1, %l6 ; SPARC-NEXT: ! %bb.21: ! %start ; SPARC-NEXT: mov %g0, %l6 ; SPARC-NEXT: .LBB0_22: ! 
%start -; SPARC-NEXT: and %o4, %o3, %o2 -; SPARC-NEXT: cmp %l1, 0 -; SPARC-NEXT: and %o0, %l4, %l4 +; SPARC-NEXT: and %o4, %o3, %o1 +; SPARC-NEXT: cmp %l2, 0 +; SPARC-NEXT: and %o0, %l5, %l5 ; SPARC-NEXT: bne .LBB0_24 -; SPARC-NEXT: mov %l3, %l1 +; SPARC-NEXT: mov %l1, %l2 ; SPARC-NEXT: ! %bb.23: ! %start -; SPARC-NEXT: mov %g0, %l1 +; SPARC-NEXT: mov %g0, %l2 ; SPARC-NEXT: .LBB0_24: ! %start -; SPARC-NEXT: or %o2, %o1, %o0 -; SPARC-NEXT: cmp %l2, %l0 -; SPARC-NEXT: or %l4, %l6, %l4 +; SPARC-NEXT: or %o1, %o2, %o0 +; SPARC-NEXT: cmp %l3, %l0 +; SPARC-NEXT: or %l5, %l6, %l5 ; SPARC-NEXT: bcs .LBB0_26 -; SPARC-NEXT: mov %l3, %l0 +; SPARC-NEXT: mov %l1, %l0 ; SPARC-NEXT: ! %bb.25: ! %start ; SPARC-NEXT: mov %g0, %l0 ; SPARC-NEXT: .LBB0_26: ! %start -; SPARC-NEXT: or %o0, %l7, %l2 +; SPARC-NEXT: or %o0, %l7, %l3 ; SPARC-NEXT: or %i5, %i4, %i4 ; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: or %l4, %l1, %l1 +; SPARC-NEXT: or %l5, %l2, %l2 ; SPARC-NEXT: bne .LBB0_28 -; SPARC-NEXT: mov %l3, %i4 +; SPARC-NEXT: mov %l1, %i4 ; SPARC-NEXT: ! %bb.27: ! %start ; SPARC-NEXT: mov %g0, %i4 ; SPARC-NEXT: .LBB0_28: ! %start -; SPARC-NEXT: or %l2, %g4, %i5 +; SPARC-NEXT: or %l3, %g4, %i5 ; SPARC-NEXT: or %i1, %i0, %i0 ; SPARC-NEXT: cmp %i0, 0 ; SPARC-NEXT: bne .LBB0_30 -; SPARC-NEXT: or %l1, %l0, %i0 +; SPARC-NEXT: or %l2, %l0, %i0 ; SPARC-NEXT: ! %bb.29: ! %start -; SPARC-NEXT: mov %g0, %l3 +; SPARC-NEXT: mov %g0, %l1 ; SPARC-NEXT: .LBB0_30: ! %start -; SPARC-NEXT: and %l3, %i4, %i1 +; SPARC-NEXT: and %l1, %i4, %i1 ; SPARC-NEXT: or %i1, %i0, %i0 ; SPARC-NEXT: or %i0, %i5, %i0 -; SPARC-NEXT: or %i0, %l5, %i0 +; SPARC-NEXT: or %i0, %l4, %i0 ; SPARC-NEXT: and %i0, 1, %i4 ; SPARC-NEXT: mov %g3, %i0 ; SPARC-NEXT: ret diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll b/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll index d0000e26b65e6e..1abb4405ca020d 100644 --- a/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll +++ b/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll @@ -7,21 +7,24 @@ define i32 @fun0(ptr %p1, i32 signext %l1, ptr %p2, i32 signext %l2, i8 zeroext %pad) { ; CHECK-LABEL: fun0: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lgr %r0, %r5 -; CHECK-NEXT: # kill: def $r4d killed $r4d def $r4q -; CHECK-NEXT: lgr %r1, %r3 -; CHECK-NEXT: # kill: def $r2d killed $r2d def $r2q -; CHECK-NEXT: sllg %r5, %r6, 24 -; CHECK-NEXT: rosbg %r5, %r0, 40, 63, 0 -; CHECK-NEXT: risbg %r3, %r1, 40, 191, 0 +; CHECK-NEXT: stmg %r12, %r15, 96(%r15) +; CHECK-NEXT: .cfi_offset %r12, -64 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: lgr %r0, %r4 +; CHECK-NEXT: lgr %r12, %r2 +; CHECK-NEXT: sllg %r1, %r6, 24 +; CHECK-NEXT: rosbg %r1, %r5, 40, 63, 0 +; CHECK-NEXT: risbg %r13, %r3, 40, 191, 0 ; CHECK-NEXT: #APP -; CHECK-NEXT: clcl %r2, %r4 +; CHECK-NEXT: clcl %r12, %r0 ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: ogr %r3, %r5 -; CHECK-NEXT: risbg %r0, %r3, 40, 191, 0 +; CHECK-NEXT: ogr %r13, %r1 +; CHECK-NEXT: risbg %r0, %r13, 40, 191, 0 ; CHECK-NEXT: ipm %r2 ; CHECK-NEXT: afi %r2, -268435456 ; CHECK-NEXT: srl %r2, 31 +; CHECK-NEXT: lmg %r12, %r15, 96(%r15) ; CHECK-NEXT: br %r14 entry: %0 = ptrtoint ptr %p1 to i64 diff --git a/llvm/test/CodeGen/SystemZ/int-add-08.ll b/llvm/test/CodeGen/SystemZ/int-add-08.ll index 6f221040156af3..2c5548c215b8c8 100644 --- a/llvm/test/CodeGen/SystemZ/int-add-08.ll +++ b/llvm/test/CodeGen/SystemZ/int-add-08.ll @@ -114,7 +114,7 @@ define void @f7(ptr %aptr, i64 %base) { define void @f8(ptr %ptr0) { ; CHECK-LABEL: f8: ; CHECK: brasl %r14, foo@PLT -; CHECK: alg {{%r[0-9]+}}, 
{{[0-9]+}}(%r15) +; CHECK: alcg {{%r[0-9]+}}, {{[0-9]+}}(%r15) ; CHECK: alcg {{%r[0-9]+}}, {{[0-9]+}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr i128, ptr %ptr0, i128 2 diff --git a/llvm/test/CodeGen/SystemZ/int-conv-01.ll b/llvm/test/CodeGen/SystemZ/int-conv-01.ll index 491fb95cddf72b..1b332351c8197f 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-01.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-01.ll @@ -108,7 +108,7 @@ define i32 @f9(i64 %src, i64 %index) { ; to use LB if possible. define void @f10(ptr %ptr) { ; CHECK-LABEL: f10: -; CHECK: lb {{%r[0-9]+}}, 183(%r15) +; CHECK: lb {{%r[0-9]+}}, 171(%r15) ; CHECK: br %r14 %val0 = load volatile i32, ptr %ptr %val1 = load volatile i32, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-02.ll b/llvm/test/CodeGen/SystemZ/int-conv-02.ll index 6c33ee1098ff71..bc7260c5de111b 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-02.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-02.ll @@ -118,7 +118,7 @@ define i32 @f10(i64 %src, i64 %index) { ; to use LLC if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llc {{%r[0-9]+}}, 179(%r15) +; CHECK: llc {{%r[0-9]+}}, 175(%r15) ; CHECK: br %r14 %val0 = load volatile i32, ptr %ptr %val1 = load volatile i32, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-03.ll b/llvm/test/CodeGen/SystemZ/int-conv-03.ll index 41f2f87186a5ef..38cc37e853b8c2 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-03.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-03.ll @@ -108,7 +108,7 @@ define i64 @f9(i64 %src, i64 %index) { ; to use LGB if possible. define void @f10(ptr %ptr) { ; CHECK-LABEL: f10: -; CHECK: lgb {{%r[0-9]+}}, 199(%r15) +; CHECK: lgb {{%r[0-9]+}}, 183(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-04.ll b/llvm/test/CodeGen/SystemZ/int-conv-04.ll index 5c808920ff25e7..c35cebd77ecb27 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-04.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-04.ll @@ -117,7 +117,7 @@ define i64 @f10(i64 %src, i64 %index) { ; to use LLGC if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llgc {{%r[0-9]+}}, 199(%r15) +; CHECK: llgc {{%r[0-9]+}}, 183(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-06.ll b/llvm/test/CodeGen/SystemZ/int-conv-06.ll index 1163e1e04ce6cf..9878499d1f40f8 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-06.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-06.ll @@ -118,7 +118,7 @@ define i32 @f10(i64 %src, i64 %index) { ; to use LLH if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llh {{%r[0-9]+}}, 178(%r15) +; CHECK: llh {{%r[0-9]+}}, 174(%r15) ; CHECK: br %r14 %val0 = load volatile i32, ptr %ptr %val1 = load volatile i32, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-07.ll b/llvm/test/CodeGen/SystemZ/int-conv-07.ll index bc2895da2cde0a..69de6ffc261191 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-07.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-07.ll @@ -108,7 +108,7 @@ define i64 @f9(i64 %src, i64 %index) { ; to use LGH if possible. 
define void @f10(ptr %ptr) { ; CHECK-LABEL: f10: -; CHECK: lgh {{%r[0-9]+}}, 198(%r15) +; CHECK: lgh {{%r[0-9]+}}, 182(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-08.ll b/llvm/test/CodeGen/SystemZ/int-conv-08.ll index 82f2bcea4af780..aa43f80225fa45 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-08.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-08.ll @@ -117,7 +117,7 @@ define i64 @f10(i64 %src, i64 %index) { ; to use LLGH if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llgh {{%r[0-9]+}}, 198(%r15) +; CHECK: llgh {{%r[0-9]+}}, 182(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-sub-05.ll b/llvm/test/CodeGen/SystemZ/int-sub-05.ll index 1a2a2650c9e29c..0c4aca16f70c67 100644 --- a/llvm/test/CodeGen/SystemZ/int-sub-05.ll +++ b/llvm/test/CodeGen/SystemZ/int-sub-05.ll @@ -123,7 +123,7 @@ define void @f7(i64 %base) { define void @f8(ptr %ptr0) { ; CHECK-LABEL: f8: ; CHECK: brasl %r14, foo@PLT -; CHECK: slg {{%r[0-9]+}}, {{[0-9]+}}(%r15) +; CHECK: slbg {{%r[0-9]+}}, {{[0-9]+}}(%r15) ; CHECK: slbg {{%r[0-9]+}}, {{[0-9]+}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr i128, ptr %ptr0, i128 2 diff --git a/llvm/test/CodeGen/SystemZ/int-uadd-01.ll b/llvm/test/CodeGen/SystemZ/int-uadd-01.ll index 7d4aa9f3218485..7632abb38d75d2 100644 --- a/llvm/test/CodeGen/SystemZ/int-uadd-01.ll +++ b/llvm/test/CodeGen/SystemZ/int-uadd-01.ll @@ -271,39 +271,39 @@ define zeroext i1 @f14(ptr %ptr0) { ; CHECK-NEXT: .cfi_offset %r15, -40 ; CHECK-NEXT: aghi %r15, -168 ; CHECK-NEXT: .cfi_def_cfa_offset 328 -; CHECK-NEXT: l %r6, 0(%r2) -; CHECK-NEXT: l %r13, 8(%r2) -; CHECK-NEXT: l %r12, 16(%r2) -; CHECK-NEXT: l %r7, 24(%r2) +; CHECK-NEXT: l %r12, 0(%r2) +; CHECK-NEXT: l %r11, 8(%r2) +; CHECK-NEXT: l %r10, 16(%r2) +; CHECK-NEXT: l %r9, 24(%r2) ; CHECK-NEXT: l %r8, 32(%r2) -; CHECK-NEXT: l %r9, 40(%r2) -; CHECK-NEXT: l %r10, 48(%r2) -; CHECK-NEXT: l %r11, 56(%r2) +; CHECK-NEXT: l %r7, 40(%r2) +; CHECK-NEXT: l %r6, 48(%r2) +; CHECK-NEXT: l %r13, 56(%r2) ; CHECK-NEXT: mvc 160(4,%r15), 64(%r2) # 4-byte Folded Spill ; CHECK-NEXT: mvc 164(4,%r15), 72(%r2) # 4-byte Folded Spill ; CHECK-NEXT: brasl %r14, foo@PLT -; CHECK-NEXT: alr %r2, %r6 +; CHECK-NEXT: alr %r2, %r12 ; CHECK-NEXT: ipm %r0 ; CHECK-NEXT: risbg %r0, %r0, 63, 191, 35 -; CHECK-NEXT: alr %r2, %r13 +; CHECK-NEXT: alr %r2, %r11 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: alr %r2, %r12 +; CHECK-NEXT: alr %r2, %r10 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: alr %r2, %r7 +; CHECK-NEXT: alr %r2, %r9 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 ; CHECK-NEXT: alr %r2, %r8 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: alr %r2, %r9 +; CHECK-NEXT: alr %r2, %r7 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: alr %r2, %r10 +; CHECK-NEXT: alr %r2, %r6 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: alr %r2, %r11 +; CHECK-NEXT: alr %r2, %r13 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 ; CHECK-NEXT: al %r2, 160(%r15) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/SystemZ/int-uadd-02.ll b/llvm/test/CodeGen/SystemZ/int-uadd-02.ll index 46c5b4ff35fc05..15d4852a91cc93 100644 --- a/llvm/test/CodeGen/SystemZ/int-uadd-02.ll +++ b/llvm/test/CodeGen/SystemZ/int-uadd-02.ll @@ -215,39 +215,39 
@@ define zeroext i1 @f11(ptr %ptr0) { ; CHECK-NEXT: .cfi_offset %r15, -40 ; CHECK-NEXT: aghi %r15, -176 ; CHECK-NEXT: .cfi_def_cfa_offset 336 -; CHECK-NEXT: lg %r6, 0(%r2) -; CHECK-NEXT: lg %r13, 16(%r2) -; CHECK-NEXT: lg %r12, 32(%r2) -; CHECK-NEXT: lg %r7, 48(%r2) +; CHECK-NEXT: lg %r12, 0(%r2) +; CHECK-NEXT: lg %r11, 16(%r2) +; CHECK-NEXT: lg %r10, 32(%r2) +; CHECK-NEXT: lg %r9, 48(%r2) ; CHECK-NEXT: lg %r8, 64(%r2) -; CHECK-NEXT: lg %r9, 80(%r2) -; CHECK-NEXT: lg %r10, 96(%r2) -; CHECK-NEXT: lg %r11, 112(%r2) +; CHECK-NEXT: lg %r7, 80(%r2) +; CHECK-NEXT: lg %r6, 96(%r2) +; CHECK-NEXT: lg %r13, 112(%r2) ; CHECK-NEXT: mvc 160(8,%r15), 128(%r2) # 8-byte Folded Spill ; CHECK-NEXT: mvc 168(8,%r15), 144(%r2) # 8-byte Folded Spill ; CHECK-NEXT: brasl %r14, foo@PLT -; CHECK-NEXT: algr %r2, %r6 +; CHECK-NEXT: algr %r2, %r12 ; CHECK-NEXT: ipm %r0 ; CHECK-NEXT: risbg %r0, %r0, 63, 191, 35 -; CHECK-NEXT: algr %r2, %r13 +; CHECK-NEXT: algr %r2, %r11 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: algr %r2, %r12 +; CHECK-NEXT: algr %r2, %r10 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: algr %r2, %r7 +; CHECK-NEXT: algr %r2, %r9 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 ; CHECK-NEXT: algr %r2, %r8 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: algr %r2, %r9 +; CHECK-NEXT: algr %r2, %r7 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: algr %r2, %r10 +; CHECK-NEXT: algr %r2, %r6 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 -; CHECK-NEXT: algr %r2, %r11 +; CHECK-NEXT: algr %r2, %r13 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 35 ; CHECK-NEXT: alg %r2, 160(%r15) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/SystemZ/int-usub-01.ll b/llvm/test/CodeGen/SystemZ/int-usub-01.ll index f9032ea64b6426..cb98c22efac54a 100644 --- a/llvm/test/CodeGen/SystemZ/int-usub-01.ll +++ b/llvm/test/CodeGen/SystemZ/int-usub-01.ll @@ -282,38 +282,38 @@ define zeroext i1 @f14(ptr %ptr0) { ; CHECK-NEXT: .cfi_offset %r15, -40 ; CHECK-NEXT: aghi %r15, -168 ; CHECK-NEXT: .cfi_def_cfa_offset 328 -; CHECK-NEXT: l %r6, 0(%r2) -; CHECK-NEXT: l %r13, 8(%r2) -; CHECK-NEXT: l %r12, 16(%r2) -; CHECK-NEXT: l %r7, 24(%r2) -; CHECK-NEXT: l %r8, 32(%r2) -; CHECK-NEXT: l %r9, 40(%r2) +; CHECK-NEXT: l %r9, 0(%r2) +; CHECK-NEXT: l %r8, 8(%r2) +; CHECK-NEXT: l %r7, 16(%r2) +; CHECK-NEXT: l %r6, 24(%r2) +; CHECK-NEXT: l %r13, 32(%r2) +; CHECK-NEXT: l %r12, 40(%r2) ; CHECK-NEXT: l %r10, 48(%r2) ; CHECK-NEXT: l %r11, 56(%r2) ; CHECK-NEXT: mvc 160(4,%r15), 64(%r2) # 4-byte Folded Spill ; CHECK-NEXT: mvc 164(4,%r15), 72(%r2) # 4-byte Folded Spill ; CHECK-NEXT: brasl %r14, foo@PLT -; CHECK-NEXT: slr %r2, %r6 +; CHECK-NEXT: slr %r2, %r9 ; CHECK-NEXT: ipm %r0 ; CHECK-NEXT: afi %r0, -536870912 ; CHECK-NEXT: srl %r0, 31 -; CHECK-NEXT: slr %r2, %r13 +; CHECK-NEXT: slr %r2, %r8 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 -; CHECK-NEXT: slr %r2, %r12 +; CHECK-NEXT: slr %r2, %r7 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 -; CHECK-NEXT: slr %r2, %r7 +; CHECK-NEXT: slr %r2, %r6 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 -; CHECK-NEXT: slr %r2, %r8 +; CHECK-NEXT: slr %r2, %r13 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 -; CHECK-NEXT: slr %r2, %r9 +; CHECK-NEXT: slr %r2, 
%r12 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 diff --git a/llvm/test/CodeGen/SystemZ/int-usub-02.ll b/llvm/test/CodeGen/SystemZ/int-usub-02.ll index cda1a1ca8da8ec..d1049aaa5e99fe 100644 --- a/llvm/test/CodeGen/SystemZ/int-usub-02.ll +++ b/llvm/test/CodeGen/SystemZ/int-usub-02.ll @@ -223,38 +223,38 @@ define zeroext i1 @f11(ptr %ptr0) { ; CHECK-NEXT: .cfi_offset %r15, -40 ; CHECK-NEXT: aghi %r15, -176 ; CHECK-NEXT: .cfi_def_cfa_offset 336 -; CHECK-NEXT: lg %r6, 0(%r2) -; CHECK-NEXT: lg %r13, 16(%r2) -; CHECK-NEXT: lg %r12, 32(%r2) -; CHECK-NEXT: lg %r7, 48(%r2) -; CHECK-NEXT: lg %r8, 64(%r2) -; CHECK-NEXT: lg %r9, 80(%r2) +; CHECK-NEXT: lg %r9, 0(%r2) +; CHECK-NEXT: lg %r8, 16(%r2) +; CHECK-NEXT: lg %r7, 32(%r2) +; CHECK-NEXT: lg %r6, 48(%r2) +; CHECK-NEXT: lg %r13, 64(%r2) +; CHECK-NEXT: lg %r12, 80(%r2) ; CHECK-NEXT: lg %r10, 96(%r2) ; CHECK-NEXT: lg %r11, 112(%r2) ; CHECK-NEXT: mvc 160(8,%r15), 128(%r2) # 8-byte Folded Spill ; CHECK-NEXT: mvc 168(8,%r15), 144(%r2) # 8-byte Folded Spill ; CHECK-NEXT: brasl %r14, foo@PLT -; CHECK-NEXT: slgr %r2, %r6 +; CHECK-NEXT: slgr %r2, %r9 ; CHECK-NEXT: ipm %r0 ; CHECK-NEXT: afi %r0, -536870912 ; CHECK-NEXT: srl %r0, 31 -; CHECK-NEXT: slgr %r2, %r13 +; CHECK-NEXT: slgr %r2, %r8 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 -; CHECK-NEXT: slgr %r2, %r12 +; CHECK-NEXT: slgr %r2, %r7 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 -; CHECK-NEXT: slgr %r2, %r7 +; CHECK-NEXT: slgr %r2, %r6 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 -; CHECK-NEXT: slgr %r2, %r8 +; CHECK-NEXT: slgr %r2, %r13 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 -; CHECK-NEXT: slgr %r2, %r9 +; CHECK-NEXT: slgr %r2, %r12 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: afi %r1, -536870912 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 33 diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll index 5a629567d07069..f06bdc8a8727ae 100644 --- a/llvm/test/CodeGen/SystemZ/pr60413.ll +++ b/llvm/test/CodeGen/SystemZ/pr60413.ll @@ -24,59 +24,59 @@ define dso_local void @m() local_unnamed_addr #1 { ; CHECK-NEXT: clfi %r2, 128 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36 -; CHECK-NEXT: vlvgp %v1, %r2, %r0 -; CHECK-NEXT: vlvgf %v1, %r2, 0 -; CHECK-NEXT: vlvgf %v1, %r2, 2 -; CHECK-NEXT: vlvgp %v0, %r0, %r2 +; CHECK-NEXT: vlvgp %v0, %r2, %r0 +; CHECK-NEXT: vlvgf %v0, %r2, 0 +; CHECK-NEXT: vlvgf %v0, %r2, 2 +; CHECK-NEXT: vlvgp %v1, %r0, %r2 ; CHECK-NEXT: vlvgp %v2, %r2, %r2 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d ; CHECK-NEXT: nilh %r2, 255 ; CHECK-NEXT: chi %r2, 128 ; CHECK-NEXT: ipm %r2 ; CHECK-NEXT: risbg %r2, %r2, 63, 191, 36 -; CHECK-NEXT: vlvgf %v0, %r0, 0 -; CHECK-NEXT: vlvgf %v0, %r0, 2 +; CHECK-NEXT: vlvgf %v1, %r0, 0 +; CHECK-NEXT: vlvgf %v1, %r0, 2 ; CHECK-NEXT: vrepf %v2, %v2, 1 ; CHECK-NEXT: vgbm %v3, 30583 -; CHECK-NEXT: vn %v0, %v0, %v3 ; CHECK-NEXT: vn %v1, %v1, %v3 +; CHECK-NEXT: vn %v0, %v0, %v3 ; CHECK-NEXT: vn %v2, %v2, %v3 ; CHECK-NEXT: vrepif %v3, 127 -; CHECK-NEXT: vchlf %v1, %v1, %v3 -; CHECK-NEXT: vlgvf %r12, %v1, 0 -; CHECK-NEXT: vchlf %v2, %v2, %v3 -; CHECK-NEXT: vlgvf %r4, %v2, 1 -; CHECK-NEXT: nilf %r4, 1 -; CHECK-NEXT: vlgvf %r5, %v2, 0 -; CHECK-NEXT: risbg %r3, %r5, 48, 176, 15 -; CHECK-NEXT: rosbg %r3, %r4, 32, 49, 14 -; CHECK-NEXT: vlgvf %r14, %v2, 2 -; 
CHECK-NEXT: nilf %r14, 1 -; CHECK-NEXT: rosbg %r3, %r14, 32, 50, 13 -; CHECK-NEXT: vlgvf %r13, %v2, 3 -; CHECK-NEXT: nilf %r13, 1 -; CHECK-NEXT: rosbg %r3, %r13, 32, 51, 12 -; CHECK-NEXT: rosbg %r3, %r12, 52, 52, 11 -; CHECK-NEXT: vlgvf %r12, %v1, 1 -; CHECK-NEXT: rosbg %r3, %r12, 53, 53, 10 -; CHECK-NEXT: vlgvf %r12, %v1, 2 -; CHECK-NEXT: rosbg %r3, %r12, 54, 54, 9 -; CHECK-NEXT: vlgvf %r12, %v1, 3 -; CHECK-NEXT: rosbg %r3, %r12, 55, 55, 8 ; CHECK-NEXT: vchlf %v0, %v0, %v3 -; CHECK-NEXT: vlgvf %r12, %v0, 0 -; CHECK-NEXT: rosbg %r3, %r12, 56, 56, 7 -; CHECK-NEXT: vlgvf %r12, %v0, 1 -; CHECK-NEXT: rosbg %r3, %r12, 57, 57, 6 -; CHECK-NEXT: vlgvf %r12, %v0, 2 -; CHECK-NEXT: rosbg %r3, %r12, 58, 58, 5 -; CHECK-NEXT: vlgvf %r12, %v0, 3 -; CHECK-NEXT: rosbg %r3, %r12, 59, 59, 4 +; CHECK-NEXT: vlgvf %r4, %v0, 0 +; CHECK-NEXT: vchlf %v2, %v2, %v3 +; CHECK-NEXT: vlgvf %r5, %v2, 1 ; CHECK-NEXT: nilf %r5, 1 -; CHECK-NEXT: rosbg %r3, %r5, 32, 60, 3 -; CHECK-NEXT: rosbg %r3, %r4, 32, 61, 2 -; CHECK-NEXT: rosbg %r3, %r14, 32, 62, 1 -; CHECK-NEXT: or %r3, %r13 +; CHECK-NEXT: vlgvf %r14, %v2, 0 +; CHECK-NEXT: risbg %r3, %r14, 48, 176, 15 +; CHECK-NEXT: rosbg %r3, %r5, 32, 49, 14 +; CHECK-NEXT: vlgvf %r13, %v2, 2 +; CHECK-NEXT: nilf %r13, 1 +; CHECK-NEXT: rosbg %r3, %r13, 32, 50, 13 +; CHECK-NEXT: vlgvf %r12, %v2, 3 +; CHECK-NEXT: nilf %r12, 1 +; CHECK-NEXT: rosbg %r3, %r12, 32, 51, 12 +; CHECK-NEXT: rosbg %r3, %r4, 52, 52, 11 +; CHECK-NEXT: vlgvf %r4, %v0, 1 +; CHECK-NEXT: rosbg %r3, %r4, 53, 53, 10 +; CHECK-NEXT: vlgvf %r4, %v0, 2 +; CHECK-NEXT: rosbg %r3, %r4, 54, 54, 9 +; CHECK-NEXT: vlgvf %r4, %v0, 3 +; CHECK-NEXT: rosbg %r3, %r4, 55, 55, 8 +; CHECK-NEXT: vchlf %v0, %v1, %v3 +; CHECK-NEXT: vlgvf %r4, %v0, 0 +; CHECK-NEXT: rosbg %r3, %r4, 56, 56, 7 +; CHECK-NEXT: vlgvf %r4, %v0, 1 +; CHECK-NEXT: rosbg %r3, %r4, 57, 57, 6 +; CHECK-NEXT: vlgvf %r4, %v0, 2 +; CHECK-NEXT: rosbg %r3, %r4, 58, 58, 5 +; CHECK-NEXT: vlgvf %r4, %v0, 3 +; CHECK-NEXT: rosbg %r3, %r4, 59, 59, 4 +; CHECK-NEXT: nilf %r14, 1 +; CHECK-NEXT: rosbg %r3, %r14, 32, 60, 3 +; CHECK-NEXT: rosbg %r3, %r5, 32, 61, 2 +; CHECK-NEXT: rosbg %r3, %r13, 32, 62, 1 +; CHECK-NEXT: or %r3, %r12 ; CHECK-NEXT: vlgvb %r5, %v0, 1 ; CHECK-NEXT: vlgvb %r4, %v0, 0 ; CHECK-NEXT: risbg %r4, %r4, 48, 176, 15 diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll index 1507f2c3581b2a..454b34ddfac6a1 100644 --- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll +++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll @@ -29,39 +29,39 @@ define i16 @fun1(<16 x i1> %src) ; CHECK: # %bb.0: ; CHECK-NEXT: aghi %r15, -168 ; CHECK-NEXT: .cfi_def_cfa_offset 328 -; CHECK-NEXT: vlgvb %r0, %v24, 0 -; CHECK-NEXT: vlgvb %r1, %v24, 1 -; CHECK-NEXT: risblg %r0, %r0, 16, 144, 15 -; CHECK-NEXT: rosbg %r0, %r1, 49, 49, 14 -; CHECK-NEXT: vlgvb %r1, %v24, 2 -; CHECK-NEXT: rosbg %r0, %r1, 50, 50, 13 -; CHECK-NEXT: vlgvb %r1, %v24, 3 -; CHECK-NEXT: rosbg %r0, %r1, 51, 51, 12 -; CHECK-NEXT: vlgvb %r1, %v24, 4 -; CHECK-NEXT: rosbg %r0, %r1, 52, 52, 11 -; CHECK-NEXT: vlgvb %r1, %v24, 5 -; CHECK-NEXT: rosbg %r0, %r1, 53, 53, 10 -; CHECK-NEXT: vlgvb %r1, %v24, 6 -; CHECK-NEXT: rosbg %r0, %r1, 54, 54, 9 -; CHECK-NEXT: vlgvb %r1, %v24, 7 -; CHECK-NEXT: rosbg %r0, %r1, 55, 55, 8 -; CHECK-NEXT: vlgvb %r1, %v24, 8 -; CHECK-NEXT: rosbg %r0, %r1, 56, 56, 7 -; CHECK-NEXT: vlgvb %r1, %v24, 9 -; CHECK-NEXT: rosbg %r0, %r1, 57, 57, 6 -; CHECK-NEXT: vlgvb %r1, %v24, 10 -; CHECK-NEXT: rosbg %r0, %r1, 58, 58, 5 -; CHECK-NEXT: vlgvb 
%r1, %v24, 11 -; CHECK-NEXT: rosbg %r0, %r1, 59, 59, 4 -; CHECK-NEXT: vlgvb %r1, %v24, 12 -; CHECK-NEXT: rosbg %r0, %r1, 60, 60, 3 -; CHECK-NEXT: vlgvb %r1, %v24, 13 -; CHECK-NEXT: rosbg %r0, %r1, 61, 61, 2 -; CHECK-NEXT: vlgvb %r1, %v24, 14 -; CHECK-NEXT: rosbg %r0, %r1, 62, 62, 1 -; CHECK-NEXT: vlgvb %r1, %v24, 15 -; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 0 -; CHECK-NEXT: llhr %r2, %r0 +; CHECK-NEXT: vlgvb %r1, %v24, 0 +; CHECK-NEXT: vlgvb %r0, %v24, 1 +; CHECK-NEXT: risblg %r1, %r1, 16, 144, 15 +; CHECK-NEXT: rosbg %r1, %r0, 49, 49, 14 +; CHECK-NEXT: vlgvb %r0, %v24, 2 +; CHECK-NEXT: rosbg %r1, %r0, 50, 50, 13 +; CHECK-NEXT: vlgvb %r0, %v24, 3 +; CHECK-NEXT: rosbg %r1, %r0, 51, 51, 12 +; CHECK-NEXT: vlgvb %r0, %v24, 4 +; CHECK-NEXT: rosbg %r1, %r0, 52, 52, 11 +; CHECK-NEXT: vlgvb %r0, %v24, 5 +; CHECK-NEXT: rosbg %r1, %r0, 53, 53, 10 +; CHECK-NEXT: vlgvb %r0, %v24, 6 +; CHECK-NEXT: rosbg %r1, %r0, 54, 54, 9 +; CHECK-NEXT: vlgvb %r0, %v24, 7 +; CHECK-NEXT: rosbg %r1, %r0, 55, 55, 8 +; CHECK-NEXT: vlgvb %r0, %v24, 8 +; CHECK-NEXT: rosbg %r1, %r0, 56, 56, 7 +; CHECK-NEXT: vlgvb %r0, %v24, 9 +; CHECK-NEXT: rosbg %r1, %r0, 57, 57, 6 +; CHECK-NEXT: vlgvb %r0, %v24, 10 +; CHECK-NEXT: rosbg %r1, %r0, 58, 58, 5 +; CHECK-NEXT: vlgvb %r0, %v24, 11 +; CHECK-NEXT: rosbg %r1, %r0, 59, 59, 4 +; CHECK-NEXT: vlgvb %r0, %v24, 12 +; CHECK-NEXT: rosbg %r1, %r0, 60, 60, 3 +; CHECK-NEXT: vlgvb %r0, %v24, 13 +; CHECK-NEXT: rosbg %r1, %r0, 61, 61, 2 +; CHECK-NEXT: vlgvb %r0, %v24, 14 +; CHECK-NEXT: rosbg %r1, %r0, 62, 62, 1 +; CHECK-NEXT: vlgvb %r0, %v24, 15 +; CHECK-NEXT: rosbg %r1, %r0, 63, 63, 0 +; CHECK-NEXT: llhr %r2, %r1 ; CHECK-NEXT: aghi %r15, 168 ; CHECK-NEXT: br %r14 { @@ -73,40 +73,40 @@ define i16 @fun1(<16 x i1> %src) define void @fun2(<8 x i32> %src, ptr %p) ; CHECK-LABEL: fun2: ; CHECK: # %bb.0: -; CHECK-NEXT: vlgvf %r1, %v26, 3 +; CHECK-NEXT: vlgvf %r0, %v26, 3 ; CHECK-NEXT: vlgvf %r5, %v24, 0 ; CHECK-NEXT: vlgvf %r3, %v24, 1 -; CHECK-NEXT: srlk %r0, %r1, 8 -; CHECK-NEXT: sth %r0, 28(%r2) -; CHECK-NEXT: vlgvf %r0, %v24, 2 +; CHECK-NEXT: srlk %r1, %r0, 8 +; CHECK-NEXT: sth %r1, 28(%r2) +; CHECK-NEXT: vlgvf %r1, %v24, 2 ; CHECK-NEXT: sllg %r5, %r5, 33 ; CHECK-NEXT: sllg %r4, %r3, 58 -; CHECK-NEXT: risbgn %r0, %r0, 6, 164, 27 +; CHECK-NEXT: risbgn %r1, %r1, 6, 164, 27 ; CHECK-NEXT: rosbg %r5, %r3, 31, 55, 2 ; CHECK-NEXT: vlgvf %r3, %v26, 2 -; CHECK-NEXT: stc %r1, 30(%r2) -; CHECK-NEXT: ogr %r4, %r0 -; CHECK-NEXT: risbgn %r1, %r1, 33, 167, 0 +; CHECK-NEXT: stc %r0, 30(%r2) +; CHECK-NEXT: ogr %r4, %r1 +; CHECK-NEXT: risbgn %r0, %r0, 33, 167, 0 ; CHECK-NEXT: rosbg %r5, %r4, 56, 63, 8 ; CHECK-NEXT: risbgn %r3, %r3, 2, 160, 31 -; CHECK-NEXT: ogr %r1, %r3 +; CHECK-NEXT: ogr %r0, %r3 ; CHECK-NEXT: vlgvf %r4, %v24, 3 -; CHECK-NEXT: srlg %r1, %r1, 24 -; CHECK-NEXT: rosbg %r0, %r4, 37, 63, 60 -; CHECK-NEXT: st %r1, 24(%r2) -; CHECK-NEXT: vlgvf %r1, %v26, 0 +; CHECK-NEXT: srlg %r0, %r0, 24 +; CHECK-NEXT: rosbg %r1, %r4, 37, 63, 60 +; CHECK-NEXT: st %r0, 24(%r2) +; CHECK-NEXT: vlgvf %r0, %v26, 0 ; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: risbgn %r1, %r1, 4, 162, 29 +; CHECK-NEXT: risbgn %r0, %r0, 4, 162, 29 ; CHECK-NEXT: sllg %r5, %r4, 60 -; CHECK-NEXT: ogr %r5, %r1 -; CHECK-NEXT: sllg %r0, %r0, 8 -; CHECK-NEXT: rosbg %r0, %r5, 56, 63, 8 -; CHECK-NEXT: stg %r0, 8(%r2) -; CHECK-NEXT: vlgvf %r0, %v26, 1 -; CHECK-NEXT: sllg %r4, %r0, 62 +; CHECK-NEXT: ogr %r5, %r0 +; CHECK-NEXT: sllg %r1, %r1, 8 +; CHECK-NEXT: rosbg %r1, %r5, 56, 63, 8 +; CHECK-NEXT: stg %r1, 8(%r2) +; CHECK-NEXT: vlgvf %r1, %v26, 1 +; 
CHECK-NEXT: sllg %r4, %r1, 62 ; CHECK-NEXT: ogr %r3, %r4 -; CHECK-NEXT: rosbg %r1, %r0, 35, 63, 62 -; CHECK-NEXT: sllg %r0, %r1, 8 +; CHECK-NEXT: rosbg %r0, %r1, 35, 63, 62 +; CHECK-NEXT: sllg %r0, %r0, 8 ; CHECK-NEXT: rosbg %r0, %r3, 56, 63, 8 ; CHECK-NEXT: stg %r0, 16(%r2) ; CHECK-NEXT: br %r14 diff --git a/llvm/test/CodeGen/Thumb/arm_q15_to_q31.ll b/llvm/test/CodeGen/Thumb/arm_q15_to_q31.ll index fdfbf3393098e4..f6fcd4534aae15 100644 --- a/llvm/test/CodeGen/Thumb/arm_q15_to_q31.ll +++ b/llvm/test/CodeGen/Thumb/arm_q15_to_q31.ll @@ -7,132 +7,137 @@ define void @arm_q15_to_q31(ptr nocapture noundef readonly %pSrc, ptr nocapture ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: mov r7, r2 +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, #12 ; CHECK-NEXT: lsrs r3, r2, #2 ; CHECK-NEXT: beq .LBB0_6 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader ; CHECK-NEXT: movs r5, #3 ; CHECK-NEXT: ands r5, r3 -; CHECK-NEXT: subs r2, r3, #1 +; CHECK-NEXT: subs r4, r3, #1 ; CHECK-NEXT: cbz r5, .LBB0_4 ; CHECK-NEXT: @ %bb.2: @ %while.body.prol -; CHECK-NEXT: str r2, [sp] @ 4-byte Spill -; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: ldrh r2, [r0] +; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: ldrh r7, [r0, #2] ; CHECK-NEXT: ldrh r4, [r0, #4] ; CHECK-NEXT: ldrh r6, [r0, #6] ; CHECK-NEXT: lsls r6, r6, #16 +; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: lsls r4, r4, #16 ; CHECK-NEXT: lsls r7, r7, #16 -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: stm r1!, {r2, r7} +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: lsls r6, r6, #16 +; CHECK-NEXT: stm r1!, {r6, r7} ; CHECK-NEXT: str r4, [r1] -; CHECK-NEXT: str r6, [r1, #4] +; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload +; CHECK-NEXT: str r4, [r1, #4] ; CHECK-NEXT: subs r1, #8 ; CHECK-NEXT: cmp r5, #1 ; CHECK-NEXT: bne .LBB0_11 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: .LBB0_4: @ %while.body.prol.loopexit -; CHECK-NEXT: cmp r2, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB0_6 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: ldrh r4, [r0, #2] -; CHECK-NEXT: ldrh r5, [r0, #4] -; CHECK-NEXT: ldrh r6, [r0, #6] +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: ldrh r5, [r0, #2] +; CHECK-NEXT: ldrh r6, [r0, #4] +; CHECK-NEXT: ldrh r7, [r0, #6] +; CHECK-NEXT: lsls r7, r7, #16 +; CHECK-NEXT: str r7, [r1, #12] ; CHECK-NEXT: lsls r6, r6, #16 -; CHECK-NEXT: str r6, [r1, #12] +; CHECK-NEXT: str r6, [r1, #8] ; CHECK-NEXT: lsls r5, r5, #16 -; CHECK-NEXT: str r5, [r1, #8] +; CHECK-NEXT: str r5, [r1, #4] ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: str r4, [r1, #4] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1] -; CHECK-NEXT: ldrh r2, [r0, #8] -; CHECK-NEXT: ldrh r4, [r0, #10] -; CHECK-NEXT: ldrh r5, [r0, #12] -; CHECK-NEXT: ldrh r6, [r0, #14] +; CHECK-NEXT: str r4, [r1] +; CHECK-NEXT: ldrh r4, [r0, #8] +; CHECK-NEXT: ldrh r5, [r0, #10] +; CHECK-NEXT: ldrh r6, [r0, #12] +; CHECK-NEXT: ldrh r7, [r0, #14] +; CHECK-NEXT: lsls r7, r7, #16 +; CHECK-NEXT: str r7, [r1, #28] ; CHECK-NEXT: lsls r6, r6, #16 -; CHECK-NEXT: 
str r6, [r1, #28] +; CHECK-NEXT: str r6, [r1, #24] ; CHECK-NEXT: lsls r5, r5, #16 -; CHECK-NEXT: str r5, [r1, #24] +; CHECK-NEXT: str r5, [r1, #20] ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: str r4, [r1, #20] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #16] -; CHECK-NEXT: ldrh r2, [r0, #16] -; CHECK-NEXT: ldrh r4, [r0, #18] -; CHECK-NEXT: ldrh r5, [r0, #20] -; CHECK-NEXT: ldrh r6, [r0, #22] +; CHECK-NEXT: str r4, [r1, #16] +; CHECK-NEXT: ldrh r4, [r0, #16] +; CHECK-NEXT: ldrh r5, [r0, #18] +; CHECK-NEXT: ldrh r6, [r0, #20] +; CHECK-NEXT: ldrh r7, [r0, #22] +; CHECK-NEXT: lsls r7, r7, #16 +; CHECK-NEXT: str r7, [r1, #44] ; CHECK-NEXT: lsls r6, r6, #16 -; CHECK-NEXT: str r6, [r1, #44] +; CHECK-NEXT: str r6, [r1, #40] ; CHECK-NEXT: lsls r5, r5, #16 -; CHECK-NEXT: str r5, [r1, #40] +; CHECK-NEXT: str r5, [r1, #36] ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: str r4, [r1, #36] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #32] -; CHECK-NEXT: ldrh r2, [r0, #24] -; CHECK-NEXT: ldrh r4, [r0, #26] -; CHECK-NEXT: ldrh r5, [r0, #28] -; CHECK-NEXT: ldrh r6, [r0, #30] +; CHECK-NEXT: str r4, [r1, #32] +; CHECK-NEXT: ldrh r4, [r0, #24] +; CHECK-NEXT: ldrh r5, [r0, #26] +; CHECK-NEXT: ldrh r6, [r0, #28] +; CHECK-NEXT: ldrh r7, [r0, #30] +; CHECK-NEXT: lsls r7, r7, #16 +; CHECK-NEXT: str r7, [r1, #60] ; CHECK-NEXT: lsls r6, r6, #16 -; CHECK-NEXT: str r6, [r1, #60] +; CHECK-NEXT: str r6, [r1, #56] ; CHECK-NEXT: lsls r5, r5, #16 -; CHECK-NEXT: str r5, [r1, #56] +; CHECK-NEXT: str r5, [r1, #52] ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: str r4, [r1, #52] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #48] +; CHECK-NEXT: str r4, [r1, #48] ; CHECK-NEXT: adds r1, #64 ; CHECK-NEXT: adds r0, #32 ; CHECK-NEXT: subs r3, r3, #4 ; CHECK-NEXT: bne .LBB0_5 ; CHECK-NEXT: .LBB0_6: @ %while.end -; CHECK-NEXT: movs r2, #3 -; CHECK-NEXT: ands r7, r2 +; CHECK-NEXT: movs r3, #3 +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.7: @ %while.body12 -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1] -; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: ldrh r3, [r0] +; CHECK-NEXT: lsls r3, r3, #16 +; CHECK-NEXT: str r3, [r1] +; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.8: @ %while.body12.1 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #4] -; CHECK-NEXT: cmp r7, #2 +; CHECK-NEXT: ldrh r3, [r0, #2] +; CHECK-NEXT: lsls r3, r3, #16 +; CHECK-NEXT: str r3, [r1, #4] +; CHECK-NEXT: cmp r2, #2 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.9: @ %while.body12.2 ; CHECK-NEXT: ldrh r0, [r0, #4] ; CHECK-NEXT: lsls r0, r0, #16 ; CHECK-NEXT: str r0, [r1, #8] ; CHECK-NEXT: .LBB0_10: @ %while.end17 -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: add sp, #12 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .LBB0_11: @ %while.body.prol.1 -; CHECK-NEXT: ldrh r2, [r0, #8] -; CHECK-NEXT: ldrh r4, [r0, #10] -; CHECK-NEXT: ldrh r6, [r0, #12] -; CHECK-NEXT: ldrh r7, [r0, #14] +; CHECK-NEXT: ldrh r4, [r0, #8] +; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldrh r6, [r0, #10] +; CHECK-NEXT: ldrh r7, [r0, #12] +; CHECK-NEXT: ldrh r4, [r0, #14] +; CHECK-NEXT: lsls r4, r4, #16 +; CHECK-NEXT: str r4, [sp] @ 4-byte Spill ; CHECK-NEXT: lsls r7, r7, #16 ; CHECK-NEXT: lsls r6, r6, #16 +; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #16] -; CHECK-NEXT: str r4, [r1, #20] -; CHECK-NEXT: str r6, 
[r1, #24] -; CHECK-NEXT: str r7, [r1, #28] +; CHECK-NEXT: str r4, [r1, #16] +; CHECK-NEXT: str r6, [r1, #20] +; CHECK-NEXT: str r7, [r1, #24] +; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload +; CHECK-NEXT: str r4, [r1, #28] ; CHECK-NEXT: cmp r5, #2 ; CHECK-NEXT: bne .LBB0_13 ; CHECK-NEXT: @ %bb.12: @@ -141,24 +146,24 @@ define void @arm_q15_to_q31(ptr nocapture noundef readonly %pSrc, ptr nocapture ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: b .LBB0_14 ; CHECK-NEXT: .LBB0_13: @ %while.body.prol.2 -; CHECK-NEXT: ldrh r2, [r0, #16] -; CHECK-NEXT: ldrh r4, [r0, #18] -; CHECK-NEXT: ldrh r5, [r0, #20] -; CHECK-NEXT: ldrh r6, [r0, #22] +; CHECK-NEXT: ldrh r4, [r0, #16] +; CHECK-NEXT: ldrh r5, [r0, #18] +; CHECK-NEXT: ldrh r6, [r0, #20] +; CHECK-NEXT: ldrh r7, [r0, #22] +; CHECK-NEXT: lsls r7, r7, #16 ; CHECK-NEXT: lsls r6, r6, #16 ; CHECK-NEXT: lsls r5, r5, #16 ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: mov r7, r1 -; CHECK-NEXT: adds r7, #32 -; CHECK-NEXT: stm r7!, {r2, r4, r5, r6} +; CHECK-NEXT: str r4, [r1, #32] +; CHECK-NEXT: str r5, [r1, #36] +; CHECK-NEXT: str r6, [r1, #40] +; CHECK-NEXT: str r7, [r1, #44] ; CHECK-NEXT: subs r3, r3, #3 ; CHECK-NEXT: adds r1, #48 ; CHECK-NEXT: adds r0, #24 ; CHECK-NEXT: .LBB0_14: @ %while.body.prol.loopexit -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r2, #3 +; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: bhs .LBB0_5 ; CHECK-NEXT: b .LBB0_6 entry: @@ -414,133 +419,138 @@ define void @arm_q15_to_q31_altorder(ptr nocapture noundef readonly %pSrc, ptr n ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: mov r7, r2 +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, #12 ; CHECK-NEXT: lsrs r3, r2, #2 ; CHECK-NEXT: beq .LBB1_6 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader ; CHECK-NEXT: movs r5, #3 ; CHECK-NEXT: ands r5, r3 -; CHECK-NEXT: subs r2, r3, #1 +; CHECK-NEXT: subs r4, r3, #1 ; CHECK-NEXT: cbz r5, .LBB1_4 ; CHECK-NEXT: @ %bb.2: @ %while.body.prol -; CHECK-NEXT: str r2, [sp] @ 4-byte Spill -; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: ldrh r2, [r0] +; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: ldrh r7, [r0, #2] ; CHECK-NEXT: ldrh r4, [r0, #4] ; CHECK-NEXT: ldrh r6, [r0, #6] ; CHECK-NEXT: lsls r6, r6, #16 +; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: lsls r4, r4, #16 ; CHECK-NEXT: lsls r7, r7, #16 -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: stm r1!, {r2, r7} +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: lsls r6, r6, #16 +; CHECK-NEXT: stm r1!, {r6, r7} ; CHECK-NEXT: str r4, [r1] -; CHECK-NEXT: str r6, [r1, #4] +; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload +; CHECK-NEXT: str r4, [r1, #4] ; CHECK-NEXT: subs r1, #8 ; CHECK-NEXT: cmp r5, #1 ; CHECK-NEXT: bne .LBB1_11 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: .LBB1_4: @ %while.body.prol.loopexit -; CHECK-NEXT: cmp r2, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB1_6 ; CHECK-NEXT: .LBB1_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrh r2, [r0] -; 
CHECK-NEXT: ldrh r4, [r0, #2] -; CHECK-NEXT: ldrh r5, [r0, #4] -; CHECK-NEXT: ldrh r6, [r0, #6] +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: ldrh r5, [r0, #2] +; CHECK-NEXT: ldrh r6, [r0, #4] +; CHECK-NEXT: ldrh r7, [r0, #6] +; CHECK-NEXT: lsls r7, r7, #16 +; CHECK-NEXT: str r7, [r1, #12] ; CHECK-NEXT: lsls r6, r6, #16 -; CHECK-NEXT: str r6, [r1, #12] +; CHECK-NEXT: str r6, [r1, #8] ; CHECK-NEXT: lsls r5, r5, #16 -; CHECK-NEXT: str r5, [r1, #8] +; CHECK-NEXT: str r5, [r1, #4] ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: str r4, [r1, #4] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1] -; CHECK-NEXT: ldrh r2, [r0, #8] -; CHECK-NEXT: ldrh r4, [r0, #10] -; CHECK-NEXT: ldrh r5, [r0, #12] -; CHECK-NEXT: ldrh r6, [r0, #14] +; CHECK-NEXT: str r4, [r1] +; CHECK-NEXT: ldrh r4, [r0, #8] +; CHECK-NEXT: ldrh r5, [r0, #10] +; CHECK-NEXT: ldrh r6, [r0, #12] +; CHECK-NEXT: ldrh r7, [r0, #14] +; CHECK-NEXT: lsls r7, r7, #16 +; CHECK-NEXT: str r7, [r1, #28] ; CHECK-NEXT: lsls r6, r6, #16 -; CHECK-NEXT: str r6, [r1, #28] +; CHECK-NEXT: str r6, [r1, #24] ; CHECK-NEXT: lsls r5, r5, #16 -; CHECK-NEXT: str r5, [r1, #24] +; CHECK-NEXT: str r5, [r1, #20] ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: str r4, [r1, #20] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #16] -; CHECK-NEXT: ldrh r2, [r0, #16] -; CHECK-NEXT: ldrh r4, [r0, #18] -; CHECK-NEXT: ldrh r5, [r0, #20] -; CHECK-NEXT: ldrh r6, [r0, #22] +; CHECK-NEXT: str r4, [r1, #16] +; CHECK-NEXT: ldrh r4, [r0, #16] +; CHECK-NEXT: ldrh r5, [r0, #18] +; CHECK-NEXT: ldrh r6, [r0, #20] +; CHECK-NEXT: ldrh r7, [r0, #22] +; CHECK-NEXT: lsls r7, r7, #16 +; CHECK-NEXT: str r7, [r1, #44] ; CHECK-NEXT: lsls r6, r6, #16 -; CHECK-NEXT: str r6, [r1, #44] +; CHECK-NEXT: str r6, [r1, #40] ; CHECK-NEXT: lsls r5, r5, #16 -; CHECK-NEXT: str r5, [r1, #40] +; CHECK-NEXT: str r5, [r1, #36] ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: str r4, [r1, #36] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #32] -; CHECK-NEXT: ldrh r2, [r0, #24] -; CHECK-NEXT: ldrh r4, [r0, #26] -; CHECK-NEXT: ldrh r5, [r0, #28] -; CHECK-NEXT: ldrh r6, [r0, #30] +; CHECK-NEXT: str r4, [r1, #32] +; CHECK-NEXT: ldrh r4, [r0, #24] +; CHECK-NEXT: ldrh r5, [r0, #26] +; CHECK-NEXT: ldrh r6, [r0, #28] +; CHECK-NEXT: ldrh r7, [r0, #30] +; CHECK-NEXT: lsls r7, r7, #16 +; CHECK-NEXT: str r7, [r1, #60] ; CHECK-NEXT: lsls r6, r6, #16 -; CHECK-NEXT: str r6, [r1, #60] +; CHECK-NEXT: str r6, [r1, #56] ; CHECK-NEXT: lsls r5, r5, #16 -; CHECK-NEXT: str r5, [r1, #56] +; CHECK-NEXT: str r5, [r1, #52] ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: str r4, [r1, #52] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #48] +; CHECK-NEXT: str r4, [r1, #48] ; CHECK-NEXT: adds r1, #64 ; CHECK-NEXT: subs r3, r3, #4 ; CHECK-NEXT: adds r0, #32 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: bne .LBB1_5 ; CHECK-NEXT: .LBB1_6: @ %while.end -; CHECK-NEXT: movs r2, #3 -; CHECK-NEXT: ands r7, r2 +; CHECK-NEXT: movs r3, #3 +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: beq .LBB1_10 ; CHECK-NEXT: @ %bb.7: @ %while.body12 -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1] -; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: ldrh r3, [r0] +; CHECK-NEXT: lsls r3, r3, #16 +; CHECK-NEXT: str r3, [r1] +; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: beq .LBB1_10 ; CHECK-NEXT: @ %bb.8: @ %while.body12.1 -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #4] -; CHECK-NEXT: cmp r7, #2 +; CHECK-NEXT: ldrh r3, [r0, #2] +; CHECK-NEXT: lsls r3, r3, #16 +; CHECK-NEXT: 
str r3, [r1, #4] +; CHECK-NEXT: cmp r2, #2 ; CHECK-NEXT: beq .LBB1_10 ; CHECK-NEXT: @ %bb.9: @ %while.body12.2 ; CHECK-NEXT: ldrh r0, [r0, #4] ; CHECK-NEXT: lsls r0, r0, #16 ; CHECK-NEXT: str r0, [r1, #8] ; CHECK-NEXT: .LBB1_10: @ %while.end17 -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: add sp, #12 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .LBB1_11: @ %while.body.prol.1 -; CHECK-NEXT: ldrh r2, [r0, #8] -; CHECK-NEXT: ldrh r4, [r0, #10] -; CHECK-NEXT: ldrh r6, [r0, #12] -; CHECK-NEXT: ldrh r7, [r0, #14] +; CHECK-NEXT: ldrh r4, [r0, #8] +; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldrh r6, [r0, #10] +; CHECK-NEXT: ldrh r7, [r0, #12] +; CHECK-NEXT: ldrh r4, [r0, #14] +; CHECK-NEXT: lsls r4, r4, #16 +; CHECK-NEXT: str r4, [sp] @ 4-byte Spill ; CHECK-NEXT: lsls r7, r7, #16 ; CHECK-NEXT: lsls r6, r6, #16 +; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: str r2, [r1, #16] -; CHECK-NEXT: str r4, [r1, #20] -; CHECK-NEXT: str r6, [r1, #24] -; CHECK-NEXT: str r7, [r1, #28] +; CHECK-NEXT: str r4, [r1, #16] +; CHECK-NEXT: str r6, [r1, #20] +; CHECK-NEXT: str r7, [r1, #24] +; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload +; CHECK-NEXT: str r4, [r1, #28] ; CHECK-NEXT: cmp r5, #2 ; CHECK-NEXT: bne .LBB1_13 ; CHECK-NEXT: @ %bb.12: @@ -549,24 +559,24 @@ define void @arm_q15_to_q31_altorder(ptr nocapture noundef readonly %pSrc, ptr n ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: b .LBB1_14 ; CHECK-NEXT: .LBB1_13: @ %while.body.prol.2 -; CHECK-NEXT: ldrh r2, [r0, #16] -; CHECK-NEXT: ldrh r4, [r0, #18] -; CHECK-NEXT: ldrh r5, [r0, #20] -; CHECK-NEXT: ldrh r6, [r0, #22] +; CHECK-NEXT: ldrh r4, [r0, #16] +; CHECK-NEXT: ldrh r5, [r0, #18] +; CHECK-NEXT: ldrh r6, [r0, #20] +; CHECK-NEXT: ldrh r7, [r0, #22] +; CHECK-NEXT: lsls r7, r7, #16 ; CHECK-NEXT: lsls r6, r6, #16 ; CHECK-NEXT: lsls r5, r5, #16 ; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: lsls r2, r2, #16 -; CHECK-NEXT: mov r7, r1 -; CHECK-NEXT: adds r7, #32 -; CHECK-NEXT: stm r7!, {r2, r4, r5, r6} +; CHECK-NEXT: str r4, [r1, #32] +; CHECK-NEXT: str r5, [r1, #36] +; CHECK-NEXT: str r6, [r1, #40] +; CHECK-NEXT: str r7, [r1, #44] ; CHECK-NEXT: subs r3, r3, #3 ; CHECK-NEXT: adds r1, #48 ; CHECK-NEXT: adds r0, #24 ; CHECK-NEXT: .LBB1_14: @ %while.body.prol.loopexit -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r2, #3 +; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: bhs .LBB1_5 ; CHECK-NEXT: b .LBB1_6 entry: diff --git a/llvm/test/CodeGen/Thumb/branch-to-return.ll b/llvm/test/CodeGen/Thumb/branch-to-return.ll index 5bfccc06375503..ab78c3a5a643fa 100644 --- a/llvm/test/CodeGen/Thumb/branch-to-return.ll +++ b/llvm/test/CodeGen/Thumb/branch-to-return.ll @@ -10,32 +10,33 @@ define i32 @foo(i32* %x, i32 %n) { ; CHECK-NEXT: blt .LBB0_4 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: bic r3, r1, #3 -; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: cmp r1, #4 ; CHECK-NEXT: bhs .LBB0_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: @ %middle.block ; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: bne .LBB0_5 ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB0_5: -; CHECK-NEXT: ldr.w r0, [r12] +; CHECK-NEXT: ldr.w r12, [r0] ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader1 ; CHECK-NEXT: subs r3, r1, r3 ; CHECK-NEXT: mvn r2, 
#12 ; CHECK-NEXT: and.w r1, r2, r1, lsl #2 -; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r1], #4 +; CHECK-NEXT: ldr r1, [r0], #4 ; CHECK-NEXT: subs r3, #1 -; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: add r12, r1 ; CHECK-NEXT: bne .LBB0_7 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: bx lr entry: %n.vec = and i32 %n, -4 diff --git a/llvm/test/CodeGen/Thumb/pr35836.ll b/llvm/test/CodeGen/Thumb/pr35836.ll index 96a6fe5d142025..376d8709297c0e 100644 --- a/llvm/test/CodeGen/Thumb/pr35836.ll +++ b/llvm/test/CodeGen/Thumb/pr35836.ll @@ -34,18 +34,18 @@ while.body: %shr32 = lshr i64 %add29, 32 br label %while.body } -; CHECK: adds r3, r0, r1 -; CHECK: push {r5} +; CHECK: adds r5, r0, r1 +; CHECK: push {r3} ; CHECK: pop {r1} -; CHECK: adcs r1, r5 +; CHECK: adcs r1, r3 ; CHECK: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK: ldr r2, [sp, #8] @ 4-byte Reload ; CHECK: adds r2, r0, r2 -; CHECK: push {r5} -; CHECK: pop {r4} -; CHECK: adcs r4, r5 -; CHECK: adds r0, r2, r5 ; CHECK: push {r3} +; CHECK: pop {r4} +; CHECK: adcs r4, r3 +; CHECK: adds r0, r2, r3 +; CHECK: push {r5} ; CHECK: pop {r0} ; CHECK: adcs r0, r4 ; CHECK: ldr r6, [sp, #4] @ 4-byte Reload diff --git a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll index 9b5fa1c2bc8113..0dd9e571c38280 100644 --- a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll @@ -10,11 +10,11 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-NEXT: sub sp, #60 ; THUMBV6-NEXT: mov r6, r3 ; THUMBV6-NEXT: mov r1, r2 -; THUMBV6-NEXT: str r2, [sp, #52] @ 4-byte Spill +; THUMBV6-NEXT: str r2, [sp, #48] @ 4-byte Spill ; THUMBV6-NEXT: mov r4, r0 ; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill ; THUMBV6-NEXT: ldr r2, [sp, #88] -; THUMBV6-NEXT: str r2, [sp, #48] @ 4-byte Spill +; THUMBV6-NEXT: str r2, [sp, #44] @ 4-byte Spill ; THUMBV6-NEXT: movs r5, #0 ; THUMBV6-NEXT: mov r0, r1 ; THUMBV6-NEXT: mov r1, r5 @@ -25,19 +25,19 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-NEXT: ldr r2, [sp, #96] ; THUMBV6-NEXT: str r2, [sp, #36] @ 4-byte Spill ; THUMBV6-NEXT: mov r4, r6 -; THUMBV6-NEXT: str r6, [sp, #56] @ 4-byte Spill +; THUMBV6-NEXT: str r6, [sp, #52] @ 4-byte Spill ; THUMBV6-NEXT: mov r0, r6 ; THUMBV6-NEXT: mov r1, r5 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: str r0, [sp, #56] @ 4-byte Spill ; THUMBV6-NEXT: mov r7, r1 ; THUMBV6-NEXT: subs r0, r1, #1 ; THUMBV6-NEXT: sbcs r7, r0 ; THUMBV6-NEXT: ldr r0, [sp, #100] ; THUMBV6-NEXT: str r0, [sp, #32] @ 4-byte Spill ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r6, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: ldr r6, [sp, #48] @ 4-byte Reload ; THUMBV6-NEXT: mov r2, r6 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul @@ -53,7 +53,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-NEXT: ands r4, r3 ; THUMBV6-NEXT: orrs r4, r1 ; THUMBV6-NEXT: orrs r4, r7 -; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; THUMBV6-NEXT: adds r7, r1, r0 ; THUMBV6-NEXT: ldr r0, [sp, #36] @ 4-byte Reload @@ -69,7 +69,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) 
unnamed_addr #0 { ; THUMBV6-NEXT: orrs r0, r4 ; THUMBV6-NEXT: str r0, [sp, #16] @ 4-byte Spill ; THUMBV6-NEXT: ldr r0, [sp, #92] -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: str r0, [sp, #56] @ 4-byte Spill ; THUMBV6-NEXT: ldr r7, [sp, #80] ; THUMBV6-NEXT: mov r1, r5 ; THUMBV6-NEXT: mov r2, r7 @@ -82,13 +82,13 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-NEXT: ldr r6, [sp, #84] ; THUMBV6-NEXT: mov r0, r6 ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r2, [sp, #48] @ 4-byte Reload +; THUMBV6-NEXT: ldr r2, [sp, #44] @ 4-byte Reload ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill ; THUMBV6-NEXT: subs r2, r1, #1 ; THUMBV6-NEXT: sbcs r1, r2 -; THUMBV6-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: ldr r3, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: subs r2, r3, #1 ; THUMBV6-NEXT: sbcs r3, r2 ; THUMBV6-NEXT: str r6, [sp, #8] @ 4-byte Spill @@ -103,7 +103,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill ; THUMBV6-NEXT: mov r0, r7 ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r4, [sp, #48] @ 4-byte Reload +; THUMBV6-NEXT: ldr r4, [sp, #44] @ 4-byte Reload ; THUMBV6-NEXT: mov r2, r4 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul @@ -133,7 +133,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; THUMBV6-NEXT: adcs r0, r1 ; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 ; THUMBV6-NEXT: mov r2, r4 ; THUMBV6-NEXT: mov r3, r5 @@ -142,9 +142,9 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; THUMBV6-NEXT: adds r6, r0, r1 ; THUMBV6-NEXT: adcs r4, r5 -; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: ldr r0, [sp, #48] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r2, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: adds r0, r0, r6 @@ -155,35 +155,34 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill ; THUMBV6-NEXT: mov r6, r5 ; THUMBV6-NEXT: adcs r6, r5 -; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r4, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r4 +; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: mov r4, r1 +; THUMBV6-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 ; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill -; THUMBV6-NEXT: adcs r1, r6 -; THUMBV6-NEXT: str r1, [sp, #24] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #48] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: adcs r4, r6 +; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: ldr r1, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: mov r2, r5 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: mov r6, r0 -; THUMBV6-NEXT: mov r4, r1 -; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload -; THUMBV6-NEXT: ldr r1, [sp, #56] @ 4-byte 
Reload +; THUMBV6-NEXT: str r1, [sp, #56] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #48] @ 4-byte Reload +; THUMBV6-NEXT: ldr r1, [sp, #52] @ 4-byte Reload ; THUMBV6-NEXT: mov r2, r5 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: adds r0, r0, r6 -; THUMBV6-NEXT: adcs r1, r4 +; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r2 ; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload ; THUMBV6-NEXT: adds r0, r2, r0 -; THUMBV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r2 +; THUMBV6-NEXT: adcs r1, r4 ; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload ; THUMBV6-NEXT: adds r0, r0, r2 ; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index 9933720953d33c..d2525479d45f2a 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -293,8 +293,8 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr ; CHECK-NEXT: mov lr, lr ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vdup.32 q2, r12 +; CHECK-NEXT: vdup.32 q3, r12 ; CHECK-NEXT: b .LBB2_3 ; CHECK-NEXT: .LBB2_2: @ %else24 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 @@ -306,15 +306,15 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr ; CHECK-NEXT: vcvtb.f32.f16 s20, s20 ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: vadd.f32 q5, q3, q5 +; CHECK-NEXT: vadd.f32 q5, q1, q5 ; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: bne .LBB2_3 ; CHECK-NEXT: b .LBB2_19 ; CHECK-NEXT: .LBB2_3: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vadd.i32 q4, q0, r3 -; CHECK-NEXT: vmov q3, q5 -; CHECK-NEXT: vcmp.u32 cs, q1, q4 +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vcmp.u32 cs, q2, q4 ; CHECK-NEXT: @ implicit-def: $q5 ; CHECK-NEXT: vmrs r4, p0 ; CHECK-NEXT: and r2, r4, #1 @@ -350,7 +350,7 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr ; CHECK-NEXT: vins.f16 s21, s22 ; CHECK-NEXT: .LBB2_8: @ %else11 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vcmp.u32 cs, q2, q4 +; CHECK-NEXT: vcmp.u32 cs, q3, q4 ; CHECK-NEXT: @ implicit-def: $q6 ; CHECK-NEXT: vmrs r4, p0 ; CHECK-NEXT: and r2, r4, #1 @@ -426,7 +426,7 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr ; CHECK-NEXT: .LBB2_19: @ %middle.block ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: vpsel q0, q5, q3 +; CHECK-NEXT: vpsel q0, q5, q1 ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll index 421b5b5364d354..01411c896243df 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll @@ -4,26 +4,23 @@ define i32 @test(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { ; CHECK-LABEL: test: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: bxlt lr -; CHECK-NEXT: .LBB0_1: @ %for.body.preheader ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov lr, r0 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: .LBB0_2: @ %for.body +; CHECK-NEXT: 
mov.w lr, #0 +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: blt .LBB0_2 +; CHECK-NEXT: .LBB0_1: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrh r3, [r1], #2 ; CHECK-NEXT: subs r2, #1 -; CHECK-NEXT: ldrh r12, [lr], #2 +; CHECK-NEXT: ldrh r12, [r0], #2 ; CHECK-NEXT: @APP ; CHECK-NEXT: add r3, r12 ; CHECK-NEXT: @NO_APP -; CHECK-NEXT: add r0, r3 -; CHECK-NEXT: bne .LBB0_2 -; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: add lr, r3 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup +; CHECK-NEXT: mov r0, lr ; CHECK-NEXT: pop {r7, pc} entry: %cmp9 = icmp sgt i32 %n, 0 @@ -57,20 +54,21 @@ define i32 @testlr(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) ; CHECK-NEXT: .LBB1_1: @ %for.body.preheader ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB1_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrh r4, [r1], #2 ; CHECK-NEXT: subs r2, #1 -; CHECK-NEXT: ldrh r12, [r3], #2 +; CHECK-NEXT: ldrh r12, [r0], #2 ; CHECK-NEXT: @APP ; CHECK-NEXT: add r4, r12 ; CHECK-NEXT: @NO_APP -; CHECK-NEXT: add r0, r4 +; CHECK-NEXT: add r3, r4 ; CHECK-NEXT: bne .LBB1_2 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop.w {r4, lr} +; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: bx lr entry: %cmp9 = icmp sgt i32 %n, 0 br i1 %cmp9, label %for.body, label %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index 44cbd7d65125ea..b86f56e2fe5308 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -7,11 +7,11 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: ldr.w r12, [r0] -; CHECK-NEXT: subs.w r9, r1, #1 +; CHECK-NEXT: subs.w r10, r1, #1 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader ; CHECK-NEXT: subs r7, r1, #2 -; CHECK-NEXT: and r8, r9, #3 +; CHECK-NEXT: and r8, r10, #3 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB0_4 ; CHECK-NEXT: @ %bb.2: @@ -21,7 +21,7 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_4: @ %while.body.preheader.new -; CHECK-NEXT: bic r7, r9, #3 +; CHECK-NEXT: bic r7, r10, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 @@ -29,33 +29,32 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK-NEXT: movs r7, #4 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r10, [r0, #16]! -; CHECK-NEXT: sub.w r9, r9, #4 -; CHECK-NEXT: ldrd r5, r4, [r0, #-12] -; CHECK-NEXT: ldr r11, [r0, #-4] -; CHECK-NEXT: cmp r12, r5 +; CHECK-NEXT: ldr r9, [r0, #16]! 
+; CHECK-NEXT: sub.w r10, r10, #4 +; CHECK-NEXT: ldmdb r0, {r4, r5, r11} +; CHECK-NEXT: cmp r12, r4 ; CHECK-NEXT: it gt ; CHECK-NEXT: subgt r6, r7, #3 -; CHECK-NEXT: csel r5, r5, r12, gt -; CHECK-NEXT: cmp r5, r4 +; CHECK-NEXT: csel r4, r4, r12, gt +; CHECK-NEXT: cmp r4, r5 ; CHECK-NEXT: it gt ; CHECK-NEXT: subgt r6, r7, #2 -; CHECK-NEXT: csel r5, r4, r5, gt +; CHECK-NEXT: csel r5, r5, r4, gt ; CHECK-NEXT: cmp r5, r11 ; CHECK-NEXT: it gt ; CHECK-NEXT: subgt r6, r7, #1 ; CHECK-NEXT: csel r5, r11, r5, gt -; CHECK-NEXT: cmp r5, r10 +; CHECK-NEXT: cmp r5, r9 ; CHECK-NEXT: csel r6, r7, r6, gt ; CHECK-NEXT: add.w r7, r7, #4 -; CHECK-NEXT: csel r12, r10, r5, gt +; CHECK-NEXT: csel r12, r9, r5, gt ; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: .LBB0_6: @ %while.end.loopexit.unr-lcssa ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.7: @ %while.body.epil ; CHECK-NEXT: ldr r7, [r0, #4] -; CHECK-NEXT: sub.w r1, r1, r9 +; CHECK-NEXT: sub.w r1, r1, r10 ; CHECK-NEXT: cmp r12, r7 ; CHECK-NEXT: csel r6, r1, r6, gt ; CHECK-NEXT: csel r12, r7, r12, gt diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index cc6d0925d18031..1e5e703561cd35 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -33,15 +33,15 @@ define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: beq .LBB0_11 ; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: add.w r8, r7, r3 +; CHECK-NEXT: adds r4, r7, r3 ; CHECK-NEXT: and r5, r3, #3 ; CHECK-NEXT: wls lr, r5, .LBB0_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader -; CHECK-NEXT: add.w r4, r12, r5 +; CHECK-NEXT: add.w r8, r12, r5 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2 ; CHECK-NEXT: add.w r6, r1, r12, lsl #2 ; CHECK-NEXT: add.w r7, r2, r12, lsl #2 -; CHECK-NEXT: mov r12, r4 +; CHECK-NEXT: mov r12, r8 ; CHECK-NEXT: .LBB0_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldmia r6!, {s0} @@ -50,7 +50,7 @@ define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: vstmia r7!, {s0} ; CHECK-NEXT: le lr, .LBB0_6 ; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp.w r8, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1 ; CHECK-NEXT: sub.w r3, r3, r12 @@ -246,15 +246,15 @@ define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: beq .LBB1_11 ; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: add.w r8, r7, r3 +; CHECK-NEXT: adds r4, r7, r3 ; CHECK-NEXT: and r5, r3, #3 ; CHECK-NEXT: wls lr, r5, .LBB1_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader -; CHECK-NEXT: add.w r4, r12, r5 +; CHECK-NEXT: add.w r8, r12, r5 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2 ; CHECK-NEXT: add.w r6, r1, r12, lsl #2 ; CHECK-NEXT: add.w r7, r2, r12, lsl #2 -; CHECK-NEXT: mov r12, r4 +; CHECK-NEXT: mov r12, r8 ; CHECK-NEXT: .LBB1_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldmia r6!, {s0} @@ -263,7 +263,7 @@ define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: vstmia r7!, {s0} ; CHECK-NEXT: le lr, .LBB1_6 ; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp.w r8, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB1_10 ; CHECK-NEXT: @ %bb.8: @ 
%for.body.preheader1 ; CHECK-NEXT: sub.w r3, r3, r12 @@ -459,15 +459,15 @@ define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: beq .LBB2_11 ; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: add.w r8, r7, r3 +; CHECK-NEXT: adds r4, r7, r3 ; CHECK-NEXT: and r5, r3, #3 ; CHECK-NEXT: wls lr, r5, .LBB2_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader -; CHECK-NEXT: add.w r4, r12, r5 +; CHECK-NEXT: add.w r8, r12, r5 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2 ; CHECK-NEXT: add.w r6, r1, r12, lsl #2 ; CHECK-NEXT: add.w r7, r2, r12, lsl #2 -; CHECK-NEXT: mov r12, r4 +; CHECK-NEXT: mov r12, r8 ; CHECK-NEXT: .LBB2_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldmia r6!, {s0} @@ -476,7 +476,7 @@ define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: vstmia r7!, {s0} ; CHECK-NEXT: le lr, .LBB2_6 ; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp.w r8, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB2_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1 ; CHECK-NEXT: sub.w r3, r3, r12 @@ -647,7 +647,7 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt ; CHECK-NEXT: it eq ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB3_1: @ %for.body.preheader -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bls .LBB3_6 ; CHECK-NEXT: @ %bb.2: @ %vector.memcheck @@ -681,15 +681,15 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: add.w r8, r7, r3 +; CHECK-NEXT: add.w r9, r7, r3 ; CHECK-NEXT: and r5, r3, #3 ; CHECK-NEXT: wls lr, r5, .LBB3_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader -; CHECK-NEXT: add.w r4, r12, r5 +; CHECK-NEXT: add.w r8, r12, r5 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2 ; CHECK-NEXT: add.w r6, r1, r12, lsl #2 ; CHECK-NEXT: add.w r7, r2, r12, lsl #2 -; CHECK-NEXT: mov r12, r4 +; CHECK-NEXT: mov r12, r8 ; CHECK-NEXT: .LBB3_9: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r4, [r6], #4 @@ -700,7 +700,7 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt ; CHECK-NEXT: vstmia r7!, {s0} ; CHECK-NEXT: le lr, .LBB3_9 ; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp.w r8, #3 +; CHECK-NEXT: cmp.w r9, #3 ; CHECK-NEXT: blo .LBB3_13 ; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2 @@ -737,7 +737,7 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt ; CHECK-NEXT: vstr s0, [r6, #12] ; CHECK-NEXT: bne .LBB3_12 ; CHECK-NEXT: .LBB3_13: -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: bx lr entry: %cmp8 = icmp eq i32 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index d41d7d2c1a51de..bd28b92ab9c223 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -328,14 +328,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-LABEL: test_vec_mul_scalar_add_char: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, 
lr} -; CHECK-NEXT: ldr r4, [sp, #28] -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: adds r7, r1, r4 -; CHECK-NEXT: add.w r6, r3, r4, lsl #2 +; CHECK-NEXT: add.w r7, r1, r12 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: cmp r7, r3 -; CHECK-NEXT: add.w r5, r0, r4 +; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: csel r7, zr, r7, ls @@ -348,15 +348,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: cmpeq r7, #0 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r7, r4, #1 -; CHECK-NEXT: and r12, r4, #3 +; CHECK-NEXT: sub.w r7, r12, #1 +; CHECK-NEXT: and r8, r12, #3 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB5_8 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: dlstp.32 lr, r4 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 @@ -366,18 +366,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r5, r3, #8 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: adds r6, r0, #3 ; CHECK-NEXT: adds r7, r1, #1 ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r9, [r6, #-3] -; CHECK-NEXT: add.w r8, r8, #4 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb r4, [r7, #-1] ; CHECK-NEXT: smlabb r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #-8] @@ -396,11 +396,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r12, .LBB5_11 +; CHECK-NEXT: wls lr, r8, .LBB5_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: add r0, r8 -; CHECK-NEXT: add r1, r8 -; CHECK-NEXT: add.w r3, r3, r8, lsl #2 +; CHECK-NEXT: add r0, r12 +; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsl #2 ; CHECK-NEXT: .LBB5_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 @@ -604,14 +604,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-LABEL: test_vec_mul_scalar_add_uchar: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr r4, [sp, #28] -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB7_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: adds r7, r1, r4 -; CHECK-NEXT: add.w r6, r3, r4, lsl #2 +; CHECK-NEXT: add.w r7, r1, r12 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: cmp r7, r3 -; CHECK-NEXT: add.w r5, r0, r4 +; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: csel r7, zr, r7, ls @@ -624,15 +624,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: cmpeq r7, #0 ; CHECK-NEXT: 
beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r7, r4, #1 -; CHECK-NEXT: and r12, r4, #3 +; CHECK-NEXT: sub.w r7, r12, #1 +; CHECK-NEXT: and r8, r12, #3 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB7_8 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: dlstp.32 lr, r4 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 @@ -642,18 +642,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r5, r3, #8 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: adds r6, r0, #3 ; CHECK-NEXT: adds r7, r1, #1 ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r9, [r6, #-3] -; CHECK-NEXT: add.w r8, r8, #4 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb r4, [r7, #-1] ; CHECK-NEXT: smlabb r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #-8] @@ -672,11 +672,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r12, .LBB7_11 +; CHECK-NEXT: wls lr, r8, .LBB7_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: add r0, r8 -; CHECK-NEXT: add r1, r8 -; CHECK-NEXT: add.w r3, r3, r8, lsl #2 +; CHECK-NEXT: add r0, r12 +; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsl #2 ; CHECK-NEXT: .LBB7_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 @@ -880,14 +880,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly ; CHECK-LABEL: test_vec_mul_scalar_add_int: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr r4, [sp, #28] -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB9_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck -; CHECK-NEXT: add.w r7, r1, r4, lsl #2 -; CHECK-NEXT: add.w r6, r3, r4, lsl #2 +; CHECK-NEXT: add.w r7, r1, r12, lsl #2 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: cmp r7, r3 -; CHECK-NEXT: add.w r5, r0, r4, lsl #2 +; CHECK-NEXT: add.w r5, r0, r12, lsl #2 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: csel r7, zr, r7, ls @@ -900,15 +900,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly ; CHECK-NEXT: cmpeq r7, #0 ; CHECK-NEXT: beq .LBB9_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r7, r4, #1 -; CHECK-NEXT: and r12, r4, #3 +; CHECK-NEXT: sub.w r7, r12, #1 +; CHECK-NEXT: and r8, r12, #3 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB9_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph -; CHECK-NEXT: dlstp.32 lr, r4 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -918,18 +918,18 @@ define arm_aapcs_vfpcc void 
@test_vec_mul_scalar_add_int(i32* nocapture readonly ; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r5, r3, #8 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: add.w r6, r0, #8 ; CHECK-NEXT: add.w r7, r1, #8 ; CHECK-NEXT: .LBB9_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r9, [r6, #-8] -; CHECK-NEXT: add.w r8, r8, #4 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldr r4, [r7, #-8] ; CHECK-NEXT: mla r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #-8] @@ -950,11 +950,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: le lr, .LBB9_7 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r12, .LBB9_11 +; CHECK-NEXT: wls lr, r8, .LBB9_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: add.w r0, r0, r8, lsl #2 -; CHECK-NEXT: add.w r1, r1, r8, lsl #2 -; CHECK-NEXT: add.w r3, r3, r8, lsl #2 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 +; CHECK-NEXT: add.w r1, r1, r12, lsl #2 +; CHECK-NEXT: add.w r3, r3, r12, lsl #2 ; CHECK-NEXT: .LBB9_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r7, [r0], #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index 93119eac2d564c..9697229209687b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -418,7 +418,7 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read ; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: subs r7, r3, #4 @@ -430,16 +430,16 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u32 q1, [r4], #4 +; CHECK-NEXT: vldrbt.u32 q0, [r4], #4 ; CHECK-NEXT: vldrbt.u32 q2, [r5], #4 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vmul.i32 q0, q2, q0 +; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: cbz r2, .LBB6_7 ; CHECK-NEXT: @ %bb.4: @ %vector.ph47 @@ -567,15 +567,15 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur ; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q2, q3, q2 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vaddv.u16 r4, q2 -; CHECK-NEXT: vaddv.u16 r2, q0 +; CHECK-NEXT: vaddv.u16 r2, q2 +; CHECK-NEXT: vaddv.u16 r4, q0 ; CHECK-NEXT: b .LBB7_5 ; CHECK-NEXT: .LBB7_4: -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup -; CHECK-NEXT: strb r2, [r0] -; CHECK-NEXT: strb r4, [r1] +; CHECK-NEXT: strb r4, [r0] +; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vpop {d8, d9} ; 
CHECK-NEXT: pop {r4, pc} entry: @@ -633,35 +633,35 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) { ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: movw r12, #47184 -; CHECK-NEXT: movw r3, #23593 ; CHECK-NEXT: ldrd r2, lr, [r1, #4] +; CHECK-NEXT: movw r1, #23593 ; CHECK-NEXT: movt r12, #1310 -; CHECK-NEXT: movt r3, #49807 -; CHECK-NEXT: mla r3, lr, r3, r12 -; CHECK-NEXT: movw r1, #55051 +; CHECK-NEXT: movt r1, #49807 +; CHECK-NEXT: mla r1, lr, r1, r12 +; CHECK-NEXT: movw r3, #55051 ; CHECK-NEXT: movw r4, #23593 -; CHECK-NEXT: movt r1, #163 +; CHECK-NEXT: movt r3, #163 ; CHECK-NEXT: ldr r0, [r0] ; CHECK-NEXT: movt r4, #655 -; CHECK-NEXT: ror.w r12, r3, #4 -; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: ror.w r3, r3, #2 +; CHECK-NEXT: ror.w r12, r1, #4 +; CHECK-NEXT: cmp r12, r3 +; CHECK-NEXT: cset r3, lo +; CHECK-NEXT: ror.w r1, r1, #2 ; CHECK-NEXT: mov.w r12, #1 -; CHECK-NEXT: cmp r3, r4 -; CHECK-NEXT: csel r3, r1, r12, lo +; CHECK-NEXT: cmp r1, r4 +; CHECK-NEXT: csel r1, r3, r12, lo ; CHECK-NEXT: lsls.w r4, lr, #30 -; CHECK-NEXT: csel r1, r1, r3, ne +; CHECK-NEXT: csel r3, r3, r1, ne ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph -; CHECK-NEXT: movw r3, :lower16:days +; CHECK-NEXT: movw r1, :lower16:days ; CHECK-NEXT: movs r4, #52 -; CHECK-NEXT: movt r3, :upper16:days -; CHECK-NEXT: smlabb r1, r1, r4, r3 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: movt r1, :upper16:days +; CHECK-NEXT: smlabb r3, r3, r4, r1 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: adds r0, r2, #3 ; CHECK-NEXT: bic r0, r0, #3 @@ -673,7 +673,7 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) { ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r1], #16 +; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB8_2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll index a0e690212d5a43..7acc83343dcb8b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll @@ -17,18 +17,18 @@ define dso_local void @check_option(ptr noalias nocapture %A, ptr noalias nocapt ; ENABLED-NEXT: .LBB0_2: @ %vector.ph ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_3 Depth 2 -; ENABLED-NEXT: mov r12, r0 -; ENABLED-NEXT: mov r4, r2 -; ENABLED-NEXT: mov r5, r1 -; ENABLED-NEXT: mov r6, r3 -; ENABLED-NEXT: dlstp.32 lr, r6 +; ENABLED-NEXT: mov r4, r0 +; ENABLED-NEXT: mov r5, r2 +; ENABLED-NEXT: mov r6, r1 +; ENABLED-NEXT: mov r7, r3 +; ENABLED-NEXT: dlstp.32 lr, r7 ; ENABLED-NEXT: .LBB0_3: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_2 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; ENABLED-NEXT: vldrw.u32 q0, [r5], #16 -; ENABLED-NEXT: vldrw.u32 q1, [r4], #16 +; ENABLED-NEXT: vldrw.u32 q0, [r6], #16 +; ENABLED-NEXT: vldrw.u32 q1, [r5], #16 ; ENABLED-NEXT: vadd.i32 q0, q1, q0 -; ENABLED-NEXT: vstrw.32 q0, [r12], #16 +; ENABLED-NEXT: vstrw.32 q0, [r4], #16 ; ENABLED-NEXT: letp lr, .LBB0_3 ; ENABLED-NEXT: b .LBB0_2 ; ENABLED-NEXT: .LBB0_4: @ %for.cond.cleanup @@ -44,29 +44,29 @@ define dso_local void @check_option(ptr noalias nocapture %A, ptr noalias 
nocapt ; DISABLED-NEXT: movs r6, #1 ; DISABLED-NEXT: bic r7, r7, #3 ; DISABLED-NEXT: subs r7, #4 -; DISABLED-NEXT: add.w r8, r6, r7, lsr #2 +; DISABLED-NEXT: add.w r12, r6, r7, lsr #2 ; DISABLED-NEXT: .LBB0_2: @ %vector.ph ; DISABLED-NEXT: @ =>This Loop Header: Depth=1 ; DISABLED-NEXT: @ Child Loop BB0_3 Depth 2 -; DISABLED-NEXT: mov r7, r8 -; DISABLED-NEXT: mov r12, r0 -; DISABLED-NEXT: mov r4, r2 -; DISABLED-NEXT: mov r5, r1 -; DISABLED-NEXT: mov r6, r3 -; DISABLED-NEXT: dls lr, r8 +; DISABLED-NEXT: mov r8, r12 +; DISABLED-NEXT: mov r4, r0 +; DISABLED-NEXT: mov r5, r2 +; DISABLED-NEXT: mov r6, r1 +; DISABLED-NEXT: mov r7, r3 +; DISABLED-NEXT: dls lr, r12 ; DISABLED-NEXT: .LBB0_3: @ %vector.body ; DISABLED-NEXT: @ Parent Loop BB0_2 Depth=1 ; DISABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; DISABLED-NEXT: vctp.32 r6 -; DISABLED-NEXT: mov lr, r7 +; DISABLED-NEXT: vctp.32 r7 +; DISABLED-NEXT: mov lr, r8 ; DISABLED-NEXT: vpstt -; DISABLED-NEXT: vldrwt.u32 q0, [r5], #16 -; DISABLED-NEXT: vldrwt.u32 q1, [r4], #16 -; DISABLED-NEXT: subs r7, #1 -; DISABLED-NEXT: subs r6, #4 +; DISABLED-NEXT: vldrwt.u32 q0, [r6], #16 +; DISABLED-NEXT: vldrwt.u32 q1, [r5], #16 +; DISABLED-NEXT: sub.w r8, r8, #1 +; DISABLED-NEXT: subs r7, #4 ; DISABLED-NEXT: vadd.i32 q0, q1, q0 ; DISABLED-NEXT: vpst -; DISABLED-NEXT: vstrwt.32 q0, [r12], #16 +; DISABLED-NEXT: vstrwt.32 q0, [r4], #16 ; DISABLED-NEXT: le lr, .LBB0_3 ; DISABLED-NEXT: b .LBB0_2 ; DISABLED-NEXT: .LBB0_4: @ %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 07c06e10979cde..24ef4fd3bbc5d2 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -17,19 +17,19 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr { ; ENABLED-LABEL: varying_outer_2d_reduction: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; ENABLED-NEXT: sub sp, #4 ; ENABLED-NEXT: cmp r3, #1 -; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill -; ENABLED-NEXT: blt .LBB0_8 -; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph +; ENABLED-NEXT: it lt +; ENABLED-NEXT: bxlt lr +; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; ENABLED-NEXT: mov r11, r0 ; ENABLED-NEXT: ldr r0, [sp, #36] ; ENABLED-NEXT: add.w r12, r2, #3 -; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload ; ENABLED-NEXT: mov.w r8, #0 -; ENABLED-NEXT: mov r9, r12 +; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 +; ENABLED-NEXT: mov r9, r12 ; ENABLED-NEXT: b .LBB0_4 ; ENABLED-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: movs r0, #0 @@ -61,7 +61,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 ; ENABLED-NEXT: dls lr, r0 -; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload +; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 @@ -82,25 +82,25 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: vpsel q0, q1, q0 ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 -; 
ENABLED-NEXT: .LBB0_8: @ %for.end17 -; ENABLED-NEXT: add sp, #4 -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; ENABLED-NEXT: .LBB0_8: +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: ; NOREDUCTIONS: @ %bb.0: @ %entry -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; NOREDUCTIONS-NEXT: sub sp, #4 ; NOREDUCTIONS-NEXT: cmp r3, #1 -; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill -; NOREDUCTIONS-NEXT: blt .LBB0_8 -; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph +; NOREDUCTIONS-NEXT: it lt +; NOREDUCTIONS-NEXT: bxlt lr +; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: mov r11, r0 ; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] ; NOREDUCTIONS-NEXT: add.w r12, r2, #3 -; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload ; NOREDUCTIONS-NEXT: mov.w r8, #0 -; NOREDUCTIONS-NEXT: mov r9, r12 +; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 +; NOREDUCTIONS-NEXT: mov r9, r12 ; NOREDUCTIONS-NEXT: b .LBB0_4 ; NOREDUCTIONS-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: movs r0, #0 @@ -132,7 +132,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 ; NOREDUCTIONS-NEXT: dls lr, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 @@ -153,9 +153,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: vpsel q0, q1, q0 ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 -; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 -; NOREDUCTIONS-NEXT: add sp, #4 -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; NOREDUCTIONS-NEXT: .LBB0_8: +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll index 015af0b4097770..793f75fb176aa8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -79,21 +79,21 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: adds r1, r2, #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r1, r1, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w r1, r3, r1, lsr #2 ; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -139,21 +139,21 @@ define dso_local i32 @add_reduce_add_const(i32* noalias 
nocapture readonly %a, i ; CHECK-NEXT: adds r1, r2, #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r1, r1, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w r1, r3, r1, lsr #2 ; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index cbcbf1f392ce8b..23a8cd85cb770f 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -161,81 +161,83 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" { ; CHECK-NEXT: add r7, sp, #12 ; CHECK-NEXT: .save {r8, r9, r10, r11} ; CHECK-NEXT: push.w {r8, r9, r10, r11} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: adds r6, r3, #4 -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: mvn r8, #1 -; CHECK-NEXT: @ implicit-def: $r9 -; CHECK-NEXT: @ implicit-def: $r4 +; CHECK-NEXT: adds r4, r3, #4 +; CHECK-NEXT: add.w r12, r0, #4 +; CHECK-NEXT: mvn r11, #1 +; CHECK-NEXT: @ implicit-def: $r1 +; CHECK-NEXT: @ implicit-def: $r8 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: asrs r2, r4, #31 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: muls r1, r3, r1 -; CHECK-NEXT: adds r4, r4, r1 -; CHECK-NEXT: adc.w r1, r2, r1, asr #31 -; CHECK-NEXT: adds.w r2, r4, #-2147483648 -; CHECK-NEXT: ldrd r2, r4, [r8] -; CHECK-NEXT: adc r5, r1, #0 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: smull r4, r2, r4, r9 -; CHECK-NEXT: asrs r1, r5, #31 -; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r1, r2 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds.w r10, r4, #-2147483648 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: ldr r4, [r2, #-4] -; CHECK-NEXT: muls r4, r3, r4 -; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: adds.w r12, r4, #-2147483648 -; CHECK-NEXT: asr.w r5, r4, #31 -; CHECK-NEXT: ldr r4, [r6] +; CHECK-NEXT: ldr.w r5, [r12] +; CHECK-NEXT: asr.w r6, r8, #31 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: muls r5, r3, r5 +; CHECK-NEXT: adds.w r8, r8, r5 +; CHECK-NEXT: adc.w r5, r6, r5, asr #31 +; CHECK-NEXT: adds.w r6, r8, #-2147483648 +; CHECK-NEXT: adc r2, r5, #0 +; CHECK-NEXT: ldrd r5, r6, [r11] +; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: asr.w r8, r2, #31 +; CHECK-NEXT: strd r1, r5, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: smull r6, r1, r6, r1 +; CHECK-NEXT: subs r6, r2, r6 +; CHECK-NEXT: sbc.w r5, r8, r1 +; CHECK-NEXT: ldr r1, [r12, #-4] +; CHECK-NEXT: adds.w r6, r6, 
#-2147483648 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: adc r5, r5, #0 -; CHECK-NEXT: mul r2, r4, r0 +; CHECK-NEXT: muls r1, r3, r1 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: adds.w r10, r1, #-2147483648 +; CHECK-NEXT: asr.w r2, r1, #31 +; CHECK-NEXT: adc r1, r2, #0 +; CHECK-NEXT: ldr r2, [r4] +; CHECK-NEXT: mul r8, r2, r0 +; CHECK-NEXT: add.w r4, r8, #-2147483648 +; CHECK-NEXT: asrl r10, r1, r4 +; CHECK-NEXT: smull r4, r1, r2, r10 +; CHECK-NEXT: lsll r4, r1, #30 +; CHECK-NEXT: asr.w r9, r1, #31 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: ldrd r0, r1, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: lsll r4, r9, r2 +; CHECK-NEXT: lsrl r4, r9, #2 +; CHECK-NEXT: muls r1, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: add.w r2, r2, #-2147483648 -; CHECK-NEXT: asrl r12, r5, r2 -; CHECK-NEXT: smull r2, r5, r4, r12 -; CHECK-NEXT: lsll r2, r5, #30 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: asr.w r11, r5, #31 -; CHECK-NEXT: mov r12, r5 -; CHECK-NEXT: lsll r12, r11, r4 -; CHECK-NEXT: mul r2, r2, r9 -; CHECK-NEXT: lsrl r12, r11, #2 -; CHECK-NEXT: adds r2, #2 -; CHECK-NEXT: lsll r12, r11, r2 -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r12, #-2147483648 -; CHECK-NEXT: asrl r10, r1, r5 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: lsrl r10, r1, #2 +; CHECK-NEXT: adds r1, #2 +; CHECK-NEXT: lsll r4, r9, r1 +; CHECK-NEXT: add.w r1, r4, #-2147483648 +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: asrl r6, r5, r1 ; CHECK-NEXT: movs r1, #2 -; CHECK-NEXT: mov r9, r10 -; CHECK-NEXT: str.w r10, [r1] -; CHECK-NEXT: ldr r1, [r8], #-4 -; CHECK-NEXT: mls r5, r1, r4, r5 -; CHECK-NEXT: adds.w r4, r5, #-2147483648 -; CHECK-NEXT: asr.w r1, r5, #31 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: lsrl r4, r1, #2 -; CHECK-NEXT: rsbs r1, r4, #0 +; CHECK-NEXT: lsrl r6, r5, #2 +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: str r6, [r1] +; CHECK-NEXT: ldr r1, [r11], #-4 +; CHECK-NEXT: adds r3, #4 +; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mls r1, r1, r2, r5 +; CHECK-NEXT: adds.w r8, r1, #-2147483648 +; CHECK-NEXT: asr.w r2, r1, #31 +; CHECK-NEXT: adc r1, r2, #0 +; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: lsrl r8, r1, #2 +; CHECK-NEXT: rsb.w r1, r8, #0 ; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: str r1, [r6, #-4] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: str r1, [r4, #-4] +; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r8, r9, r10, r11} ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll index 3ce79225cd5e61..df060b0475fecc 100644 --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -150,33 +150,33 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-NEXT: adr.w r12, .LCPI3_0 ; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vqadd.u32 q0, q0, r0 ; CHECK-NEXT: vcmp.u32 hi, q1, q0 -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r1, r12, d8 -; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov.16 q0[1], r12 -; CHECK-NEXT: vmov r1, r12, d9 -; CHECK-NEXT: 
vmov.16 q0[2], r1 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q3, q2, q0 +; CHECK-NEXT: vmov r1, r12, d6 +; CHECK-NEXT: vmov.16 q4[0], r1 +; CHECK-NEXT: vmov.16 q4[1], r12 +; CHECK-NEXT: vmov r1, r12, d7 +; CHECK-NEXT: vmov.16 q4[2], r1 ; CHECK-NEXT: adr r1, .LCPI3_1 -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vqadd.u32 q4, q4, r0 -; CHECK-NEXT: vcmp.u32 hi, q1, q4 -; CHECK-NEXT: vpsel q1, q3, q2 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vmov.16 q4[3], r12 +; CHECK-NEXT: vqadd.u32 q3, q3, r0 +; CHECK-NEXT: vcmp.u32 hi, q1, q3 +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov.16 q4[5], r1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov.16 q4[6], r0 ; CHECK-NEXT: add r0, sp, #24 -; CHECK-NEXT: vmov.16 q0[7], r1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vmov.16 q4[7], r1 ; CHECK-NEXT: vldr d1, [sp, #16] -; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 @@ -210,25 +210,25 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: vqadd.u32 q0, q0, r0 ; CHECK-NEXT: vcmp.u32 hi, q3, q0 ; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q4, q1, q0 -; CHECK-NEXT: vmov r1, r12, d8 -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.16 q2[1], r12 -; CHECK-NEXT: vmov r1, r12, d9 -; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vpsel q2, q1, q0 +; CHECK-NEXT: vmov r1, r12, d4 +; CHECK-NEXT: vmov.16 q4[0], r1 +; CHECK-NEXT: vmov.16 q4[1], r12 +; CHECK-NEXT: vmov r1, r12, d5 +; CHECK-NEXT: vmov.16 q4[2], r1 ; CHECK-NEXT: adr r1, .LCPI4_1 -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: vmov.16 q2[3], r12 -; CHECK-NEXT: vqadd.u32 q4, q4, r0 -; CHECK-NEXT: vcmp.u32 hi, q3, q4 -; CHECK-NEXT: vpsel q4, q1, q0 -; CHECK-NEXT: vmov r1, r12, d8 -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.16 q2[5], r12 -; CHECK-NEXT: vmov r1, r12, d9 -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.16 q2[7], r12 -; CHECK-NEXT: vcmp.i16 ne, q2, zr +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmov.16 q4[3], r12 +; CHECK-NEXT: vqadd.u32 q2, q2, r0 +; CHECK-NEXT: vcmp.u32 hi, q3, q2 +; CHECK-NEXT: vpsel q2, q1, q0 +; CHECK-NEXT: vmov r1, r12, d4 +; CHECK-NEXT: vmov.16 q4[4], r1 +; CHECK-NEXT: vmov.16 q4[5], r12 +; CHECK-NEXT: vmov r1, r12, d5 +; CHECK-NEXT: vmov.16 q4[6], r1 +; CHECK-NEXT: vmov.16 q4[7], r12 +; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q4, q1, q0 ; CHECK-NEXT: vmov.u16 r1, q4[0] ; CHECK-NEXT: vmov.8 q2[0], r1 @@ -250,25 +250,25 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: vldrw.u32 q4, [r1] ; CHECK-NEXT: vqadd.u32 q4, q4, r0 ; CHECK-NEXT: vcmp.u32 hi, q3, q4 -; CHECK-NEXT: vpsel q5, q1, q0 -; CHECK-NEXT: vmov r1, r12, d10 -; CHECK-NEXT: vmov.16 q4[0], r1 -; CHECK-NEXT: vmov.16 q4[1], r12 -; CHECK-NEXT: vmov r1, r12, d11 -; CHECK-NEXT: vmov.16 q4[2], r1 +; CHECK-NEXT: vpsel q4, q1, q0 +; CHECK-NEXT: vmov r1, r12, d8 +; CHECK-NEXT: vmov.16 q5[0], r1 +; CHECK-NEXT: vmov.16 q5[1], r12 +; CHECK-NEXT: vmov r1, r12, d9 +; CHECK-NEXT: vmov.16 q5[2], r1 ; CHECK-NEXT: adr r1, .LCPI4_3 -; CHECK-NEXT: vldrw.u32 q5, [r1] -; 
CHECK-NEXT: vmov.16 q4[3], r12 -; CHECK-NEXT: vqadd.u32 q5, q5, r0 -; CHECK-NEXT: vcmp.u32 hi, q3, q5 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vmov.16 q5[3], r12 +; CHECK-NEXT: vqadd.u32 q4, q4, r0 +; CHECK-NEXT: vcmp.u32 hi, q3, q4 ; CHECK-NEXT: vpsel q3, q1, q0 ; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.16 q4[5], r1 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.16 q5[5], r1 ; CHECK-NEXT: vmov r0, r1, d7 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.16 q4[7], r1 -; CHECK-NEXT: vcmp.i16 ne, q4, zr +; CHECK-NEXT: vmov.16 q5[6], r0 +; CHECK-NEXT: vmov.16 q5[7], r1 +; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov.u16 r0, q0[0] ; CHECK-NEXT: vmov.8 q2[8], r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll index 39bf97d880ea3f..89de92dab7faf6 100644 --- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -55,27 +55,27 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) { ; CHECK-NEXT: @ Parent Loop BB0_4 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 ; CHECK-NEXT: str r2, [r4, #-4] -; CHECK-NEXT: ldrb r6, [r3, #-1] -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: ldrb r7, [r3, #-1] +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: ite ne ; CHECK-NEXT: sxthne.w r9, r1 ; CHECK-NEXT: moveq.w r9, #0 -; CHECK-NEXT: add.w r6, r2, #396 +; CHECK-NEXT: add.w r7, r2, #396 ; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: str r6, [r4] -; CHECK-NEXT: cset r6, ne -; CHECK-NEXT: strb r6, [r5] +; CHECK-NEXT: str r7, [r4] +; CHECK-NEXT: cset r7, ne +; CHECK-NEXT: strb r7, [r5] ; CHECK-NEXT: add.w r2, r2, #792 -; CHECK-NEXT: ldrb r6, [r3] +; CHECK-NEXT: ldrb r7, [r3] ; CHECK-NEXT: adds r4, #8 ; CHECK-NEXT: adds r3, #2 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: ite ne -; CHECK-NEXT: sxthne r6, r1 -; CHECK-NEXT: moveq r6, #0 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: cset r6, ne -; CHECK-NEXT: strb r6, [r5] +; CHECK-NEXT: sxthne r7, r1 +; CHECK-NEXT: moveq r7, #0 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cset r7, ne +; CHECK-NEXT: strb r7, [r5] ; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: @ %bb.6: @ %for.cond.cleanup9 ; CHECK-NEXT: @ in Loop: Header=BB0_4 Depth=2 @@ -89,15 +89,15 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) { ; CHECK-NEXT: @ Parent Loop BB0_4 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 ; CHECK-NEXT: str r2, [r4, #-4] -; CHECK-NEXT: add.w r7, r2, #396 -; CHECK-NEXT: ldrb r6, [r3, #-1] +; CHECK-NEXT: add.w r6, r2, #396 +; CHECK-NEXT: ldrb r7, [r3, #-1] ; CHECK-NEXT: add.w r2, r2, #792 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: ite ne -; CHECK-NEXT: sxthne r6, r1 -; CHECK-NEXT: moveq r6, #0 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: str r7, [r4] +; CHECK-NEXT: sxthne r7, r1 +; CHECK-NEXT: moveq r7, #0 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: str r6, [r4] ; CHECK-NEXT: cset r6, ne ; CHECK-NEXT: adds r4, #8 ; CHECK-NEXT: strb r6, [r5] @@ -105,9 +105,9 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) { ; CHECK-NEXT: adds r3, #2 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: ite ne -; CHECK-NEXT: sxthne r6, r1 -; CHECK-NEXT: moveq r6, #0 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: sxthne r7, r1 +; CHECK-NEXT: moveq r7, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: cset r6, ne ; CHECK-NEXT: strb r6, [r5] ; CHECK-NEXT: le lr, .LBB0_7 @@ -117,14 +117,15 @@ define i32 @test(i8 zeroext %var_2, i16 
signext %var_15, ptr %arr_60) { ; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r7, r2 ; CHECK-NEXT: .LBB0_9: @ %for.body10.2 ; CHECK-NEXT: @ Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: @ Parent Loop BB0_4 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: str r2, [r11, #-4] -; CHECK-NEXT: add.w r6, r2, #396 +; CHECK-NEXT: str r7, [r11, #-4] +; CHECK-NEXT: add.w r6, r7, #396 ; CHECK-NEXT: ldrb r4, [r3, #-1] -; CHECK-NEXT: add.w r2, r2, #792 +; CHECK-NEXT: add.w r7, r7, #792 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: ite ne ; CHECK-NEXT: sxthne r4, r1 @@ -146,12 +147,11 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) { ; CHECK-NEXT: le lr, .LBB0_9 ; CHECK-NEXT: @ %bb.10: @ %for.cond.cleanup9.2 ; CHECK-NEXT: @ in Loop: Header=BB0_4 Depth=2 -; CHECK-NEXT: add.w r2, r10, #3 +; CHECK-NEXT: add.w r3, r10, #3 ; CHECK-NEXT: add.w r12, r12, #66 ; CHECK-NEXT: adds r0, #66 ; CHECK-NEXT: add.w r8, r8, #66 -; CHECK-NEXT: uxtb.w r10, r2 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: uxtb.w r10, r3 ; CHECK-NEXT: cmp.w r10, #18 ; CHECK-NEXT: blo.w .LBB0_4 ; CHECK-NEXT: b .LBB0_3 @@ -355,8 +355,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: subs r1, r0, #1 ; CHECK-NEXT: sbcs r1, r12, #0 @@ -365,31 +365,31 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: csel lr, r2, r3, lt -; CHECK-NEXT: movw r4, #43691 +; CHECK-NEXT: mov r7, r2 ; CHECK-NEXT: mov r1, lr ; CHECK-NEXT: cmp.w lr, #3 ; CHECK-NEXT: it ls ; CHECK-NEXT: movls r1, #3 -; CHECK-NEXT: movt r4, #43690 +; CHECK-NEXT: movw r2, #43691 ; CHECK-NEXT: sub.w r1, r1, lr -; CHECK-NEXT: ldr r6, [sp, #128] +; CHECK-NEXT: movt r2, #43690 ; CHECK-NEXT: adds r1, #2 +; CHECK-NEXT: ldr r4, [sp, #120] ; CHECK-NEXT: movw r8, :lower16:c -; CHECK-NEXT: movt r8, :upper16:c ; CHECK-NEXT: mov.w r9, #12 -; CHECK-NEXT: umull r1, r4, r1, r4 +; CHECK-NEXT: umull r1, r2, r1, r2 +; CHECK-NEXT: movt r8, :upper16:c +; CHECK-NEXT: movs r1, #4 ; CHECK-NEXT: @ implicit-def: $r10 ; CHECK-NEXT: @ implicit-def: $r5 ; CHECK-NEXT: @ implicit-def: $r11 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: movs r1, #4 -; CHECK-NEXT: strd r2, r12, [sp, #4] @ 8-byte Folded Spill -; CHECK-NEXT: add.w r3, r3, r4, lsr #1 -; CHECK-NEXT: add.w r1, r1, r4, lsr #1 -; CHECK-NEXT: movw r4, #65532 +; CHECK-NEXT: strd r12, r0, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: add.w r3, r3, r2, lsr #1 +; CHECK-NEXT: add.w r1, r1, r2, lsr #1 +; CHECK-NEXT: movw r2, #65532 ; CHECK-NEXT: vdup.32 q6, r3 -; CHECK-NEXT: movt r4, #32767 -; CHECK-NEXT: and.w r7, r1, r4 +; CHECK-NEXT: movt r2, #32767 +; CHECK-NEXT: and.w r6, r1, r2 ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vdup.32 q7, r3 ; CHECK-NEXT: vldrw.u32 q0, [r1] @@ -417,13 +417,13 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: subs r1, r2, r1 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: add.w r11, r0, #7 -; CHECK-NEXT: ldrd r12, r0, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r12, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 ; CHECK-NEXT: adds r5, #2 ; CHECK-NEXT: subs r1, r5, r0 -; 
CHECK-NEXT: asr.w r3, r5, #31 -; CHECK-NEXT: sbcs.w r1, r3, r12 +; CHECK-NEXT: asr.w r2, r5, #31 +; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: bge.w .LBB1_28 ; CHECK-NEXT: .LBB1_4: @ %for.cond2.preheader ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -439,17 +439,15 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: bhi .LBB1_15 ; CHECK-NEXT: @ %bb.6: @ %for.body6.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: ldrd r2, r3, [sp, #120] +; CHECK-NEXT: ldrd r2, r3, [sp, #112] ; CHECK-NEXT: movs r0, #32 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: mov r4, r7 -; CHECK-NEXT: mov r7, lr +; CHECK-NEXT: mov r4, lr ; CHECK-NEXT: bl __aeabi_ldivmod +; CHECK-NEXT: mov lr, r4 +; CHECK-NEXT: ldr r4, [sp, #120] +; CHECK-NEXT: ldrd r12, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldrd r2, r12, [sp, #4] @ 8-byte Folded Reload -; CHECK-NEXT: mov lr, r7 -; CHECK-NEXT: mov r7, r4 ; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: b .LBB1_8 ; CHECK-NEXT: .LBB1_7: @ %for.cond.cleanup17.us @@ -465,23 +463,23 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Child Loop BB1_10 Depth 3 ; CHECK-NEXT: @ Child Loop BB1_12 Depth 3 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: cbz r2, .LBB1_11 +; CHECK-NEXT: cbz r7, .LBB1_11 ; CHECK-NEXT: @ %bb.9: @ %for.body13.us51.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: movw r4, :lower16:a +; CHECK-NEXT: movw r2, :lower16:a ; CHECK-NEXT: vmov q1, q4 -; CHECK-NEXT: movt r4, :upper16:a -; CHECK-NEXT: str r1, [r4] -; CHECK-NEXT: movw r4, :lower16:b -; CHECK-NEXT: movt r4, :upper16:b -; CHECK-NEXT: str r1, [r4] -; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: movt r2, :upper16:a +; CHECK-NEXT: str r1, [r2] +; CHECK-NEXT: movw r2, :lower16:b +; CHECK-NEXT: movt r2, :upper16:b +; CHECK-NEXT: str r1, [r2] +; CHECK-NEXT: mov r2, r6 ; CHECK-NEXT: .LBB1_10: @ %vector.body111 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 ; CHECK-NEXT: vqadd.u32 q2, q5, r1 -; CHECK-NEXT: subs r4, #4 +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vcmp.u32 hi, q7, q2 ; CHECK-NEXT: vshl.i32 q2, q1, #2 ; CHECK-NEXT: add.w r1, r1, #4 @@ -493,14 +491,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: b .LBB1_13 ; CHECK-NEXT: .LBB1_11: @ %vector.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: mov r2, r6 ; CHECK-NEXT: vmov q1, q4 ; CHECK-NEXT: .LBB1_12: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 ; CHECK-NEXT: vqadd.u32 q2, q5, r1 -; CHECK-NEXT: subs r4, #4 +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vcmp.u32 hi, q6, q2 ; CHECK-NEXT: vshl.i32 q2, q1, #2 ; CHECK-NEXT: add.w r1, r1, #4 @@ -511,7 +509,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: bne .LBB1_12 ; CHECK-NEXT: .LBB1_13: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: beq .LBB1_7 ; CHECK-NEXT: @ %bb.14: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 @@ -521,11 +519,11 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: b .LBB1_26 ; CHECK-NEXT: .LBB1_15: @ %for.body6.lr.ph.split ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r4, #0 ; 
CHECK-NEXT: beq.w .LBB1_2 ; CHECK-NEXT: @ %bb.16: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: ldrd r12, r0, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: ldrd r12, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: .LBB1_17: @ %for.body6.us60 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -533,35 +531,35 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: bne .LBB1_27 ; CHECK-NEXT: @ %bb.18: @ %for.cond.cleanup17.us63 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: cmn.w r3, #4 +; CHECK-NEXT: cmn.w r2, #4 ; CHECK-NEXT: bge .LBB1_22 ; CHECK-NEXT: @ %bb.19: @ %for.cond.cleanup17.us63.1 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: cmn.w r3, #12 +; CHECK-NEXT: cmn.w r2, #12 ; CHECK-NEXT: bgt .LBB1_23 ; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63.2 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: cmn.w r3, #19 +; CHECK-NEXT: cmn.w r2, #19 ; CHECK-NEXT: bgt .LBB1_24 ; CHECK-NEXT: @ %bb.21: @ %for.cond.cleanup17.us63.3 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: add.w r11, r3, #28 -; CHECK-NEXT: cmn.w r3, #25 +; CHECK-NEXT: add.w r11, r2, #28 +; CHECK-NEXT: cmn.w r2, #25 ; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: blt .LBB1_17 ; CHECK-NEXT: b .LBB1_3 ; CHECK-NEXT: .LBB1_22: @ %for.cond.cleanup5.loopexit134.split.loop.exit139 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r11, r3, #7 +; CHECK-NEXT: add.w r11, r2, #7 ; CHECK-NEXT: b .LBB1_25 ; CHECK-NEXT: .LBB1_23: @ %for.cond.cleanup5.loopexit134.split.loop.exit137 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r11, r3, #14 +; CHECK-NEXT: add.w r11, r2, #14 ; CHECK-NEXT: b .LBB1_25 ; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit135 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r11, r3, #21 +; CHECK-NEXT: add.w r11, r2, #21 ; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 ; CHECK-NEXT: mov.w r10, #0 @@ -573,7 +571,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: b .LBB1_27 ; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll index 11e604b5079b89..b605ca2c6a1ec4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll @@ -128,29 +128,29 @@ entry: define arm_aapcs_vfpcc <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { ; CHECK-LABEL: complex_mul_v32f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: add r0, sp, #48 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add r0, sp, #64 -; CHECK-NEXT: vcmul.f16 q0, q0, q5, #0 -; CHECK-NEXT: vcmla.f16 q0, q4, q5, #90 ; CHECK-NEXT: vldrw.u32 q5, [r0] ; CHECK-NEXT: add r0, sp, #80 -; CHECK-NEXT: vcmul.f16 q4, q1, q5, #0 -; 
CHECK-NEXT: vcmla.f16 q4, q1, q5, #90 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmul.f16 q4, q0, q5, #0 +; CHECK-NEXT: vcmla.f16 q4, q0, q5, #90 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: add r0, sp, #96 -; CHECK-NEXT: vcmul.f16 q5, q2, q1, #0 -; CHECK-NEXT: vcmla.f16 q5, q2, q1, #90 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov q2, q5 -; CHECK-NEXT: vcmul.f16 q6, q3, q1, #0 -; CHECK-NEXT: vcmla.f16 q6, q3, q1, #90 -; CHECK-NEXT: vmov q1, q4 -; CHECK-NEXT: vmov q3, q6 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vcmul.f16 q5, q1, q0, #0 +; CHECK-NEXT: vcmla.f16 q5, q1, q0, #90 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vcmul.f16 q6, q2, q0, #0 +; CHECK-NEXT: vcmla.f16 q6, q2, q0, #90 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q2, q6 +; CHECK-NEXT: vcmul.f16 q7, q3, q0, #0 +; CHECK-NEXT: vcmla.f16 q7, q3, q0, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q3, q7 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll index 15859cd6fa1825..0f4addff0ddcbe 100644 --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll @@ -49,29 +49,29 @@ entry: define arm_aapcs_vfpcc <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: complex_add_v8f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: add r0, sp, #32 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: add r0, sp, #48 -; CHECK-NEXT: vadd.f64 d1, d1, d2 -; CHECK-NEXT: vsub.f64 d0, d0, d3 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add r0, sp, #64 -; CHECK-NEXT: vadd.f64 d3, d3, d8 -; CHECK-NEXT: vsub.f64 d2, d2, d9 ; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: add r0, sp, #80 -; CHECK-NEXT: vadd.f64 d9, d9, d4 -; CHECK-NEXT: vsub.f64 d8, d8, d5 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vadd.f64 d11, d5, d6 -; CHECK-NEXT: vsub.f64 d10, d4, d7 -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmov q3, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vadd.f64 d9, d9, d0 +; CHECK-NEXT: vsub.f64 d8, d8, d1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #96 +; CHECK-NEXT: vadd.f64 d11, d1, d2 +; CHECK-NEXT: vsub.f64 d10, d0, d3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vadd.f64 d13, d1, d4 +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vsub.f64 d12, d0, d5 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vadd.f64 d15, d1, d6 +; CHECK-NEXT: vmov q2, q6 +; CHECK-NEXT: vsub.f64 d14, d0, d7 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q3, q7 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll index dab66d0e37f900..adb74c7b12a2a8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll @@ -34,16 +34,16 @@ define arm_aapcs_vfpcc <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x dou ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmul.f64 d9, d7, d2 -; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmul.f64 d8, d3, d7 -; CHECK-NEXT: vfma.f64 d9, d6, d3 -; CHECK-NEXT: vfnms.f64 d8, d6, d2 -; CHECK-NEXT: vmul.f64 d1, d5, d10 -; CHECK-NEXT: vmov q1, q4 -; CHECK-NEXT: vmul.f64 d0, d11, d5 -; CHECK-NEXT: vfma.f64 d1, d4, d11 -; CHECK-NEXT: vfnms.f64 d0, d4, d10 +; CHECK-NEXT: vmul.f64 d9, d5, d0 +; CHECK-NEXT: vmul.f64 d8, d1, d5 +; CHECK-NEXT: vmul.f64 d11, d7, d2 +; CHECK-NEXT: vmul.f64 d10, d3, d7 +; CHECK-NEXT: vfma.f64 d9, d4, d1 +; CHECK-NEXT: vfnms.f64 d8, d4, d0 +; CHECK-NEXT: vfma.f64 d11, d6, d3 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vfnms.f64 d10, d6, d2 +; CHECK-NEXT: vmov q1, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll index 2f09c98891d035..6e6fe75e9da899 100644 --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll @@ -43,21 +43,21 @@ define arm_aapcs_vfpcc <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vsub.f32 q3, q1, q2 ; CHECK-NEXT: vsub.f32 q0, q1, q0 -; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s4, s9 ; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vmov.f32 s17, s11 +; CHECK-NEXT: vmov.f32 s5, s11 ; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmul.f32 q1, q3, q4 +; CHECK-NEXT: vmul.f32 q4, q3, q1 ; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vfma.f32 q1, q2, q0 -; CHECK-NEXT: vmul.f32 q0, q4, q0 -; CHECK-NEXT: vneg.f32 q4, q0 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vfma.f32 q4, q2, q3 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s0, s16 -; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vfma.f32 q4, q2, q0 +; CHECK-NEXT: vmul.f32 q0, q1, q0 +; CHECK-NEXT: vneg.f32 q1, q0 +; CHECK-NEXT: vmov.f32 s1, s16 +; CHECK-NEXT: vfma.f32 q1, q2, q3 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -81,41 +81,35 @@ entry: define arm_aapcs_vfpcc <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_mul270_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d12} -; CHECK-NEXT: vpush {d12} -; CHECK-NEXT: .vsave {d10} -; CHECK-NEXT: vpush {d10} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vmov.f32 s20, s4 -; CHECK-NEXT: vmov.f32 s16, s8 -; CHECK-NEXT: vmov.f32 s17, s10 -; CHECK-NEXT: vmov.f32 s21, s6 -; CHECK-NEXT: vmul.f32 q3, q5, q4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov.f32 s12, s8 +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vmov.f32 s13, s10 +; CHECK-NEXT: vmov.f32 s17, s6 +; CHECK-NEXT: vmul.f32 q5, q4, q3 +; CHECK-NEXT: vmov.f32 s8, s9 ; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vneg.f32 q3, q3 -; CHECK-NEXT: vmov.f32 s24, s9 -; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vneg.f32 q5, q5 +; CHECK-NEXT: vmov.f32 s9, s11 ; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vmul.f32 q2, q1, q4 -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vfma.f32 
q3, q1, q6 -; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vfma.f32 q5, q1, q2 +; CHECK-NEXT: vmul.f32 q1, q1, q3 +; CHECK-NEXT: vfma.f32 q1, q4, q2 +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s9, s2 ; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vfma.f32 q2, q5, q6 -; CHECK-NEXT: vmul.f32 q1, q3, q4 +; CHECK-NEXT: vmul.f32 q3, q5, q2 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vfma.f32 q1, q2, q0 -; CHECK-NEXT: vmul.f32 q0, q3, q0 -; CHECK-NEXT: vneg.f32 q3, q0 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vfma.f32 q3, q2, q4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s0, s12 -; CHECK-NEXT: vmov.f32 s2, s13 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: vpop {d10} -; CHECK-NEXT: vpop {d12} +; CHECK-NEXT: vfma.f32 q3, q1, q0 +; CHECK-NEXT: vmul.f32 q0, q5, q0 +; CHECK-NEXT: vneg.f32 q4, q0 +; CHECK-NEXT: vmov.f32 s1, s12 +; CHECK-NEXT: vfma.f32 q4, q1, q2 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov.f32 s0, s16 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> @@ -274,29 +268,29 @@ define arm_aapcs_vfpcc <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x flo ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.f32 s20, s5 -; CHECK-NEXT: vmov.f32 s17, s2 -; CHECK-NEXT: vmov.f32 s21, s7 +; CHECK-NEXT: vmov.f32 s12, s0 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s2 +; CHECK-NEXT: vmov.f32 s17, s7 ; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmul.f32 q3, q5, q4 -; CHECK-NEXT: vmul.f32 q4, q1, q4 +; CHECK-NEXT: vmul.f32 q5, q4, q3 +; CHECK-NEXT: vmul.f32 q3, q1, q3 ; CHECK-NEXT: vmov.f32 s0, s1 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov q6, q4 -; CHECK-NEXT: vfms.f32 q6, q5, q0 -; CHECK-NEXT: vmov q7, q3 -; CHECK-NEXT: vfma.f32 q3, q1, q0 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s10 +; CHECK-NEXT: vmov q6, q3 +; CHECK-NEXT: vfms.f32 q6, q4, q0 +; CHECK-NEXT: vmov q7, q5 +; CHECK-NEXT: vfma.f32 q5, q1, q0 +; CHECK-NEXT: vmov.f32 s16, s8 +; CHECK-NEXT: vmov.f32 s17, s10 ; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vfma.f32 q7, q5, q0 +; CHECK-NEXT: vfma.f32 q7, q4, q0 ; CHECK-NEXT: vmov.f32 s5, s11 -; CHECK-NEXT: vadd.f32 q5, q7, q6 -; CHECK-NEXT: vfms.f32 q4, q1, q0 -; CHECK-NEXT: vmov.f32 s1, s20 -; CHECK-NEXT: vsub.f32 q1, q4, q3 -; CHECK-NEXT: vmov.f32 s3, s21 +; CHECK-NEXT: vadd.f32 q4, q7, q6 +; CHECK-NEXT: vfms.f32 q3, q1, q0 +; CHECK-NEXT: vmov.f32 s1, s16 +; CHECK-NEXT: vsub.f32 q1, q3, q5 +; CHECK-NEXT: vmov.f32 s3, s17 ; CHECK-NEXT: vmov.f32 s0, s4 ; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -335,32 +329,32 @@ define arm_aapcs_vfpcc <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vmov.f32 s12, s4 ; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s17, s6 +; CHECK-NEXT: vmov.f32 s13, s6 ; CHECK-NEXT: vmov.f32 s9, s3 ; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmul.f32 q3, q2, q4 +; CHECK-NEXT: vmul.f32 q4, q2, q3 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vfma.f32 q3, q1, q0 +; CHECK-NEXT: vfma.f32 q4, q1, q0 ; CHECK-NEXT: vmul.f32 q1, q1, q2 ; CHECK-NEXT: vneg.f32 q1, q1 -; CHECK-NEXT: vfma.f32 q1, q4, q0 -; CHECK-NEXT: vmov.f32 s18, 
s12 -; CHECK-NEXT: vmov.f32 s16, s4 -; CHECK-NEXT: vmov.f32 s17, s5 -; CHECK-NEXT: vmov.f32 s19, s13 -; CHECK-NEXT: vstrw.32 q4, [r0] -; CHECK-NEXT: vmul.f32 q4, q3, q0 -; CHECK-NEXT: vfma.f32 q4, q1, q2 -; CHECK-NEXT: vmul.f32 q2, q3, q2 +; CHECK-NEXT: vfma.f32 q1, q3, q0 +; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vmov.f32 s12, s4 +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vmov.f32 s15, s17 +; CHECK-NEXT: vstrw.32 q3, [r0] +; CHECK-NEXT: vmul.f32 q3, q4, q0 +; CHECK-NEXT: vfma.f32 q3, q1, q2 +; CHECK-NEXT: vmul.f32 q2, q4, q2 ; CHECK-NEXT: vneg.f32 q2, q2 ; CHECK-NEXT: vfma.f32 q2, q1, q0 -; CHECK-NEXT: vmov.f32 s1, s16 +; CHECK-NEXT: vmov.f32 s1, s12 ; CHECK-NEXT: vmov.f32 s0, s8 ; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vmov.f32 s3, s13 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -503,24 +497,24 @@ define <4 x float> @mul_divequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) ; CHECK-NEXT: add r0, sp, #24 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: add.w r12, sp, #40 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmul.f32 q5, q3, q0 +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vmul.f32 q5, q1, q0 ; CHECK-NEXT: vmov.f32 s17, s3 ; CHECK-NEXT: vfma.f32 q5, q2, q4 -; CHECK-NEXT: vmul.f32 q3, q4, q3 -; CHECK-NEXT: vdiv.f32 s3, s21, s7 -; CHECK-NEXT: vneg.f32 q3, q3 -; CHECK-NEXT: vfma.f32 q3, q2, q0 -; CHECK-NEXT: vdiv.f32 s1, s20, s5 -; CHECK-NEXT: vdiv.f32 s2, s13, s6 -; CHECK-NEXT: vdiv.f32 s0, s12, s4 +; CHECK-NEXT: vmul.f32 q1, q4, q1 +; CHECK-NEXT: vdiv.f32 s3, s21, s15 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q2, q0 +; CHECK-NEXT: vdiv.f32 s1, s20, s13 +; CHECK-NEXT: vdiv.f32 s2, s5, s14 +; CHECK-NEXT: vdiv.f32 s0, s4, s12 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: vpop {d8} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll index c5f61b7fcdde55..386d179b7d0ad5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll @@ -208,80 +208,83 @@ entry: define arm_aapcs_vfpcc <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) { ; CHECK-LABEL: abp90c12: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vldr s23, [sp, #140] +; CHECK-NEXT: .vsave {d14, d15} +; CHECK-NEXT: vpush {d14, d15} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: vldr s23, [sp, #116] ; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vldr s22, [sp, #132] +; CHECK-NEXT: vldr s22, [sp, #108] +; CHECK-NEXT: vmov.f32 s16, s12 ; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vldr s19, [sp, #112] ; CHECK-NEXT: vmov.f32 s13, s10 -; CHECK-NEXT: vldr s19, [sp, #136] +; CHECK-NEXT: vldr s18, [sp, #104] +; CHECK-NEXT: vmov.f32 s12, s8 +; CHECK-NEXT: vldr s31, [sp, #164] ; CHECK-NEXT: vmov.f32 s11, 
s6 -; CHECK-NEXT: vldr s18, [sp, #128] +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vldr s31, [sp, #188] +; CHECK-NEXT: vldr s30, [sp, #156] ; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vldr s30, [sp, #180] -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vldr s29, [sp, #172] +; CHECK-NEXT: vldr s29, [sp, #148] ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vldr s28, [sp, #164] +; CHECK-NEXT: vldr s3, [sp, #160] ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmov.f32 s24, s9 -; CHECK-NEXT: vmov.f32 s16, s12 -; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s12, s8 -; CHECK-NEXT: vldr s27, [sp, #184] +; CHECK-NEXT: vldr s1, [sp, #144] +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vldr s28, [sp, #140] ; CHECK-NEXT: vmov.f32 s17, s14 -; CHECK-NEXT: vldr s26, [sp, #176] +; CHECK-NEXT: vmul.f32 q3, q5, q1 +; CHECK-NEXT: vmov.f32 s24, s9 +; CHECK-NEXT: vneg.f32 q3, q3 ; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vldr s25, [sp, #168] +; CHECK-NEXT: vldr s2, [sp, #152] ; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmul.f32 q0, q5, q1 ; CHECK-NEXT: vmul.f32 q1, q4, q1 -; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: vldr s24, [sp, #160] +; CHECK-NEXT: vldr s0, [sp, #136] +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill ; CHECK-NEXT: vfma.f32 q1, q5, q2 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vsub.f32 q6, q6, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldr s13, [sp, #156] +; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vsub.f32 q5, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vldr s25, [sp, #132] ; CHECK-NEXT: vfma.f32 q1, q4, q2 -; CHECK-NEXT: vldr s12, [sp, #148] +; CHECK-NEXT: vldr s24, [sp, #124] +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 q1, q7, q1 +; CHECK-NEXT: vldr s13, [sp, #128] +; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmul.f32 q2, q6, q2 +; CHECK-NEXT: vldr s12, [sp, #120] ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldr s1, [sp, #152] -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill -; CHECK-NEXT: vmul.f32 q2, q3, q7 -; CHECK-NEXT: vldr s0, [sp, #144] -; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vneg.f32 q2, q2 -; CHECK-NEXT: vldr s21, [sp, #200] -; CHECK-NEXT: vfma.f32 q2, q0, q3 -; CHECK-NEXT: vmul.f32 q0, q0, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload -; CHECK-NEXT: vldr s20, [sp, #192] -; CHECK-NEXT: vldr s17, [sp, #204] -; CHECK-NEXT: vldr s16, [sp, #196] -; CHECK-NEXT: vfma.f32 q0, q7, q3 -; CHECK-NEXT: vsub.f32 q3, q5, q0 +; CHECK-NEXT: vfma.f32 q2, q3, q4 +; CHECK-NEXT: vldr s1, [sp, #176] +; CHECK-NEXT: vmul.f32 q3, q3, q7 +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldr s0, [sp, #168] +; CHECK-NEXT: vldr s17, [sp, #180] +; CHECK-NEXT: vfma.f32 q3, q6, q7 +; CHECK-NEXT: vldr s16, [sp, #172] +; CHECK-NEXT: vsub.f32 q3, q0, q3 ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vadd.f32 q4, q4, q2 ; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s0, s24 -; CHECK-NEXT: vmov.f32 s2, s25 -; CHECK-NEXT: vmov.f32 s4, s26 -; CHECK-NEXT: vmov.f32 s6, s27 +; CHECK-NEXT: vmov.f32 s0, s20 +; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov.f32 s4, s22 +; CHECK-NEXT: vmov.f32 s6, s23 ; CHECK-NEXT: vmov.f32 s8, s12 ; CHECK-NEXT: vmov.f32 s9, s16 ; CHECK-NEXT: vmov.f32 s10, s13 ; 
CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: add sp, #64 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpop {d14, d15} ; CHECK-NEXT: bx lr entry: %ar = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll index 724bd4f7963b81..765851ffa05339 100644 --- a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll @@ -7,50 +7,50 @@ define arm_aapcs_vfpcc <2 x i64> @ctpop_2i64_t(<2 x i64> %src){ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: mov.w lr, #1431655765 ; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: mov.w r12, #858993459 ; CHECK-NEXT: vldr s1, .LCPI0_0 ; CHECK-NEXT: vmov.f32 s3, s1 -; CHECK-NEXT: and.w r0, lr, r2, lsr #1 -; CHECK-NEXT: subs r0, r2, r0 -; CHECK-NEXT: and.w r2, r12, r0, lsr #2 -; CHECK-NEXT: bic r0, r0, #-858993460 -; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: and.w r2, lr, r1, lsr #1 ; CHECK-NEXT: subs r1, r1, r2 -; CHECK-NEXT: add.w r0, r0, r0, lsr #4 ; CHECK-NEXT: and.w r2, r12, r1, lsr #2 ; CHECK-NEXT: bic r1, r1, #-858993460 ; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: and.w r2, lr, r0, lsr #1 +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: add.w r1, r1, r1, lsr #4 +; CHECK-NEXT: and.w r2, r12, r0, lsr #2 +; CHECK-NEXT: bic r0, r0, #-858993460 +; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: and.w r2, lr, r3, lsr #1 ; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: bic r5, r0, #-252645136 -; CHECK-NEXT: add.w r1, r1, r1, lsr #4 -; CHECK-NEXT: mov.w r0, #16843009 +; CHECK-NEXT: bic r5, r1, #-252645136 +; CHECK-NEXT: add.w r0, r0, r0, lsr #4 +; CHECK-NEXT: mov.w r1, #16843009 ; CHECK-NEXT: and.w r3, r12, r2, lsr #2 ; CHECK-NEXT: bic r2, r2, #-858993460 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: and.w r3, lr, r4, lsr #1 ; CHECK-NEXT: subs r3, r4, r3 -; CHECK-NEXT: bic r1, r1, #-252645136 +; CHECK-NEXT: bic r0, r0, #-252645136 ; CHECK-NEXT: add.w r2, r2, r2, lsr #4 -; CHECK-NEXT: muls r5, r0, r5 +; CHECK-NEXT: muls r5, r1, r5 ; CHECK-NEXT: and.w r4, r12, r3, lsr #2 ; CHECK-NEXT: bic r3, r3, #-858993460 -; CHECK-NEXT: bic r2, r2, #-252645136 +; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: add r3, r4 -; CHECK-NEXT: muls r1, r0, r1 +; CHECK-NEXT: bic r2, r2, #-252645136 ; CHECK-NEXT: add.w r3, r3, r3, lsr #4 -; CHECK-NEXT: muls r2, r0, r2 ; CHECK-NEXT: bic r3, r3, #-252645136 -; CHECK-NEXT: muls r0, r3, r0 -; CHECK-NEXT: lsrs r1, r1, #24 -; CHECK-NEXT: add.w r1, r1, r5, lsr #24 +; CHECK-NEXT: muls r2, r1, r2 +; CHECK-NEXT: muls r1, r3, r1 +; CHECK-NEXT: lsrs r0, r0, #24 +; CHECK-NEXT: add.w r0, r0, r5, lsr #24 +; CHECK-NEXT: vmov s2, r0 ; CHECK-NEXT: lsrs r2, r2, #24 -; CHECK-NEXT: vmov s2, r1 -; CHECK-NEXT: add.w r0, r2, r0, lsr #24 +; CHECK-NEXT: add.w r0, r2, r1, lsr #24 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll index 939ab71a8061c5..fa9e95624b8f56 100644 --- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll +++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -8,14 +8,14 @@ define arm_aapcs_vfpcc <4 x i32> @udiv_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r0, r12, d3 -; CHECK-NEXT: vmov r2, lr, d1 -; CHECK-NEXT: vmov r1, r3, d2 +; 
CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r1, lr, d2 ; CHECK-NEXT: udiv r0, r2, r0 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: udiv r1, r4, r1 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: udiv r0, lr, r12 -; CHECK-NEXT: udiv r1, r5, r3 +; CHECK-NEXT: udiv r0, r3, r12 +; CHECK-NEXT: udiv r1, r5, lr ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -29,14 +29,14 @@ define arm_aapcs_vfpcc <4 x i32> @sdiv_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r0, r12, d3 -; CHECK-NEXT: vmov r2, lr, d1 -; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r1, lr, d2 ; CHECK-NEXT: sdiv r0, r2, r0 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: sdiv r1, r4, r1 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: sdiv r0, lr, r12 -; CHECK-NEXT: sdiv r1, r5, r3 +; CHECK-NEXT: sdiv r0, r3, r12 +; CHECK-NEXT: sdiv r1, r5, lr ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -195,15 +195,15 @@ define arm_aapcs_vfpcc <8 x i16> @urem_i16(<8 x i16> %in1, <8 x i16> %in2) { ; CHECK-NEXT: vmov.u16 r2, q1[4] ; CHECK-NEXT: vmov.u16 r3, q0[4] ; CHECK-NEXT: udiv r0, r3, r2 -; CHECK-NEXT: mls r2, r0, r2, r3 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: mls r0, r0, r2, r3 +; CHECK-NEXT: vmov.u16 r2, q1[5] ; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: udiv r1, r3, r0 -; CHECK-NEXT: mls r0, r1, r0, r3 -; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: udiv r1, r3, r2 +; CHECK-NEXT: mls r1, r1, r2, r3 +; CHECK-NEXT: vmov.u16 r2, q1[2] ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: udiv r4, r3, r1 -; CHECK-NEXT: mls r1, r4, r1, r3 +; CHECK-NEXT: udiv r4, r3, r2 +; CHECK-NEXT: mls r2, r4, r2, r3 ; CHECK-NEXT: vmov.u16 r3, q1[3] ; CHECK-NEXT: vmov.u16 r4, q0[3] ; CHECK-NEXT: udiv r5, r4, r3 @@ -218,10 +218,10 @@ define arm_aapcs_vfpcc <8 x i16> @urem_i16(<8 x i16> %in1, <8 x i16> %in2) { ; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: mls r5, r7, r5, r6 ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov.16 q0[2], r2 ; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 ; CHECK-NEXT: vmov.16 q0[6], r12 ; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} @@ -246,15 +246,15 @@ define arm_aapcs_vfpcc <8 x i16> @srem_i16(<8 x i16> %in1, <8 x i16> %in2) { ; CHECK-NEXT: vmov.s16 r2, q1[4] ; CHECK-NEXT: vmov.s16 r3, q0[4] ; CHECK-NEXT: sdiv r0, r3, r2 -; CHECK-NEXT: mls r2, r0, r2, r3 -; CHECK-NEXT: vmov.s16 r0, q1[5] +; CHECK-NEXT: mls r0, r0, r2, r3 +; CHECK-NEXT: vmov.s16 r2, q1[5] ; CHECK-NEXT: vmov.s16 r3, q0[5] -; CHECK-NEXT: sdiv r1, r3, r0 -; CHECK-NEXT: mls r0, r1, r0, r3 -; CHECK-NEXT: vmov.s16 r1, q1[2] +; CHECK-NEXT: sdiv r1, r3, r2 +; CHECK-NEXT: mls r1, r1, r2, r3 +; CHECK-NEXT: vmov.s16 r2, q1[2] ; CHECK-NEXT: vmov.s16 r3, q0[2] -; CHECK-NEXT: sdiv r4, r3, r1 -; CHECK-NEXT: mls r1, r4, r1, r3 +; CHECK-NEXT: sdiv r4, r3, r2 +; CHECK-NEXT: mls r2, r4, r2, r3 ; CHECK-NEXT: vmov.s16 r3, q1[3] ; CHECK-NEXT: vmov.s16 r4, q0[3] ; CHECK-NEXT: sdiv r5, r4, r3 @@ -269,10 +269,10 @@ define arm_aapcs_vfpcc <8 x i16> @srem_i16(<8 x i16> %in1, <8 x i16> %in2) { ; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: mls r5, r7, r5, r6 ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov.16 q0[2], r2 ; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: vmov.16 
q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 ; CHECK-NEXT: vmov.16 q0[6], r12 ; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index 88131fcf21a923..4418388499c64e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -747,24 +747,24 @@ define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readon ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrh.w r9, [r0] -; CHECK-NEXT: ldr.w r10, [r0, #4] -; CHECK-NEXT: sub.w r6, r9, #1 +; CHECK-NEXT: ldrh.w r10, [r0] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: sub.w r6, r10, #1 ; CHECK-NEXT: cmp r6, #3 ; CHECK-NEXT: bhi .LBB15_6 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: ldr r7, [r0, #8] -; CHECK-NEXT: add.w r4, r10, r6, lsl #1 +; CHECK-NEXT: add.w r4, r12, r6, lsl #1 ; CHECK-NEXT: lsrs r5, r3, #2 ; CHECK-NEXT: ldrh.w r8, [r7, #6] -; CHECK-NEXT: ldrh.w r12, [r7, #4] +; CHECK-NEXT: ldrh.w r9, [r7, #4] ; CHECK-NEXT: ldrh r6, [r7, #2] ; CHECK-NEXT: ldrh r7, [r7] ; CHECK-NEXT: wls lr, r5, .LBB15_5 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph -; CHECK-NEXT: str.w r9, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str.w r10, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r5, r3, #3 -; CHECK-NEXT: add.w r9, r10, #2 +; CHECK-NEXT: add.w r10, r12, #2 ; CHECK-NEXT: str r5, [sp] @ 4-byte Spill ; CHECK-NEXT: add.w r5, r2, r5, lsl #1 ; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill @@ -772,71 +772,71 @@ define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readon ; CHECK-NEXT: .LBB15_3: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #8 -; CHECK-NEXT: sub.w r11, r9, #2 -; CHECK-NEXT: add.w r5, r9, #2 +; CHECK-NEXT: sub.w r11, r10, #2 +; CHECK-NEXT: add.w r5, r10, #2 ; CHECK-NEXT: vstrb.8 q0, [r4], #8 ; CHECK-NEXT: vldrw.u32 q0, [r11] -; CHECK-NEXT: vldrw.u32 q1, [r9] +; CHECK-NEXT: vldrw.u32 q1, [r10] ; CHECK-NEXT: vmul.f16 q0, q0, r7 ; CHECK-NEXT: vfma.f16 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vfma.f16 q0, q1, r12 -; CHECK-NEXT: vldrw.u32 q1, [r9, #4] -; CHECK-NEXT: add.w r9, r9, #8 +; CHECK-NEXT: vfma.f16 q0, q1, r9 +; CHECK-NEXT: vldrw.u32 q1, [r10, #4] +; CHECK-NEXT: add.w r10, r10, #8 ; CHECK-NEXT: vfma.f16 q0, q1, r8 ; CHECK-NEXT: vstrb.8 q0, [r2], #8 ; CHECK-NEXT: le lr, .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add.w r10, r10, r2, lsl #1 +; CHECK-NEXT: ldr.w r10, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r12, r12, r2, lsl #1 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1 ; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB15_5: @ %while.end ; CHECK-NEXT: and r5, r3, #3 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vctp.16 r5 -; CHECK-NEXT: add.w r1, r10, #2 +; CHECK-NEXT: add.w r1, r12, #2 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrht.16 q0, [r4] -; CHECK-NEXT: vldrw.u32 q0, [r10] +; CHECK-NEXT: vldrw.u32 q0, [r12] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: add.w r1, r10, #6 +; CHECK-NEXT: add.w r1, r12, #6 ; CHECK-NEXT: vmul.f16 q0, q0, r7 ; CHECK-NEXT: vfma.f16 q0, q1, r6 -; CHECK-NEXT: vldrw.u32 q1, [r10, #4] -; CHECK-NEXT: vfma.f16 q0, q1, 
r12 +; CHECK-NEXT: vldrw.u32 q1, [r12, #4] +; CHECK-NEXT: vfma.f16 q0, q1, r9 ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vfma.f16 q0, q1, r8 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrht.16 q0, [r2] -; CHECK-NEXT: ldr.w r10, [r0, #4] +; CHECK-NEXT: ldr.w r12, [r0, #4] ; CHECK-NEXT: .LBB15_6: @ %if.end -; CHECK-NEXT: add.w r0, r10, r3, lsl #1 -; CHECK-NEXT: lsr.w r1, r9, #2 +; CHECK-NEXT: add.w r0, r12, r3, lsl #1 +; CHECK-NEXT: lsr.w r1, r10, #2 ; CHECK-NEXT: wls lr, r1, .LBB15_10 ; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader -; CHECK-NEXT: bic r2, r9, #3 -; CHECK-NEXT: adds r1, r2, r3 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: add.w r1, r10, r1, lsl #1 +; CHECK-NEXT: bic r1, r10, #3 +; CHECK-NEXT: adds r2, r1, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: add.w r2, r12, r2, lsl #1 ; CHECK-NEXT: .LBB15_8: @ %while.body51 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 ; CHECK-NEXT: vstrb.8 q0, [r3], #8 ; CHECK-NEXT: le lr, .LBB15_8 ; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit -; CHECK-NEXT: add.w r10, r10, r2, lsl #1 -; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: add.w r12, r12, r1, lsl #1 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: .LBB15_10: @ %while.end55 -; CHECK-NEXT: ands r1, r9, #3 +; CHECK-NEXT: ands r1, r10, #3 ; CHECK-NEXT: beq .LBB15_12 ; CHECK-NEXT: @ %bb.11: @ %if.then59 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vctp.16 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r10] +; CHECK-NEXT: vstrht.16 q0, [r12] ; CHECK-NEXT: .LBB15_12: @ %if.end61 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1024,15 +1024,15 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: wls lr, r0, .LBB16_4 +; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: wls lr, r7, .LBB16_4 ; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_4: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #8 -; CHECK-NEXT: add.w r0, r5, r0, lsl #1 +; CHECK-NEXT: add.w r0, r0, r7, lsl #1 ; CHECK-NEXT: add.w r5, r0, #8 ; CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: .LBB16_5: @ %while.body @@ -1068,50 +1068,50 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: add.w r0, r5, #14 ; CHECK-NEXT: vfma.f16 q0, q1, r7 ; CHECK-NEXT: vldrw.u32 q1, [r5, #12] -; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vfma.f16 q0, q1, r8 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r5, #16 +; CHECK-NEXT: cmp r7, #16 ; CHECK-NEXT: vfma.f16 q0, q1, lr -; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: blo .LBB16_8 ; CHECK-NEXT: @ %bb.6: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB16_7: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r0, [r6], #16 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: adds r4, r5, #2 -; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: 
vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r6, #-14] -; CHECK-NEXT: adds r4, r5, #6 -; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #-12] -; CHECK-NEXT: vldrw.u32 q1, [r5, #4] -; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r6, #-10] -; CHECK-NEXT: add.w r4, r5, #10 -; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #-8] -; CHECK-NEXT: vldrw.u32 q1, [r5, #8] -; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r6, #-6] -; CHECK-NEXT: ldrh r4, [r6, #-2] -; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #-4] -; CHECK-NEXT: vldrw.u32 q1, [r5, #12] -; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: add.w r0, r5, #14 +; CHECK-NEXT: ldrh r4, [r6], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: adds r5, r0, #2 +; CHECK-NEXT: vfma.f16 q0, q1, r4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: ldrh r4, [r6, #-14] +; CHECK-NEXT: adds r5, r0, #6 +; CHECK-NEXT: vfma.f16 q0, q1, r4 +; CHECK-NEXT: ldrh r4, [r6, #-12] +; CHECK-NEXT: vldrw.u32 q1, [r0, #4] +; CHECK-NEXT: vfma.f16 q0, q1, r4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: ldrh r4, [r6, #-10] +; CHECK-NEXT: add.w r5, r0, #10 +; CHECK-NEXT: vfma.f16 q0, q1, r4 +; CHECK-NEXT: ldrh r4, [r6, #-8] +; CHECK-NEXT: vldrw.u32 q1, [r0, #8] +; CHECK-NEXT: vfma.f16 q0, q1, r4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: ldrh r4, [r6, #-6] +; CHECK-NEXT: ldrh r5, [r6, #-2] +; CHECK-NEXT: vfma.f16 q0, q1, r4 +; CHECK-NEXT: ldrh r4, [r6, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r0, #12] ; CHECK-NEXT: vfma.f16 q0, q1, r4 +; CHECK-NEXT: add.w r4, r0, #14 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vfma.f16 q0, q1, r5 ; CHECK-NEXT: le lr, .LBB16_7 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1 @@ -1119,18 +1119,18 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: .LBB16_10: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r4, [r6], #2 -; CHECK-NEXT: vldrh.u16 q1, [r0], #2 +; CHECK-NEXT: vldrh.u16 q1, [r5], #2 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: le lr, .LBB16_10 ; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r5, r0, lsl #1 +; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r0, r7, lsl #1 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_12: @ %if.end ; CHECK-NEXT: add sp, #24 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index ca6b8c2fffa22c..616e4679be3182 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -747,10 +747,10 @@ define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readon ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldrh.w r10, [r0] +; CHECK-NEXT: ldrh.w r9, [r0] ; CHECK-NEXT: mov r11, r1 ; CHECK-NEXT: ldr.w r12, [r0, #4] -; CHECK-NEXT: sub.w r1, r10, #1 +; CHECK-NEXT: sub.w r1, r9, #1 ; CHECK-NEXT: cmp r1, #3 ; CHECK-NEXT: bhi .LBB15_6 ; CHECK-NEXT: 
@ %bb.1: @ %if.then @@ -763,7 +763,7 @@ define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readon ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph ; CHECK-NEXT: bic r1, r3, #3 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r9, r12, #4 +; CHECK-NEXT: add.w r10, r12, #4 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r1, r11 @@ -771,12 +771,12 @@ define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readon ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vstrb.8 q0, [r4], #16 -; CHECK-NEXT: vldrw.u32 q0, [r9, #-4] -; CHECK-NEXT: vldrw.u32 q1, [r9], #16 +; CHECK-NEXT: vldrw.u32 q0, [r10, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r10], #16 ; CHECK-NEXT: vmul.f32 q0, q0, r7 -; CHECK-NEXT: vldrw.u32 q2, [r9, #-8] +; CHECK-NEXT: vldrw.u32 q2, [r10, #-8] ; CHECK-NEXT: vfma.f32 q0, q1, r6 -; CHECK-NEXT: vldrw.u32 q1, [r9, #-12] +; CHECK-NEXT: vldrw.u32 q1, [r10, #-12] ; CHECK-NEXT: vfma.f32 q0, q1, r5 ; CHECK-NEXT: vfma.f32 q0, q2, r8 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 @@ -805,23 +805,23 @@ define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readon ; CHECK-NEXT: ldr.w r12, [r0, #4] ; CHECK-NEXT: .LBB15_6: @ %if.end ; CHECK-NEXT: add.w r0, r12, r3, lsl #2 -; CHECK-NEXT: lsr.w r1, r10, #2 +; CHECK-NEXT: lsr.w r1, r9, #2 ; CHECK-NEXT: wls lr, r1, .LBB15_10 ; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader -; CHECK-NEXT: bic r2, r10, #3 -; CHECK-NEXT: adds r1, r2, r3 +; CHECK-NEXT: bic r1, r9, #3 +; CHECK-NEXT: adds r2, r1, r3 ; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: add.w r1, r12, r1, lsl #2 +; CHECK-NEXT: add.w r2, r12, r2, lsl #2 ; CHECK-NEXT: .LBB15_8: @ %while.body51 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vstrb.8 q0, [r3], #16 ; CHECK-NEXT: le lr, .LBB15_8 ; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit -; CHECK-NEXT: add.w r12, r12, r2, lsl #2 -; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: add.w r12, r12, r1, lsl #2 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: .LBB15_10: @ %while.end55 -; CHECK-NEXT: ands r1, r10, #3 +; CHECK-NEXT: ands r1, r9, #3 ; CHECK-NEXT: beq .LBB15_12 ; CHECK-NEXT: @ %bb.11: @ %if.then59 ; CHECK-NEXT: vldrw.u32 q0, [r0] @@ -999,7 +999,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: ldrh r6, [r0] ; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: ldrd r4, r10, [r0, #4] +; CHECK-NEXT: ldrd r4, r9, [r0, #4] ; CHECK-NEXT: sub.w r0, r6, #8 ; CHECK-NEXT: add.w r3, r0, r0, lsr #29 ; CHECK-NEXT: and r0, r0, #7 @@ -1008,10 +1008,11 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: it gt ; CHECK-NEXT: asrgt r5, r3, #3 ; CHECK-NEXT: add.w r3, r4, r6, lsl #2 -; CHECK-NEXT: sub.w r9, r3, #4 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: str r3, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: rsbs r3, r6, #0 ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: add.w r3, r10, #32 +; CHECK-NEXT: add.w r3, r9, #32 ; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill @@ -1019,8 +1020,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded 
Reload +; CHECK-NEXT: ldrd r0, r1, [sp, #20] @ 8-byte Folded Reload ; CHECK-NEXT: wls lr, r0, .LBB16_4 ; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_4: @ %while.end @@ -1035,14 +1035,17 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_7 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 -; CHECK-NEXT: add.w lr, r10, #8 +; CHECK-NEXT: add.w lr, r9, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: ldrd r3, r7, [r10] -; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr} -; CHECK-NEXT: ldrd r11, r8, [r10, #24] -; CHECK-NEXT: vstrb.8 q0, [r9], #16 +; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldrd r3, r7, [r9] +; CHECK-NEXT: ldm.w lr, {r0, r5, lr} +; CHECK-NEXT: ldrd r10, r11, [r9, #20] +; CHECK-NEXT: ldr.w r8, [r9, #28] +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: vldrw.u32 q0, [r4], #32 -; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill +; CHECK-NEXT: str r6, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] ; CHECK-NEXT: vmul.f32 q0, q0, r3 ; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] @@ -1053,9 +1056,9 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] ; CHECK-NEXT: vfma.f32 q0, q4, r5 ; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] -; CHECK-NEXT: vfma.f32 q0, q5, r6 +; CHECK-NEXT: vfma.f32 q0, q5, lr ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vfma.f32 q0, q2, lr +; CHECK-NEXT: vfma.f32 q0, q2, r10 ; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: cmp r0, #16 @@ -1069,25 +1072,26 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: .LBB16_7: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11} +; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6} ; CHECK-NEXT: vldrw.u32 q1, [r4], #32 +; CHECK-NEXT: add.w r11, r7, #16 ; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] ; CHECK-NEXT: vldrw.u32 q4, [r4, #-20] ; CHECK-NEXT: vfma.f32 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] +; CHECK-NEXT: ldm.w r11, {r1, r8, r10, r11} ; CHECK-NEXT: vldrw.u32 q5, [r4, #-16] -; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] ; CHECK-NEXT: vfma.f32 q0, q1, r3 -; CHECK-NEXT: ldrd r9, r1, [r7, #24] +; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] ; CHECK-NEXT: vfma.f32 q0, q6, r5 ; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] ; CHECK-NEXT: vfma.f32 q0, q4, r6 ; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] -; CHECK-NEXT: vfma.f32 q0, q5, r8 +; CHECK-NEXT: vfma.f32 q0, q5, r1 ; CHECK-NEXT: adds r7, #32 -; CHECK-NEXT: vfma.f32 q0, q2, r11 -; CHECK-NEXT: vfma.f32 q0, q3, r9 -; CHECK-NEXT: vfma.f32 q0, q1, r1 +; CHECK-NEXT: vfma.f32 q0, q2, r8 +; CHECK-NEXT: vfma.f32 q0, q3, r10 +; CHECK-NEXT: vfma.f32 q0, q1, r11 ; CHECK-NEXT: le lr, .LBB16_7 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1 @@ -1577,23 +1581,23 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: and r7, r3, #3 ; CHECK-NEXT: ldr r0, [r0, #8] ; CHECK-NEXT: lsrs r3, r3, #2 -; CHECK-NEXT: @ implicit-def: $r12 +; CHECK-NEXT: @ implicit-def: $r8 ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: b .LBB19_3 ; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r3, 
r8 -; CHECK-NEXT: mov r7, r5 +; CHECK-NEXT: mov r1, r12 +; CHECK-NEXT: mov r7, r10 ; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r12, r5 ; CHECK-NEXT: .LBB19_2: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: ldrd r2, r6, [sp, #8] @ 8-byte Folded Reload ; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: strd r7, r4, [r9] +; CHECK-NEXT: strd r1, r12, [r9, #8] ; CHECK-NEXT: subs r6, #1 -; CHECK-NEXT: strd r3, r8, [r9, #8] +; CHECK-NEXT: strd r7, r4, [r9] ; CHECK-NEXT: add.w r9, r9, #16 ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: beq.w .LBB19_13 @@ -1602,98 +1606,98 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2 ; CHECK-NEXT: str r6, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: ldrd r5, r11, [r9] -; CHECK-NEXT: ldrd r8, r10, [r9, #8] +; CHECK-NEXT: ldr.w r5, [r9, #12] ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: ldm.w r9, {r10, r11, r12} ; CHECK-NEXT: wls lr, r2, .LBB19_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r7, r10 ; CHECK-NEXT: .LBB19_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r5, [r1, #12] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: ldm.w r1, {r2, r7, r11} -; CHECK-NEXT: vmul.f32 q2, q2, r5 -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vfma.f32 q2, q6, r11 -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vfma.f32 q2, q7, r7 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vfma.f32 q2, q4, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vfma.f32 q2, q5, r3 -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vfma.f32 q2, q3, r4 +; CHECK-NEXT: ldr.w r10, [r1, #12] +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vldrw.u32 q5, [r0, #16] +; CHECK-NEXT: ldm.w r1, {r2, r3, r11} +; CHECK-NEXT: vmul.f32 q7, q7, r10 +; CHECK-NEXT: vldrw.u32 q6, [r0, #32] +; CHECK-NEXT: vfma.f32 q7, q5, r11 +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vfma.f32 q7, q6, r3 +; CHECK-NEXT: vldrw.u32 q4, [r0, #64] +; CHECK-NEXT: vfma.f32 q7, q3, r2 +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vfma.f32 q7, q4, r7 +; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vfma.f32 q7, q1, r4 ; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vfma.f32 q2, q1, r8 +; CHECK-NEXT: vfma.f32 q7, q2, r12 ; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: vfma.f32 q2, q0, r10 +; CHECK-NEXT: vfma.f32 q7, q0, r5 ; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: vmov r10, r8, d5 -; CHECK-NEXT: vstrb.8 q2, [r6], #16 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r12, r5 +; CHECK-NEXT: vmov r5, r12, d15 +; CHECK-NEXT: vstrb.8 q7, [r6], #16 +; CHECK-NEXT: mov r7, r10 +; CHECK-NEXT: mov r8, r10 ; CHECK-NEXT: le lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB19_1 ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldrd lr, r4, [r1] +; CHECK-NEXT: ldrd r7, r2, [r1, #8] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldrd r7, r1, [r1, #8] ; CHECK-NEXT: vldrw.u32 q6, [r0, #16] +; CHECK-NEXT: ldrd lr, r4, [r1] +; 
CHECK-NEXT: vmul.f32 q0, q0, r2 ; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vmul.f32 q0, q0, r1 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] ; CHECK-NEXT: vfma.f32 q0, q6, r7 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] ; CHECK-NEXT: vfma.f32 q0, q7, r4 -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] ; CHECK-NEXT: vfma.f32 q0, q4, lr +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vfma.f32 q0, q5, r10 +; CHECK-NEXT: vldrw.u32 q3, [r0, #96] +; CHECK-NEXT: vfma.f32 q0, q2, r11 ; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; CHECK-NEXT: vfma.f32 q0, q5, r5 -; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: vfma.f32 q0, q3, r11 -; CHECK-NEXT: vfma.f32 q0, q2, r8 -; CHECK-NEXT: vfma.f32 q0, q1, r10 +; CHECK-NEXT: vfma.f32 q0, q3, r12 +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: vfma.f32 q0, q1, r5 ; CHECK-NEXT: vmov r5, s0 ; CHECK-NEXT: bne .LBB19_9 ; CHECK-NEXT: @ %bb.8: @ %if.then58 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: str r5, [r6] ; CHECK-NEXT: mov r7, lr -; CHECK-NEXT: mov r4, r12 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r4, r8 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_9: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov r8, s1 -; CHECK-NEXT: cmp r2, #2 +; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: cmp r3, #2 ; CHECK-NEXT: vstr s1, [r6, #4] ; CHECK-NEXT: str r5, [r6] ; CHECK-NEXT: bne .LBB19_11 ; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: mov r7, r4 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r1, r12 ; CHECK-NEXT: mov r4, lr -; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: mov r12, r5 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_11: @ %if.else64 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vstr s2, [r6, #8] ; CHECK-NEXT: .LBB19_12: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: mov r8, r2 ; CHECK-NEXT: b .LBB19_2 ; CHECK-NEXT: .LBB19_13: @ %do.end ; CHECK-NEXT: add sp, #16 @@ -1909,11 +1913,11 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: b .LBB20_3 ; CHECK-NEXT: .LBB20_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vstr s4, [r6] +; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vstr s12, [r6] ; CHECK-NEXT: .LBB20_2: @ %if.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vstr s6, [r6, #4] +; CHECK-NEXT: vstr s14, [r6, #4] ; CHECK-NEXT: add.w r12, r12, #20 ; CHECK-NEXT: subs r0, #1 ; CHECK-NEXT: add.w r6, r6, #8 @@ -1922,41 +1926,41 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: .LBB20_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB20_5 Depth 2 -; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vldrw.u32 q1, [r12] ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vshlc q4, r5, #32 ; CHECK-NEXT: vldrw.u32 q2, [r12, #8] ; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vshlc q5, r5, #32 -; CHECK-NEXT: vldrw.u32 q1, [r6] -; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vldrw.u32 q3, [r6] +; CHECK-NEXT: vmov.f32 s14, s0 ; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov.f32 s15, s0 ; CHECK-NEXT: wls lr, r8, .LBB20_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; 
CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: .LBB20_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrd r7, r4, [r1], #8 -; CHECK-NEXT: vfma.f32 q6, q3, r7 +; CHECK-NEXT: vfma.f32 q6, q1, r7 ; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: vmov q1, q6 -; CHECK-NEXT: vfma.f32 q1, q2, r7 +; CHECK-NEXT: vmov q3, q6 +; CHECK-NEXT: vfma.f32 q3, q2, r7 ; CHECK-NEXT: vstr s24, [r5] -; CHECK-NEXT: vmov.f32 s7, s0 -; CHECK-NEXT: vfma.f32 q1, q4, r4 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vstr s5, [r5, #4] -; CHECK-NEXT: vfma.f32 q1, q5, r4 +; CHECK-NEXT: vmov.f32 s15, s0 +; CHECK-NEXT: vfma.f32 q3, q4, r4 +; CHECK-NEXT: vmov r4, s13 +; CHECK-NEXT: vstr s13, [r5, #4] +; CHECK-NEXT: vfma.f32 q3, q5, r4 ; CHECK-NEXT: adds r5, #8 -; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vmov.f32 s12, s14 +; CHECK-NEXT: vmov.f32 s13, s15 +; CHECK-NEXT: vmov.f32 s14, s0 +; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: le lr, .LBB20_5 ; CHECK-NEXT: .LBB20_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 @@ -1965,11 +1969,11 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 ; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vfma.f32 q1, q3, r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vstr s4, [r5] -; CHECK-NEXT: vfma.f32 q1, q2, r1 -; CHECK-NEXT: vstr s5, [r6] +; CHECK-NEXT: vfma.f32 q3, q1, r1 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vstr s12, [r5] +; CHECK-NEXT: vfma.f32 q3, q2, r1 +; CHECK-NEXT: vstr s13, [r6] ; CHECK-NEXT: b .LBB20_2 ; CHECK-NEXT: .LBB20_8: @ %do.end ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll index c299b62a4c9429..d2bc667052aa17 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -1092,15 +1092,15 @@ define arm_aapcs_vfpcc <2 x double> @copysign_float64_t(<2 x double> %src1, <2 x ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r0, lr, d2 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: vmov r12, r2, d0 +; CHECK-NEXT: vmov r0, r2, d2 +; CHECK-NEXT: vmov lr, r3, d1 +; CHECK-NEXT: vmov r12, r0, d0 ; CHECK-NEXT: lsrs r1, r1, #31 ; CHECK-NEXT: bfi r3, r1, #31, #1 -; CHECK-NEXT: lsr.w r1, lr, #31 -; CHECK-NEXT: bfi r2, r1, #31, #1 -; CHECK-NEXT: vmov d1, r0, r3 -; CHECK-NEXT: vmov d0, r12, r2 +; CHECK-NEXT: lsrs r1, r2, #31 +; CHECK-NEXT: bfi r0, r1, #31, #1 +; CHECK-NEXT: vmov d1, lr, r3 +; CHECK-NEXT: vmov d0, r12, r0 ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> %src1, <2 x double> %src2) diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll index 94210d795867a0..0486f8529a1940 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll @@ -467,32 +467,32 @@ define arm_aapcs_vfpcc <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK-NEXT: movw r4, #65535 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: vmov.i64 q1, #0xffff -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, r5, d1 -; CHECK-NEXT: subs r1, r1, r4 -; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: csetm r1, 
lt -; CHECK-NEXT: bfi r2, r1, #0, #8 -; CHECK-NEXT: subs r1, r3, r4 -; CHECK-NEXT: sbcs r1, r5, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r2, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: subs r0, r0, r4 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: subs r0, r2, r4 +; CHECK-NEXT: sbcs r0, r3, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: vmov r3, r5, d1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: sbcs.w r1, r0, r2 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: rsbs r2, r3, #0 -; CHECK-NEXT: sbcs.w r2, r0, r5 -; CHECK-NEXT: bfi r0, r1, #0, #8 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r0, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r5, r1 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: sbcs.w r1, r5, r3 +; CHECK-NEXT: bfi r5, r0, #0, #8 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -615,43 +615,43 @@ entry: define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-LABEL: ustest_f16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmovx.f16 s12, s2 +; CHECK-NEXT: vmovx.f16 s10, s3 +; CHECK-NEXT: vcvt.s32.f16 s14, s3 +; CHECK-NEXT: vcvt.s32.f16 s2, s2 +; CHECK-NEXT: vcvt.s32.f16 s10, s10 +; CHECK-NEXT: vcvt.s32.f16 s12, s12 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vcvt.s32.f16 s10, s0 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vcvt.s32.f16 s5, s3 -; CHECK-NEXT: vcvt.s32.f16 s12, s0 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vcvt.s32.f16 s7, s2 -; CHECK-NEXT: vcvt.s32.f16 s14, s0 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r1 ; CHECK-NEXT: vcvt.s32.f16 s8, s1 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vcvt.s32.f16 s0, s0 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vcvt.s32.f16 s4, s4 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r1 ; CHECK-NEXT: vcvt.s32.f16 s6, s6 ; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmax.s32 q3, q4, q0 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmax.s32 q3, q5, q4 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 ; CHECK-NEXT: vstrh.32 q3, [r0, #8] -; CHECK-NEXT: vmax.s32 q0, q2, q0 +; CHECK-NEXT: vmax.s32 q0, q0, q4 ; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %conv 
= fptosi <8 x half> %x to <8 x i32> @@ -851,7 +851,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f32i64(<2 x float> %x) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vmov r9, r0, d0 +; CHECK-NEXT: vmov r8, r0, d0 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: subs.w r7, r0, #-1 ; CHECK-NEXT: mvn r10, #-2147483648 @@ -871,9 +871,9 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f32i64(<2 x float> %x) { ; CHECK-NEXT: sbcs.w r0, r4, r2 ; CHECK-NEXT: sbcs.w r0, r4, r3 ; CHECK-NEXT: cset r6, lt -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r8, r1, r11, ne +; CHECK-NEXT: csel r9, r1, r11, ne ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: subs.w r7, r0, #-1 ; CHECK-NEXT: sbcs.w r7, r1, r10 @@ -897,7 +897,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f32i64(<2 x float> %x) { ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csel r0, r0, r2, ne ; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r8 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r9 ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: @@ -1617,32 +1617,32 @@ define arm_aapcs_vfpcc <2 x i16> @ustest_f64i16_mm(<2 x double> %x) { ; CHECK-NEXT: movw r4, #65535 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: vmov.i64 q1, #0xffff -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, r5, d1 -; CHECK-NEXT: subs r1, r1, r4 -; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r2, r1, #0, #8 -; CHECK-NEXT: subs r1, r3, r4 -; CHECK-NEXT: sbcs r1, r5, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r2, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: subs r0, r0, r4 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: subs r0, r2, r4 +; CHECK-NEXT: sbcs r0, r3, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: vmov r3, r5, d1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: sbcs.w r1, r0, r2 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: rsbs r2, r3, #0 -; CHECK-NEXT: sbcs.w r2, r0, r5 -; CHECK-NEXT: bfi r0, r1, #0, #8 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r0, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r5, r1 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: sbcs.w r1, r5, r3 +; CHECK-NEXT: bfi r5, r0, #0, #8 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -1755,43 +1755,43 @@ entry: define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: ustest_f16i16_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmovx.f16 s12, s2 +; CHECK-NEXT: vmovx.f16 s10, s3 +; CHECK-NEXT: vcvt.s32.f16 s14, s3 +; CHECK-NEXT: vcvt.s32.f16 s2, s2 +; CHECK-NEXT: vcvt.s32.f16 s10, s10 +; CHECK-NEXT: vcvt.s32.f16 s12, s12 +; 
CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vcvt.s32.f16 s10, s0 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vcvt.s32.f16 s5, s3 -; CHECK-NEXT: vcvt.s32.f16 s12, s0 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vcvt.s32.f16 s7, s2 -; CHECK-NEXT: vcvt.s32.f16 s14, s0 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r1 ; CHECK-NEXT: vcvt.s32.f16 s8, s1 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vcvt.s32.f16 s0, s0 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vcvt.s32.f16 s4, s4 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r1 ; CHECK-NEXT: vcvt.s32.f16 s6, s6 ; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmax.s32 q3, q4, q0 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmax.s32 q3, q5, q4 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 ; CHECK-NEXT: vstrh.32 q3, [r0, #8] -; CHECK-NEXT: vmax.s32 q0, q2, q0 +; CHECK-NEXT: vmax.s32 q0, q0, q4 ; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %conv = fptosi <8 x half> %x to <8 x i32> @@ -1973,7 +1973,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f32i64_mm(<2 x float> %x) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vmov r9, r0, d0 +; CHECK-NEXT: vmov r8, r0, d0 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: subs.w r7, r0, #-1 ; CHECK-NEXT: mvn r10, #-2147483648 @@ -1993,9 +1993,9 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f32i64_mm(<2 x float> %x) { ; CHECK-NEXT: sbcs.w r0, r4, r2 ; CHECK-NEXT: sbcs.w r0, r4, r3 ; CHECK-NEXT: cset r6, lt -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: csel r8, r1, r11, ne +; CHECK-NEXT: csel r9, r1, r11, ne ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: subs.w r7, r0, #-1 ; CHECK-NEXT: sbcs.w r7, r1, r10 @@ -2019,7 +2019,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f32i64_mm(<2 x float> %x) { ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csel r0, r0, r2, ne ; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r8 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r9 ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: @@ -2072,19 +2072,19 @@ define arm_aapcs_vfpcc <2 x i64> @ustest_f32i64_mm(<2 x float> %x) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vmov r5, r0, d0 +; CHECK-NEXT: vmov r4, r0, d0 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: mov r8, r1 ; CHECK-NEXT: subs r1, r2, #1 ; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: cset r7, lt -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r6, r0, r7, ne -; CHECK-NEXT: csel r4, r3, r7, ne -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: cset r6, lt +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csel r7, r0, r6, ne +; CHECK-NEXT: csel r5, r3, r6, ne +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it mi -; CHECK-NEXT: movmi r6, #0 +; CHECK-NEXT: 
movmi r7, #0 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: subs r2, #1 ; CHECK-NEXT: sbcs r2, r3, #0 @@ -2095,18 +2095,18 @@ define arm_aapcs_vfpcc <2 x i64> @ustest_f32i64_mm(<2 x float> %x) { ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it mi ; CHECK-NEXT: movmi r0, #0 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 -; CHECK-NEXT: csel r7, r8, r7, ne -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r7 +; CHECK-NEXT: csel r6, r8, r6, ne +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it mi -; CHECK-NEXT: movmi r7, #0 +; CHECK-NEXT: movmi r6, #0 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csel r1, r1, r2, ne ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it mi ; CHECK-NEXT: movmi r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r7 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r6 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %conv = fptosi <2 x float> %x to <2 x i128> diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll index 3ca01cfa3a8f77..c4088efb46acab 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll @@ -35,23 +35,23 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f32_v2i32(<2 x float> %f) { ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov r0, s17 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vldr s18, .LCPI1_0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vldr s20, .LCPI1_1 ; CHECK-NEXT: vcmp.f32 s17, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r5, #-2147483648 +; CHECK-NEXT: movlt.w r4, #-2147483648 ; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: mvngt r5, #-2147483648 +; CHECK-NEXT: mvngt r4, #-2147483648 ; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: movvs r4, #0 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcmp.f32 s16, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -69,20 +69,20 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f32_v2i32(<2 x float> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r4, #-1 +; CHECK-NEXT: movlt.w r5, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r4, #0 +; CHECK-NEXT: movgt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s18 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: movvs r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r1, #-1 ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt r1, #0 @@ -90,7 +90,7 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f32_v2i32(<2 x float> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 @@ -363,25 +363,25 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f64_v2i32(<2 x double> %f) { ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r3, 
r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: vldr d0, .LCPI9_1 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: lsrs r4, r0, #5 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: vmov r6, r10, d0 ; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r0, #-2147483648 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #-2147483648 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne @@ -393,76 +393,77 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f64_v2i32(<2 x double> %f) { ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r11, #-2147483648 +; CHECK-NEXT: mvnne r4, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: vmov r10, r7, d8 ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r8, #1 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmov r5, r4, d8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: lsr.w r9, r0, #5 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r5, #-2147483648 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: movne.w r7, #-2147483648 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r5, #-2147483648 +; CHECK-NEXT: mvnne r7, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r5, r11 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-1 +; CHECK-NEXT: movne.w r11, #-1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r2, #0 +; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r2, #0 +; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; 
CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: movne.w r10, #-1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r2 +; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r7, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r10, r11 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -494,12 +495,13 @@ define arm_aapcs_vfpcc <3 x i32> @test_signed_v3f64_v3i32(<3 x double> %f) { ; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vldr d0, .LCPI10_0 ; CHECK-NEXT: vmov r4, r6, d1 -; CHECK-NEXT: vmov r2, r11, d0 +; CHECK-NEXT: vmov r8, r11, d0 ; CHECK-NEXT: vmov.f32 s18, s4 ; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str.w r8, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: str.w r11, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpgt @@ -507,10 +509,8 @@ define arm_aapcs_vfpcc <3 x i32> @test_signed_v3f64_v3i32(<3 x double> %f) { ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: vmov r2, r8, d0 -; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: str.w r8, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: strd r2, r3, [sp, #16] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: mov r0, r4 @@ -531,18 +531,17 @@ define arm_aapcs_vfpcc <3 x i32> @test_signed_v3f64_v3i32(<3 x double> %f) { ; CHECK-NEXT: mvnne r10, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #16] @ 8-byte Folded Reload ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r5 @@ -564,14 +563,12 @@ define arm_aapcs_vfpcc <3 x i32> @test_signed_v3f64_v3i32(<3 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #16] @ 8-byte Folded Reload ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpge @@ -663,12 +660,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_signed_v4f64_v4i32(<4 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: ldr.w r8, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: str r4, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: 
str.w r9, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload @@ -678,12 +675,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_signed_v4f64_v4i32(<4 x double> %f) { ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r6, #-2147483648 ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload @@ -698,32 +695,32 @@ define arm_aapcs_vfpcc <4 x i32> @test_signed_v4f64_v4i32(<4 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr.w r10, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r8, #-2147483648 ; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: vmov r7, r4, d9 ; CHECK-NEXT: it ne ; CHECK-NEXT: mvnne r8, #-2147483648 @@ -806,7 +803,7 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) { ; CHECK-NEXT: vmov.f32 s23, s3 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: strd r2, r3, [sp, #20] @ 8-byte Folded Spill +; CHECK-NEXT: strd r3, r2, [sp, #20] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI12_1 ; CHECK-NEXT: mov r1, r4 @@ -841,9 +838,9 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) { ; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: str.w r11, [r7, #16] ; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: ldr.w r10, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr.w r10, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpgt @@ -880,9 +877,10 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) { ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r9, r7 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: mov r2, r4 @@ -897,11 +895,12 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) { ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r10, #-2147483648 -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r8 ; 
CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: vmov r11, r4, d9 ; CHECK-NEXT: it ne ; CHECK-NEXT: mvnne r10, #-2147483648 @@ -909,9 +908,10 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: ldrd r2, r3, [sp, #20] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r8, r0 @@ -939,7 +939,7 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: ldrd r2, r3, [sp, #20] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r3, r2, [sp, #20] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpgt @@ -1001,11 +1001,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: .pad #40 ; CHECK-NEXT: sub sp, #40 ; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vldr d0, .LCPI13_0 ; CHECK-NEXT: vmov r9, r4, d5 -; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: vmov r6, r3, d0 ; CHECK-NEXT: vmov.f32 s22, s8 ; CHECK-NEXT: vmov.f32 s20, s6 ; CHECK-NEXT: vmov.f32 s18, s4 @@ -1014,11 +1014,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: vmov.f32 s21, s7 ; CHECK-NEXT: vmov.f32 s19, s5 ; CHECK-NEXT: vmov.f32 s25, s3 -; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: str r6, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: str r6, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI13_1 ; CHECK-NEXT: mov r1, r4 @@ -1052,25 +1052,25 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: ldr.w r11, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: str.w r10, [r11, #20] -; CHECK-NEXT: ldr.w r10, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: str.w r10, [r4, #20] +; CHECK-NEXT: ldr.w r11, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: ldrd r2, r3, [sp, #32] @ 8-byte Folded Reload ; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r6, #-2147483648 ; CHECK-NEXT: mov r0, r7 @@ -1084,21 +1084,21 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: str.w r6, [r11, #16] +; CHECK-NEXT: str r6, [r4, #16] ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: ldr.w r11, 
[sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r7, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: ldr r5, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: ldr.w r9, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: ldr r7, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: mov r0, r8 @@ -1108,11 +1108,12 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r10, #-2147483648 -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: mvnne r10, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun @@ -1120,32 +1121,34 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r5, r9 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: mov r9, r6 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: cmp.w r11, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r8, #-2147483648 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r1, r9 ; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: vmov r7, r6, d9 ; CHECK-NEXT: it ne ; CHECK-NEXT: mvnne r8, #-2147483648 @@ -1153,16 +1156,17 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r8, #0 -; CHECK-NEXT: ldr.w r11, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr.w r11, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldrd r2, r3, [sp, #32] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r7 @@ -1214,7 +1218,7 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; 
CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: vmov q0[3], q0[1], r8, r10 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: add sp, #40 @@ -1268,23 +1272,23 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f16_v2i32(<2 x half> %f) { ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcvtb.f32.f16 s16, s16 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vldr s20, .LCPI15_0 ; CHECK-NEXT: vldr s22, .LCPI15_1 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r5, #-2147483648 +; CHECK-NEXT: movlt.w r4, #-2147483648 ; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: mvngt r5, #-2147483648 +; CHECK-NEXT: mvngt r4, #-2147483648 ; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: movvs r4, #0 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -1302,20 +1306,20 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f16_v2i32(<2 x half> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r4, #-1 +; CHECK-NEXT: movlt.w r5, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r4, #0 +; CHECK-NEXT: movgt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: movvs r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r1, #-1 ; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt r1, #0 @@ -1323,7 +1327,7 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f16_v2i32(<2 x half> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 @@ -1982,8 +1986,10 @@ define arm_aapcs_vfpcc <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) { define arm_aapcs_vfpcc <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) { ; CHECK-LABEL: test_signed_v4f32_v4i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 @@ -1992,7 +1998,7 @@ define arm_aapcs_vfpcc <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) { ; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vldr s22, .LCPI29_0 -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: mov r11, r1 ; CHECK-NEXT: vldr s20, .LCPI29_1 ; CHECK-NEXT: vmov r8, s16 ; CHECK-NEXT: vcmp.f32 s19, s22 @@ -2005,68 +2011,68 @@ define arm_aapcs_vfpcc <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) { ; CHECK-NEXT: movgt.w r10, #-1 ; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov r9, s17 ; 
CHECK-NEXT: it vs ; CHECK-NEXT: movvs.w r10, #0 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r7, #-1 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: movvs r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r9, #-2147483648 +; CHECK-NEXT: movlt.w r11, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: mvngt r9, #-2147483648 +; CHECK-NEXT: mvngt r11, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: mov r7, r1 ; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r9, #0 +; CHECK-NEXT: movvs.w r11, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r6, #-2147483648 +; CHECK-NEXT: movlt.w r7, #-2147483648 ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: it gt -; CHECK-NEXT: mvngt r6, #-2147483648 +; CHECK-NEXT: mvngt r7, #-2147483648 ; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r6, #0 +; CHECK-NEXT: movvs r7, #0 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r5, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: vcmp.f32 s17, s17 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: movvs r4, #0 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: vmov q1[2], q1[0], r7, r10 +; CHECK-NEXT: vmov q1[2], q1[0], r6, r10 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt @@ -2082,31 +2088,32 @@ define arm_aapcs_vfpcc <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r4, #-2147483648 +; CHECK-NEXT: movlt.w r5, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: mvngt r4, #-2147483648 +; CHECK-NEXT: mvngt r5, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: movvs r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r1, #-2147483648 ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: mvngt r1, #-2147483648 ; CHECK-NEXT: vcmp.f32 s16, s16 -; CHECK-NEXT: vmov q1[3], q1[1], r6, r9 +; CHECK-NEXT: vmov q1[3], q1[1], r7, r11 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: 
vmov q0[3], q0[1], r1, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI29_0: @@ -2650,34 +2657,34 @@ define arm_aapcs_vfpcc <2 x i1> @test_signed_v2f64_v2i1(<2 x double> %f) { ; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI32_0 -; CHECK-NEXT: vmov r8, r7, d8 +; CHECK-NEXT: vmov r8, r6, d8 ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: strd r2, r3, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI32_1 ; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: strd r2, r3, [sp, #4] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_d2iz ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r11, #-1 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: vmov r6, r5, d9 +; CHECK-NEXT: vmov r7, r5, d9 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: bl __aeabi_dcmpun @@ -2689,34 +2696,34 @@ define arm_aapcs_vfpcc <2 x i1> @test_signed_v2f64_v2i1(<2 x double> %f) { ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: bfi r4, r0, #0, #1 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: ldrd r2, r3, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_d2iz -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r7, #-1 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: moveq.w r6, #-1 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: movne r6, #0 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: and r0, r7, #1 +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: and r0, r6, #1 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: bfi r4, r0, #1, #1 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload @@ -2754,25 +2761,25 @@ define arm_aapcs_vfpcc <2 x i8> @test_signed_v2f64_v2i8(<2 x double> %f) { ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r3, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: vldr d0, .LCPI33_1 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: lsrs r4, r0, #5 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: vmov r6, r10, d0 ; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill ; 
CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r0, #127 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r11, #127 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne @@ -2784,76 +2791,77 @@ define arm_aapcs_vfpcc <2 x i8> @test_signed_v2f64_v2i8(<2 x double> %f) { ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #127 +; CHECK-NEXT: movne r4, #127 ; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: vmov r10, r7, d8 ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r8, #1 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmov r5, r4, d8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: lsr.w r9, r0, #5 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r5, #127 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mvnne r7, #127 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #127 +; CHECK-NEXT: movne r7, #127 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r5, r11 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-1 +; CHECK-NEXT: movne.w r11, #-1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r2, #0 +; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r2, #0 +; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: movne.w r10, #-1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r2 +; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], 
q0[0], r7, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r10, r11 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -2887,14 +2895,14 @@ define arm_aapcs_vfpcc <2 x i13> @test_signed_v2f64_v2i13(<2 x double> %f) { ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r3, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: vldr d0, .LCPI34_1 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: lsrs r4, r0, #5 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: vmov r6, r10, d0 ; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: cmp r4, #0 @@ -2905,7 +2913,7 @@ define arm_aapcs_vfpcc <2 x i13> @test_signed_v2f64_v2i13(<2 x double> %f) { ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne @@ -2919,56 +2927,56 @@ define arm_aapcs_vfpcc <2 x i13> @test_signed_v2f64_v2i13(<2 x double> %f) { ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 ; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: vmov r10, r7, d8 ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r8, #1 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 ; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: vmov r5, r4, d8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: lsr.w r9, r0, #5 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: movne.w r7, #-1 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: itt ne -; CHECK-NEXT: movwne r5, #61440 -; CHECK-NEXT: movtne r5, #65535 +; CHECK-NEXT: movwne r10, #61440 +; CHECK-NEXT: movtne r10, #65535 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: itt ne @@ -2983,13 +2991,13 @@ define arm_aapcs_vfpcc <2 x i13> @test_signed_v2f64_v2i13(<2 x double> %f) { ; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movwne r5, #4095 +; CHECK-NEXT: 
movwne r10, #4095 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r5, r11 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r10, r11 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r0 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3023,14 +3031,14 @@ define arm_aapcs_vfpcc <2 x i16> @test_signed_v2f64_v2i16(<2 x double> %f) { ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r3, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: vldr d0, .LCPI35_1 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: lsrs r4, r0, #5 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: vmov r6, r10, d0 ; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: cmp r4, #0 @@ -3041,7 +3049,7 @@ define arm_aapcs_vfpcc <2 x i16> @test_signed_v2f64_v2i16(<2 x double> %f) { ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne @@ -3055,56 +3063,56 @@ define arm_aapcs_vfpcc <2 x i16> @test_signed_v2f64_v2i16(<2 x double> %f) { ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 ; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: vmov r10, r7, d8 ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r8, #1 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 ; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: vmov r5, r4, d8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: lsr.w r9, r0, #5 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: movne.w r7, #-1 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: itt ne -; CHECK-NEXT: movwne r5, #32768 -; CHECK-NEXT: movtne r5, #65535 +; CHECK-NEXT: movwne r10, #32768 +; CHECK-NEXT: movtne r10, #65535 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte 
Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: itt ne @@ -3119,13 +3127,13 @@ define arm_aapcs_vfpcc <2 x i16> @test_signed_v2f64_v2i16(<2 x double> %f) { ; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movwne r5, #32767 +; CHECK-NEXT: movwne r10, #32767 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r5, r11 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r10, r11 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r0 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3157,30 +3165,32 @@ define arm_aapcs_vfpcc <2 x i19> @test_signed_v2f64_v2i19(<2 x double> %f) { ; CHECK-NEXT: vldr d0, .LCPI36_0 ; CHECK-NEXT: vmov r7, r6, d9 ; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: vldr d0, .LCPI36_1 -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: vmov r8, r0, d8 -; CHECK-NEXT: vmov r11, r10, d0 -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: clz r0, r4 -; CHECK-NEXT: lsrs r0, r0, #5 -; CHECK-NEXT: ittt ne -; CHECK-NEXT: movwne r9, #0 -; CHECK-NEXT: movtne r9, #65532 -; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: vmov r5, r1, d8 +; CHECK-NEXT: vmov r8, r11, d0 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: clz r1, r4 +; CHECK-NEXT: lsrs r1, r1, #5 +; CHECK-NEXT: itt ne +; CHECK-NEXT: movne r0, #0 +; CHECK-NEXT: movtne r0, #65532 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r9, #-1 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: cmp r0, #0 @@ -3192,7 +3202,7 @@ define arm_aapcs_vfpcc <2 x i19> @test_signed_v2f64_v2i19(<2 x double> %f) { ; CHECK-NEXT: movne r4, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne.w r9, #0 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 @@ -3200,68 +3210,71 @@ define arm_aapcs_vfpcc <2 x i19> @test_signed_v2f64_v2i19(<2 x double> %f) { ; CHECK-NEXT: movne r6, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: itt ne -; CHECK-NEXT: movwne r9, #65535 -; CHECK-NEXT: movtne r9, #3 +; CHECK-NEXT: movwne r10, #65535 +; CHECK-NEXT: movtne r10, #3 +; CHECK-NEXT: str.w r10, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 -; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: movne.w r9, #0 +; CHECK-NEXT: mov r7, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #1 -; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov 
r0, r8 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: movne r4, #1 +; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r11, r4 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r8, r7 +; CHECK-NEXT: mov r11, r5 ; CHECK-NEXT: lsr.w r10, r0, #5 -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: itt ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: movtne r4, #65532 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: movtne r7, #65532 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: itt ne -; CHECK-NEXT: movwne r4, #65535 -; CHECK-NEXT: movtne r4, #3 +; CHECK-NEXT: movwne r7, #65535 +; CHECK-NEXT: movtne r7, #3 ; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 +; CHECK-NEXT: movne.w r5, #-1 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r9, #0 +; CHECK-NEXT: movne r1, #0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r4, r9 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r0 +; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r7, r1 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r9 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3295,25 +3308,25 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f64_v2i32_duplicate(<2 x double> ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r3, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: vldr d0, .LCPI37_1 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: lsrs r4, r0, #5 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: vmov r6, r10, d0 ; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r0, #-2147483648 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #-2147483648 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne @@ -3325,76 +3338,77 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f64_v2i32_duplicate(<2 x double> ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r11, #-2147483648 +; CHECK-NEXT: mvnne r4, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: vmov r10, r7, d8 ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: cmp 
r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r8, #1 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmov r5, r4, d8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: lsr.w r9, r0, #5 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r5, #-2147483648 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: movne.w r7, #-2147483648 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r5, #-2147483648 +; CHECK-NEXT: mvnne r7, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r5, r11 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-1 +; CHECK-NEXT: movne.w r11, #-1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r2, #0 +; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r2, #0 +; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: movne.w r10, #-1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r2 +; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r7, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r10, r11 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3426,9 +3440,10 @@ define arm_aapcs_vfpcc <2 x i50> @test_signed_v2f64_v2i50(<2 x double> %f) { ; CHECK-NEXT: vldr d0, .LCPI38_0 ; CHECK-NEXT: vmov r7, r6, d9 ; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r7 @@ -3436,19 +3451,19 @@ define arm_aapcs_vfpcc <2 x i50> @test_signed_v2f64_v2i50(<2 x double> %f) { ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: vldr d0, .LCPI38_1 ; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: 
vmov r8, r0, d8 -; CHECK-NEXT: vmov r11, r10, d0 +; CHECK-NEXT: vmov r5, r0, d8 +; CHECK-NEXT: vmov r8, r11, d0 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: clz r0, r4 ; CHECK-NEXT: lsrs r0, r0, #5 ; CHECK-NEXT: itt ne ; CHECK-NEXT: movne r1, #0 ; CHECK-NEXT: movtne r1, #65534 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r9, #0 ; CHECK-NEXT: bl __aeabi_dcmpgt @@ -3470,68 +3485,71 @@ define arm_aapcs_vfpcc <2 x i50> @test_signed_v2f64_v2i50(<2 x double> %f) { ; CHECK-NEXT: movne r6, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: itt ne -; CHECK-NEXT: movwne r5, #65535 -; CHECK-NEXT: movtne r5, #1 -; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: movwne r10, #65535 +; CHECK-NEXT: movtne r10, #1 +; CHECK-NEXT: str.w r10, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r9, #0 -; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r7, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #1 -; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: movne r4, #1 +; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r11, r4 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r8, r7 +; CHECK-NEXT: mov r11, r5 ; CHECK-NEXT: lsr.w r10, r0, #5 -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: itt ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: movtne r7, #65534 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movtne r5, #65534 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: itt ne -; CHECK-NEXT: movwne r7, #65535 -; CHECK-NEXT: movtne r7, #1 +; CHECK-NEXT: movwne r5, #65535 +; CHECK-NEXT: movtne r5, #1 ; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: movne.w r7, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r4, r9 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r7, r9 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r1 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: vmov 
q0[3], q0[1], r5, r1 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3565,25 +3583,25 @@ define arm_aapcs_vfpcc <2 x i64> @test_signed_v2f64_v2i64(<2 x double> %f) { ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r3, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: vldr d0, .LCPI39_1 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: lsrs r4, r0, #5 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: vmov r6, r10, d0 ; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #0 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #0 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne @@ -3595,76 +3613,77 @@ define arm_aapcs_vfpcc <2 x i64> @test_signed_v2f64_v2i64(<2 x double> %f) { ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #-1 +; CHECK-NEXT: movne.w r4, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: vmov r10, r7, d8 ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r8, #1 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmov r5, r4, d8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: lsr.w r9, r0, #5 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r5, #-1 +; CHECK-NEXT: movne.w r7, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r5, r11 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-2147483648 +; CHECK-NEXT: movne.w r11, 
#-2147483648 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r2, #-2147483648 +; CHECK-NEXT: mvnne r11, #-2147483648 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r2, #0 +; CHECK-NEXT: movne.w r11, #0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-2147483648 +; CHECK-NEXT: movne.w r10, #-2147483648 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r4, #-2147483648 +; CHECK-NEXT: mvnne r10, #-2147483648 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r2 +; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r7, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r10, r11 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3694,263 +3713,262 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI40_0 -; CHECK-NEXT: vmov r6, r5, d8 -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: vmov r9, r8, d0 -; CHECK-NEXT: str.w r8, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: vmov r4, r6, d8 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: vmov r8, r3, d0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r11, r3 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI40_1 ; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: vmov r7, r3, d0 -; CHECK-NEXT: str r3, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: vmov r9, r3, d0 +; CHECK-NEXT: str r3, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __fixdfti -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill -; CHECK-NEXT: csel r4, r2, r4, ne -; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: strd r1, r0, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: csel r7, r2, r7, ne +; CHECK-NEXT: str r3, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: movne.w r7, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: str.w r11, [sp, #44] @ 4-byte Spill -; CHECK-NEXT: str.w r4, [r11, #8] -; CHECK-NEXT: str.w r9, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: str r7, [r5, #8] +; CHECK-NEXT: str r5, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: str.w r8, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r4, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: 
mov r1, r5 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r10, r7 -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: csel r7, r1, r0, ne -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r7, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: str.w r7, [r11, #4] -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: ldr.w r11, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: str r7, [r5, #4] +; CHECK-NEXT: str.w r11, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: str.w r10, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: ldr.w r10, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: csel r7, r1, r0, ne -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r7, #-1 -; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: str r5, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r6, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: vmov r9, r8, d9 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: ldr r0, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #40] @ 4-byte Reload ; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: mov r5, r11 ; CHECK-NEXT: str r7, [r0] -; CHECK-NEXT: ldr r7, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r4, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r6, r5 +; CHECK-NEXT: mov r4, r10 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __fixdfti +; CHECK-NEXT: strd r2, r0, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: strd r2, r0, [sp, #4] @ 8-byte Folded Spill -; CHECK-NEXT: csel r10, r1, r11, ne -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte 
Spill -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: csel r11, r1, r11, ne +; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #-1 +; CHECK-NEXT: movne.w r11, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r11, #0 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r10, r7 +; CHECK-NEXT: ldr r7, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r6, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: csel r4, r1, r0, ne ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: cmp.w r11, #0 +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r4, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: lsr.w r0, r10, #28 +; CHECK-NEXT: ldr r5, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: lsr.w r0, r11, #28 ; CHECK-NEXT: orr.w r0, r0, r4, lsl #4 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r7, r5 -; CHECK-NEXT: str r0, [r1, #20] -; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: str r0, [r5, #20] ; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: ldr r3, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: csel r11, r1, r0, ne -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: csel r7, r1, r0, ne +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #-1 +; CHECK-NEXT: movne.w r7, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: ldr r5, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: lsr.w r0, r11, #28 -; CHECK-NEXT: orr.w r0, r0, r10, lsl #4 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: lsrs r0, r7, #28 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: orr.w r0, r0, r11, lsl #4 ; CHECK-NEXT: str r0, [r5, #16] +; CHECK-NEXT: ldr.w r10, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; 
CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: mov r10, r6 +; CHECK-NEXT: mov r2, r6 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: it eq ; CHECK-NEXT: mvneq r0, #7 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp.w r11, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #7 -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: lsr.w r0, r4, #28 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: orr.w r0, r0, r6, lsl #4 -; CHECK-NEXT: strb r0, [r5, #24] -; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: ldr.w r8, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: lsrs r0, r4, #28 +; CHECK-NEXT: orr.w r0, r0, r5, lsl #4 ; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r6, r4 +; CHECK-NEXT: strb.w r0, [r8, #24] +; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: it eq ; CHECK-NEXT: mvneq r0, #7 -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #7 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 ; CHECK-NEXT: and r0, r4, #15 -; CHECK-NEXT: orr.w r0, r0, r11, lsl #4 -; CHECK-NEXT: str r0, [r5, #12] +; CHECK-NEXT: orr.w r0, r0, r7, lsl #4 +; CHECK-NEXT: str.w r0, [r8, #12] ; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3981,20 +3999,20 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI41_0 ; CHECK-NEXT: vmov r8, r7, d9 -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r11, r3 +; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: bl 
__aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI41_1 ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: vmov r4, r3, d0 -; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r9, r0 @@ -4002,8 +4020,8 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: mov r10, r3 -; CHECK-NEXT: strd r2, r1, [sp] @ 8-byte Folded Spill -; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r10, #-2147483648 @@ -4018,23 +4036,20 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: str.w r10, [r6, #28] +; CHECK-NEXT: str.w r10, [r11, #28] ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: mov r5, r11 -; CHECK-NEXT: str.w r11, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr.w r10, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr.w r10, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r11, r4 ; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 @@ -4048,22 +4063,24 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: movne.w r4, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: str r4, [r6, #24] +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: mov r6, r11 +; CHECK-NEXT: str.w r4, [r11, #24] +; CHECK-NEXT: mov r11, r9 +; CHECK-NEXT: mov r9, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r7 @@ -4075,22 +4092,24 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: movne.w r4, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 ; CHECK-NEXT: str r4, [r6, #20] -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: ldr.w r10, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r11, r6 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r10, r6 +; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bl 
__aeabi_dcmpgt -; CHECK-NEXT: ldrd r2, r3, [sp, #20] @ 8-byte Folded Reload +; CHECK-NEXT: ldr.w r11, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r7 @@ -4105,28 +4124,28 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: str.w r4, [r11, #16] +; CHECK-NEXT: str.w r4, [r10, #16] ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: ldr r7, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr.w r8, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: ldr.w r8, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r9, r11 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: strd r2, r1, [sp] @ 8-byte Folded Spill -; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: cmp.w r11, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: moveq.w r4, #-2147483648 @@ -4141,19 +4160,19 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: ldr.w r10, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r11, r8 ; CHECK-NEXT: str.w r4, [r10, #12] -; CHECK-NEXT: ldr.w r11, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 @@ -4171,17 +4190,21 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: str.w r7, [r10, #8] ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r10, r11 +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: ldr.w r11, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: 
cmp r0, #0 ; CHECK-NEXT: mov r2, r6 ; CHECK-NEXT: mov r3, r5 @@ -4193,21 +4216,21 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: movne.w r7, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: str.w r7, [r10, #4] ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: str.w r7, [r8, #4] ; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r2, r6 ; CHECK-NEXT: mov r3, r5 @@ -4221,7 +4244,7 @@ define arm_aapcs_vfpcc <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: str.w r7, [r10] +; CHECK-NEXT: str.w r7, [r8] ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -4258,31 +4281,31 @@ define arm_aapcs_vfpcc <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) { ; CHECK-NEXT: .vsave {d8} ; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vcvtb.f32.f16 s15, s0 -; CHECK-NEXT: vmov.f32 s5, #-1.000000e+00 -; CHECK-NEXT: vldr s7, .LCPI42_0 -; CHECK-NEXT: vmaxnm.f32 s16, s15, s5 -; CHECK-NEXT: vcvtt.f32.f16 s12, s2 +; CHECK-NEXT: vmov.f32 s12, #-1.000000e+00 +; CHECK-NEXT: vldr s14, .LCPI42_0 +; CHECK-NEXT: vmaxnm.f32 s16, s15, s12 ; CHECK-NEXT: vcvtt.f32.f16 s9, s1 -; CHECK-NEXT: vminnm.f32 s16, s16, s7 ; CHECK-NEXT: vcvtt.f32.f16 s4, s3 -; CHECK-NEXT: vcvt.s32.f32 s16, s16 +; CHECK-NEXT: vminnm.f32 s16, s16, s14 ; CHECK-NEXT: vcvtb.f32.f16 s8, s3 +; CHECK-NEXT: vcvt.s32.f32 s16, s16 +; CHECK-NEXT: vcvtt.f32.f16 s3, s2 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-NEXT: vcvtt.f32.f16 s0, s0 -; CHECK-NEXT: vmaxnm.f32 s6, s4, s5 -; CHECK-NEXT: vmaxnm.f32 s10, s8, s5 -; CHECK-NEXT: vmaxnm.f32 s14, s12, s5 -; CHECK-NEXT: vmaxnm.f32 s3, s2, s5 -; CHECK-NEXT: vmaxnm.f32 s11, s9, s5 -; CHECK-NEXT: vmaxnm.f32 s13, s1, s5 -; CHECK-NEXT: vmaxnm.f32 s5, s0, s5 -; CHECK-NEXT: vminnm.f32 s5, s5, s7 -; CHECK-NEXT: vminnm.f32 s13, s13, s7 -; CHECK-NEXT: vcvt.s32.f32 s5, s5 +; CHECK-NEXT: vmaxnm.f32 s6, s4, s12 +; CHECK-NEXT: vmaxnm.f32 s10, s8, s12 +; CHECK-NEXT: vmaxnm.f32 s5, s3, s12 +; CHECK-NEXT: vmaxnm.f32 s7, s2, s12 +; CHECK-NEXT: vmaxnm.f32 s11, s9, s12 +; CHECK-NEXT: vmaxnm.f32 s13, s1, s12 +; CHECK-NEXT: vmaxnm.f32 s12, s0, s12 +; CHECK-NEXT: vminnm.f32 s12, s12, s14 +; CHECK-NEXT: vminnm.f32 s13, s13, s14 +; CHECK-NEXT: vcvt.s32.f32 s12, s12 ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vcmp.f32 s15, s15 -; CHECK-NEXT: vminnm.f32 s11, s11, s7 +; CHECK-NEXT: vminnm.f32 s11, s11, s14 ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs @@ -4294,20 +4317,20 @@ define arm_aapcs_vfpcc <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: bfi r1, r2, #0, #1 ; CHECK-NEXT: vcvt.s32.f32 s11, s11 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vminnm.f32 s3, s3, s7 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vminnm.f32 s7, s7, s14 ; CHECK-NEXT: it vs ; CHECK-NEXT: 
movvs r2, #0 ; CHECK-NEXT: vcmp.f32 s1, s1 ; CHECK-NEXT: and r2, r2, #1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: vcvt.s32.f32 s3, s3 +; CHECK-NEXT: vcvt.s32.f32 s7, s7 ; CHECK-NEXT: bfi r1, r2, #1, #1 ; CHECK-NEXT: vmov r2, s13 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: vminnm.f32 s14, s14, s7 +; CHECK-NEXT: vminnm.f32 s5, s5, s14 ; CHECK-NEXT: and r2, r2, #1 ; CHECK-NEXT: vcmp.f32 s9, s9 ; CHECK-NEXT: rsbs r2, r2, #0 @@ -4316,23 +4339,23 @@ define arm_aapcs_vfpcc <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) { ; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: vcvt.s32.f32 s14, s14 +; CHECK-NEXT: vcvt.s32.f32 s5, s5 ; CHECK-NEXT: and r2, r2, #1 -; CHECK-NEXT: vminnm.f32 s10, s10, s7 +; CHECK-NEXT: vminnm.f32 s10, s10, s14 ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: vcmp.f32 s2, s2 ; CHECK-NEXT: bfi r1, r2, #3, #1 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vcvt.s32.f32 s10, s10 ; CHECK-NEXT: and r2, r2, #1 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vminnm.f32 s6, s6, s7 +; CHECK-NEXT: vminnm.f32 s6, s6, s14 ; CHECK-NEXT: bfi r1, r2, #4, #1 -; CHECK-NEXT: vcmp.f32 s12, s12 -; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vcmp.f32 s3, s3 +; CHECK-NEXT: vmov r2, s5 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 @@ -4374,35 +4397,35 @@ define arm_aapcs_vfpcc <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) { ; CHECK-MVE-NEXT: push {r4, r5, r7, lr} ; CHECK-MVE-NEXT: .vsave {d8} ; CHECK-MVE-NEXT: vpush {d8} -; CHECK-MVE-NEXT: vldr s8, .LCPI43_1 +; CHECK-MVE-NEXT: vldr s6, .LCPI43_1 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s13, s3 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s3, s3 -; CHECK-MVE-NEXT: vldr s6, .LCPI43_0 -; CHECK-MVE-NEXT: vmaxnm.f32 s16, s3, s8 -; CHECK-MVE-NEXT: vcvtt.f32.f16 s4, s0 +; CHECK-MVE-NEXT: vldr s4, .LCPI43_0 +; CHECK-MVE-NEXT: vmaxnm.f32 s16, s3, s6 +; CHECK-MVE-NEXT: vcvtt.f32.f16 s8, s0 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s12, s1 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s7, s2 -; CHECK-MVE-NEXT: vmaxnm.f32 s15, s13, s8 -; CHECK-MVE-NEXT: vminnm.f32 s16, s16, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s15, s13, s6 +; CHECK-MVE-NEXT: vminnm.f32 s16, s16, s4 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-MVE-NEXT: vmaxnm.f32 s10, s4, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s14, s12, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s5, s0, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s9, s7, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s11, s1, s8 -; CHECK-MVE-NEXT: vminnm.f32 s15, s15, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s10, s8, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s14, s12, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s5, s0, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s9, s7, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s11, s1, s6 +; CHECK-MVE-NEXT: vminnm.f32 s15, s15, s4 ; CHECK-MVE-NEXT: vcvt.s32.f32 s16, s16 -; CHECK-MVE-NEXT: vmaxnm.f32 s8, s2, s8 -; CHECK-MVE-NEXT: vminnm.f32 s10, s10, s6 -; CHECK-MVE-NEXT: vminnm.f32 s14, s14, s6 -; CHECK-MVE-NEXT: vminnm.f32 s5, s5, s6 -; CHECK-MVE-NEXT: vminnm.f32 s9, s9, s6 -; CHECK-MVE-NEXT: vminnm.f32 s11, s11, s6 -; CHECK-MVE-NEXT: vminnm.f32 s6, s8, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s6, s2, s6 +; CHECK-MVE-NEXT: vminnm.f32 s10, s10, s4 +; CHECK-MVE-NEXT: vminnm.f32 s14, s14, s4 +; CHECK-MVE-NEXT: vminnm.f32 s5, s5, s4 +; CHECK-MVE-NEXT: vminnm.f32 s9, s9, s4 +; CHECK-MVE-NEXT: vminnm.f32 s11, s11, s4 +; CHECK-MVE-NEXT: vminnm.f32 s4, s6, 
s4 ; CHECK-MVE-NEXT: vcvt.s32.f32 s15, s15 -; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s6 +; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s4 ; CHECK-MVE-NEXT: vcvt.s32.f32 s9, s9 ; CHECK-MVE-NEXT: vcvt.s32.f32 s11, s11 ; CHECK-MVE-NEXT: vcvt.s32.f32 s14, s14 @@ -4420,7 +4443,7 @@ define arm_aapcs_vfpcc <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) { ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs.w lr, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmov r2, s6 +; CHECK-MVE-NEXT: vmov r2, s4 ; CHECK-MVE-NEXT: vcmp.f32 s7, s7 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r2, #0 @@ -4443,7 +4466,7 @@ define arm_aapcs_vfpcc <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) { ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r4, #0 -; CHECK-MVE-NEXT: vcmp.f32 s4, s4 +; CHECK-MVE-NEXT: vcmp.f32 s8, s8 ; CHECK-MVE-NEXT: vmov.16 q0[0], r4 ; CHECK-MVE-NEXT: vmov r5, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr @@ -4482,35 +4505,35 @@ define arm_aapcs_vfpcc <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) { ; CHECK-MVE-NEXT: push {r4, r5, r7, lr} ; CHECK-MVE-NEXT: .vsave {d8} ; CHECK-MVE-NEXT: vpush {d8} -; CHECK-MVE-NEXT: vldr s8, .LCPI44_1 +; CHECK-MVE-NEXT: vldr s6, .LCPI44_1 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s13, s3 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s3, s3 -; CHECK-MVE-NEXT: vldr s6, .LCPI44_0 -; CHECK-MVE-NEXT: vmaxnm.f32 s16, s3, s8 -; CHECK-MVE-NEXT: vcvtt.f32.f16 s4, s0 +; CHECK-MVE-NEXT: vldr s4, .LCPI44_0 +; CHECK-MVE-NEXT: vmaxnm.f32 s16, s3, s6 +; CHECK-MVE-NEXT: vcvtt.f32.f16 s8, s0 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s12, s1 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s7, s2 -; CHECK-MVE-NEXT: vmaxnm.f32 s15, s13, s8 -; CHECK-MVE-NEXT: vminnm.f32 s16, s16, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s15, s13, s6 +; CHECK-MVE-NEXT: vminnm.f32 s16, s16, s4 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-MVE-NEXT: vmaxnm.f32 s10, s4, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s14, s12, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s5, s0, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s9, s7, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s11, s1, s8 -; CHECK-MVE-NEXT: vminnm.f32 s15, s15, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s10, s8, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s14, s12, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s5, s0, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s9, s7, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s11, s1, s6 +; CHECK-MVE-NEXT: vminnm.f32 s15, s15, s4 ; CHECK-MVE-NEXT: vcvt.s32.f32 s16, s16 -; CHECK-MVE-NEXT: vmaxnm.f32 s8, s2, s8 -; CHECK-MVE-NEXT: vminnm.f32 s10, s10, s6 -; CHECK-MVE-NEXT: vminnm.f32 s14, s14, s6 -; CHECK-MVE-NEXT: vminnm.f32 s5, s5, s6 -; CHECK-MVE-NEXT: vminnm.f32 s9, s9, s6 -; CHECK-MVE-NEXT: vminnm.f32 s11, s11, s6 -; CHECK-MVE-NEXT: vminnm.f32 s6, s8, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s6, s2, s6 +; CHECK-MVE-NEXT: vminnm.f32 s10, s10, s4 +; CHECK-MVE-NEXT: vminnm.f32 s14, s14, s4 +; CHECK-MVE-NEXT: vminnm.f32 s5, s5, s4 +; CHECK-MVE-NEXT: vminnm.f32 s9, s9, s4 +; CHECK-MVE-NEXT: vminnm.f32 s11, s11, s4 +; CHECK-MVE-NEXT: vminnm.f32 s4, s6, s4 ; CHECK-MVE-NEXT: vcvt.s32.f32 s15, s15 -; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s6 +; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s4 ; CHECK-MVE-NEXT: vcvt.s32.f32 s9, s9 ; CHECK-MVE-NEXT: vcvt.s32.f32 s11, s11 ; CHECK-MVE-NEXT: vcvt.s32.f32 s14, s14 @@ -4528,7 +4551,7 @@ define arm_aapcs_vfpcc <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) { ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs.w lr, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmov r2, s6 +; CHECK-MVE-NEXT: vmov r2, s4 ; CHECK-MVE-NEXT: 
vcmp.f32 s7, s7 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r2, #0 @@ -4551,7 +4574,7 @@ define arm_aapcs_vfpcc <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) { ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r4, #0 -; CHECK-MVE-NEXT: vcmp.f32 s4, s4 +; CHECK-MVE-NEXT: vcmp.f32 s8, s8 ; CHECK-MVE-NEXT: vmov.16 q0[0], r4 ; CHECK-MVE-NEXT: vmov r5, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr @@ -4592,35 +4615,35 @@ define arm_aapcs_vfpcc <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) { ; CHECK-MVE-NEXT: push {r4, r5, r7, lr} ; CHECK-MVE-NEXT: .vsave {d8} ; CHECK-MVE-NEXT: vpush {d8} -; CHECK-MVE-NEXT: vldr s8, .LCPI45_1 +; CHECK-MVE-NEXT: vldr s6, .LCPI45_1 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s13, s3 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s3, s3 -; CHECK-MVE-NEXT: vldr s6, .LCPI45_0 -; CHECK-MVE-NEXT: vmaxnm.f32 s16, s3, s8 -; CHECK-MVE-NEXT: vcvtt.f32.f16 s4, s0 +; CHECK-MVE-NEXT: vldr s4, .LCPI45_0 +; CHECK-MVE-NEXT: vmaxnm.f32 s16, s3, s6 +; CHECK-MVE-NEXT: vcvtt.f32.f16 s8, s0 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s12, s1 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s7, s2 -; CHECK-MVE-NEXT: vmaxnm.f32 s15, s13, s8 -; CHECK-MVE-NEXT: vminnm.f32 s16, s16, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s15, s13, s6 +; CHECK-MVE-NEXT: vminnm.f32 s16, s16, s4 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-MVE-NEXT: vmaxnm.f32 s10, s4, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s14, s12, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s5, s0, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s9, s7, s8 -; CHECK-MVE-NEXT: vmaxnm.f32 s11, s1, s8 -; CHECK-MVE-NEXT: vminnm.f32 s15, s15, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s10, s8, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s14, s12, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s5, s0, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s9, s7, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s11, s1, s6 +; CHECK-MVE-NEXT: vminnm.f32 s15, s15, s4 ; CHECK-MVE-NEXT: vcvt.s32.f32 s16, s16 -; CHECK-MVE-NEXT: vmaxnm.f32 s8, s2, s8 -; CHECK-MVE-NEXT: vminnm.f32 s10, s10, s6 -; CHECK-MVE-NEXT: vminnm.f32 s14, s14, s6 -; CHECK-MVE-NEXT: vminnm.f32 s5, s5, s6 -; CHECK-MVE-NEXT: vminnm.f32 s9, s9, s6 -; CHECK-MVE-NEXT: vminnm.f32 s11, s11, s6 -; CHECK-MVE-NEXT: vminnm.f32 s6, s8, s6 +; CHECK-MVE-NEXT: vmaxnm.f32 s6, s2, s6 +; CHECK-MVE-NEXT: vminnm.f32 s10, s10, s4 +; CHECK-MVE-NEXT: vminnm.f32 s14, s14, s4 +; CHECK-MVE-NEXT: vminnm.f32 s5, s5, s4 +; CHECK-MVE-NEXT: vminnm.f32 s9, s9, s4 +; CHECK-MVE-NEXT: vminnm.f32 s11, s11, s4 +; CHECK-MVE-NEXT: vminnm.f32 s4, s6, s4 ; CHECK-MVE-NEXT: vcvt.s32.f32 s15, s15 -; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s6 +; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s4 ; CHECK-MVE-NEXT: vcvt.s32.f32 s9, s9 ; CHECK-MVE-NEXT: vcvt.s32.f32 s11, s11 ; CHECK-MVE-NEXT: vcvt.s32.f32 s14, s14 @@ -4638,7 +4661,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) { ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs.w lr, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmov r2, s6 +; CHECK-MVE-NEXT: vmov r2, s4 ; CHECK-MVE-NEXT: vcmp.f32 s7, s7 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r2, #0 @@ -4661,7 +4684,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) { ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r4, #0 -; CHECK-MVE-NEXT: vcmp.f32 s4, s4 +; CHECK-MVE-NEXT: vcmp.f32 s8, s8 ; CHECK-MVE-NEXT: vmov.16 q0[0], r4 ; CHECK-MVE-NEXT: vmov r5, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr @@ -4698,75 +4721,75 @@ define arm_aapcs_vfpcc <8 x i19> 
@test_signed_v8f16_v8i19(<8 x half> %f) { ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8} ; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vldr s12, .LCPI46_0 +; CHECK-NEXT: vldr s10, .LCPI46_0 ; CHECK-NEXT: vcvtt.f32.f16 s15, s3 -; CHECK-NEXT: vldr s14, .LCPI46_1 +; CHECK-NEXT: vldr s12, .LCPI46_1 ; CHECK-NEXT: vcvtb.f32.f16 s7, s0 -; CHECK-NEXT: vmaxnm.f32 s16, s15, s12 +; CHECK-NEXT: vmaxnm.f32 s16, s15, s10 ; CHECK-NEXT: vcvtb.f32.f16 s4, s1 ; CHECK-NEXT: vcvtt.f32.f16 s8, s1 ; CHECK-NEXT: vcvtb.f32.f16 s1, s2 +; CHECK-NEXT: vminnm.f32 s16, s16, s12 ; CHECK-NEXT: vcvtt.f32.f16 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s2, s2 ; CHECK-NEXT: vcvtb.f32.f16 s3, s3 -; CHECK-NEXT: vmaxnm.f32 s6, s4, s12 -; CHECK-NEXT: vmaxnm.f32 s10, s8, s12 -; CHECK-NEXT: vmaxnm.f32 s5, s1, s12 -; CHECK-NEXT: vmaxnm.f32 s9, s7, s12 -; CHECK-NEXT: vmaxnm.f32 s11, s0, s12 -; CHECK-NEXT: vmaxnm.f32 s13, s2, s12 -; CHECK-NEXT: vminnm.f32 s16, s16, s14 -; CHECK-NEXT: vmaxnm.f32 s12, s3, s12 +; CHECK-NEXT: vmaxnm.f32 s6, s4, s10 +; CHECK-NEXT: vmaxnm.f32 s14, s8, s10 +; CHECK-NEXT: vmaxnm.f32 s5, s1, s10 +; CHECK-NEXT: vmaxnm.f32 s9, s7, s10 +; CHECK-NEXT: vmaxnm.f32 s11, s0, s10 +; CHECK-NEXT: vmaxnm.f32 s13, s2, s10 +; CHECK-NEXT: vmaxnm.f32 s10, s3, s10 ; CHECK-NEXT: vcvt.s32.f32 s16, s16 -; CHECK-NEXT: vminnm.f32 s12, s12, s14 -; CHECK-NEXT: vminnm.f32 s13, s13, s14 -; CHECK-NEXT: vcvt.s32.f32 s12, s12 -; CHECK-NEXT: vminnm.f32 s9, s9, s14 +; CHECK-NEXT: vminnm.f32 s10, s10, s12 +; CHECK-NEXT: vminnm.f32 s13, s13, s12 +; CHECK-NEXT: vcvt.s32.f32 s10, s10 +; CHECK-NEXT: vminnm.f32 s9, s9, s12 ; CHECK-NEXT: vcvt.s32.f32 s13, s13 -; CHECK-NEXT: vminnm.f32 s11, s11, s14 +; CHECK-NEXT: vminnm.f32 s11, s11, s12 ; CHECK-NEXT: vcvt.s32.f32 s11, s11 -; CHECK-NEXT: vminnm.f32 s5, s5, s14 +; CHECK-NEXT: vminnm.f32 s5, s5, s12 ; CHECK-NEXT: vcvt.s32.f32 s9, s9 -; CHECK-NEXT: vminnm.f32 s10, s10, s14 +; CHECK-NEXT: vminnm.f32 s14, s14, s12 ; CHECK-NEXT: vcmp.f32 s15, s15 -; CHECK-NEXT: vminnm.f32 s6, s6, s14 +; CHECK-NEXT: vminnm.f32 s6, s6, s12 ; CHECK-NEXT: vmov r1, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: lsrs r2, r1, #11 -; CHECK-NEXT: vcmp.f32 s3, s3 ; CHECK-NEXT: strb r2, [r0, #18] -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vcmp.f32 s3, s3 +; CHECK-NEXT: vcvt.s32.f32 s5, s5 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: ubfx r2, r3, #14, #5 -; CHECK-NEXT: vcvt.s32.f32 s5, s5 -; CHECK-NEXT: orr.w r1, r2, r1, lsl #5 +; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: ubfx r3, r2, #14, #5 ; CHECK-NEXT: vcmp.f32 s2, s2 -; CHECK-NEXT: strh r1, [r0, #16] -; CHECK-NEXT: vmov lr, s13 +; CHECK-NEXT: orr.w r1, r3, r1, lsl #5 +; CHECK-NEXT: vmov r12, s13 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: strh r1, [r0, #16] +; CHECK-NEXT: vcvt.s32.f32 s14, s14 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w lr, #0 -; CHECK-NEXT: ubfx r1, lr, #1, #18 +; CHECK-NEXT: movvs.w r12, #0 +; CHECK-NEXT: ubfx r3, r12, #1, #18 ; CHECK-NEXT: vcmp.f32 s0, s0 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #18 -; CHECK-NEXT: vcvt.s32.f32 s10, s10 +; CHECK-NEXT: orr.w r2, r3, r2, lsl #18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: str r1, [r0, #12] +; CHECK-NEXT: vmov lr, s11 ; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r12, #0 +; CHECK-NEXT: str r2, [r0, #12] ; CHECK-NEXT: vcmp.f32 s7, s7 +; CHECK-NEXT: it vs +; CHECK-NEXT: movvs.w lr, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; 
CHECK-NEXT: it vs ; CHECK-NEXT: movvs r3, #0 ; CHECK-NEXT: bfc r3, #19, #13 ; CHECK-NEXT: vcvt.s32.f32 s6, s6 -; CHECK-NEXT: orr.w r3, r3, r12, lsl #19 +; CHECK-NEXT: orr.w r3, r3, lr, lsl #19 ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: vcmp.f32 s1, s1 ; CHECK-NEXT: vmov r3, s5 @@ -4775,7 +4798,7 @@ define arm_aapcs_vfpcc <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) { ; CHECK-NEXT: movvs r3, #0 ; CHECK-NEXT: vcmp.f32 s8, s8 ; CHECK-NEXT: bfc r3, #19, #13 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 @@ -4783,10 +4806,10 @@ define arm_aapcs_vfpcc <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) { ; CHECK-NEXT: vcmp.f32 s4, s4 ; CHECK-NEXT: orr.w r2, r2, r3, lsl #12 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: orr.w r2, r2, lr, lsl #31 +; CHECK-NEXT: orr.w r2, r2, r12, lsl #31 ; CHECK-NEXT: str r2, [r0, #8] ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: ubfx r3, r12, #13, #6 +; CHECK-NEXT: ubfx r3, lr, #13, #6 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: bfc r2, #19, #13 @@ -5170,7 +5193,7 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: vmov r0, s20 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcvtb.f32.f16 s22, s19 -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: vmov r0, s22 ; CHECK-NEXT: vldr s30, .LCPI49_1 ; CHECK-NEXT: vldr s28, .LCPI49_0 @@ -5179,58 +5202,58 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: vcvtt.f32.f16 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r9, #0 +; CHECK-NEXT: movlt.w r8, #0 ; CHECK-NEXT: vcmp.f32 s20, s28 -; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r9, #-1 +; CHECK-NEXT: movgt.w r8, #-1 ; CHECK-NEXT: vcmp.f32 s20, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: vmov r5, s16 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r9, #0 +; CHECK-NEXT: movvs.w r8, #0 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcmp.f32 s22, s30 -; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s22, s28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: movlt.w r10, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s22, s22 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r11, #-1 +; CHECK-NEXT: movgt.w r10, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s20, s30 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r11, #0 +; CHECK-NEXT: movvs.w r10, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s20, s28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r8, #-2147483648 +; CHECK-NEXT: movlt.w r9, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s20, s20 ; CHECK-NEXT: it gt -; CHECK-NEXT: mvngt r8, #-2147483648 +; CHECK-NEXT: mvngt r9, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r11, r1 ; CHECK-NEXT: vcmp.f32 s22, s30 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r8, #0 +; CHECK-NEXT: movvs.w r9, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #-2147483648 +; CHECK-NEXT: movlt.w r11, #-2147483648 ; CHECK-NEXT: vcmp.f32 s22, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: it gt -; CHECK-NEXT: mvngt r10, #-2147483648 +; CHECK-NEXT: mvngt r11, #-2147483648 ; CHECK-NEXT: vcmp.f32 s22, s22 ; 
CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r10, #0 +; CHECK-NEXT: movvs.w r11, #0 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vcmp.f32 s16, s30 @@ -5266,11 +5289,8 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: vmov q5[2], q5[0], r0, r6 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcvtb.f32.f16 s17, s17 -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: vcmp.f32 s19, s30 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s28 ; CHECK-NEXT: it lt @@ -5289,8 +5309,10 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: movlt.w r5, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: vcvtb.f32.f16 s16, s17 ; CHECK-NEXT: it gt ; CHECK-NEXT: mvngt r5, #-2147483648 +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s24, s30 ; CHECK-NEXT: it vs @@ -5299,6 +5321,7 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r7, #-2147483648 ; CHECK-NEXT: vcmp.f32 s24, s28 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: mvngt r7, #-2147483648 @@ -5308,41 +5331,38 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: movvs r7, #0 ; CHECK-NEXT: vmov q5[3], q5[1], r7, r5 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcvtt.f32.f16 s16, s18 +; CHECK-NEXT: vcvtt.f32.f16 s17, s18 ; CHECK-NEXT: mov r7, r1 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vcmp.f32 s17, s30 +; CHECK-NEXT: vmov r1, s17 +; CHECK-NEXT: vcmp.f32 s16, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s17, s28 +; CHECK-NEXT: vcmp.f32 s16, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s17, s17 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 ; CHECK-NEXT: vmov q6[2], q6[0], r0, r6 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcvtb.f32.f16 s18, s18 -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vcmp.f32 s16, s30 +; CHECK-NEXT: vcmp.f32 s17, s30 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s28 +; CHECK-NEXT: vcmp.f32 s17, s28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: movgt.w r5, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s30 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r6, #0 +; CHECK-NEXT: movvs r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s28 ; CHECK-NEXT: it lt @@ -5352,62 +5372,65 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: mvngt r4, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s30 +; CHECK-NEXT: vcmp.f32 s16, s30 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s16, s28 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r7, #-2147483648 -; CHECK-NEXT: vcmp.f32 s17, s28 ; 
CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: vcvtb.f32.f16 s16, s18 +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: it gt ; CHECK-NEXT: mvngt r7, #-2147483648 -; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: vmov q6[3], q6[1], r7, r4 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s18, s30 -; CHECK-NEXT: vmov q3[2], q3[0], r11, r9 +; CHECK-NEXT: vcmp.f32 s16, s30 +; CHECK-NEXT: vmov q3[2], q3[0], r10, r8 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s28 +; CHECK-NEXT: vcmp.f32 s16, s28 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s30 +; CHECK-NEXT: vcmp.f32 s17, s30 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s28 +; CHECK-NEXT: vcmp.f32 s17, s28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r5, #-2147483648 +; CHECK-NEXT: movlt.w r6, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: mvngt r5, #-2147483648 +; CHECK-NEXT: mvngt r6, #-2147483648 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s30 +; CHECK-NEXT: vcmp.f32 s16, s30 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: movvs r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r1, #-2147483648 -; CHECK-NEXT: vcmp.f32 s18, s28 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r6 +; CHECK-NEXT: vcmp.f32 s16, s28 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r5 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: mvngt r1, #-2147483648 -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: vmov q3[3], q3[1], r10, r8 +; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: vmov q3[3], q3[1], r11, r9 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r5 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r6 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vmov q1, q6 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll index 8ea12bd1fc0deb..c34c3a99bcb9f2 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll @@ -35,18 +35,18 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f32_v2i32(<2 x float> %f) { ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov r0, s17 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vldr s18, .LCPI1_0 ; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vcmp.f32 s17, s18 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r5, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -60,20 +60,20 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f32_v2i32(<2 x float> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s18 ; CHECK-NEXT: it lt -; CHECK-NEXT: 
movlt r4, #0 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r4, #0 +; CHECK-NEXT: movgt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vcmp.f32 s16, s18 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 @@ -343,9 +343,9 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f64_v2i32(<2 x double> %f) { ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vldr d0, .LCPI9_1 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r9, r8, d0 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: clz r0, r0 @@ -354,26 +354,26 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f64_v2i32(<2 x double> %f) { ; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r5, #-1 +; CHECK-NEXT: movne.w r10, #-1 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: movne r6, #1 ; CHECK-NEXT: ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 @@ -387,24 +387,24 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f64_v2i32(<2 x double> %f) { ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r10 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r10 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -541,20 +541,21 @@ define arm_aapcs_vfpcc <4 x i32> @test_unsigned_v4f64_v4i32(<4 x double> %f) { ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI11_0 ; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov r7, r9, d0 +; CHECK-NEXT: vmov r7, r3, d0 ; CHECK-NEXT: vmov r4, r5, d10 -; CHECK-NEXT: str.w r9, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: mov r0, r4 ; 
CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI11_1 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: strd r2, r3, [sp, #16] @ 8-byte Folded Spill +; CHECK-NEXT: vmov r9, r11, d0 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: mov r0, r4 @@ -562,69 +563,69 @@ define arm_aapcs_vfpcc <4 x i32> @test_unsigned_v4f64_v4i32(<4 x double> %f) { ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vmov r10, r8, d8 ; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: csel r0, r0, r6, ne ; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: vmov r5, r4, d11 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: vmov r11, r5, d11 -; CHECK-NEXT: mov r4, r7 -; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: mov r0, r10 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r7, r9 +; CHECK-NEXT: str.w r9, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: mov r0, r10 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: csel r8, r0, r9, ne -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: csel r9, r0, r9, ne +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r8, #-1 -; CHECK-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: movne.w r9, #-1 +; CHECK-NEXT: ldr.w r10, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vmov r4, r5, d9 ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: csel r6, r0, r7, ne -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r6, #-1 -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldrd r2, r3, [sp, #16] @ 8-byte Folded Reload -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: ldr r2, [sp, 
#12] @ 4-byte Reload +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r4 @@ -632,11 +633,11 @@ define arm_aapcs_vfpcc <4 x i32> @test_unsigned_v4f64_v4i32(<4 x double> %f) { ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: csel r0, r0, r7, ne -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r8, r1 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r9, r1 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r6 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -666,130 +667,131 @@ define arm_aapcs_vfpcc <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) { ; CHECK-NEXT: .pad #40 ; CHECK-NEXT: sub sp, #40 ; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vldr d0, .LCPI12_0 ; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: vmov r7, r4, d0 ; CHECK-NEXT: vmov.f32 s20, s6 ; CHECK-NEXT: vmov.f32 s18, s4 ; CHECK-NEXT: vmov.f32 s22, s2 ; CHECK-NEXT: vmov.f32 s21, s7 ; CHECK-NEXT: vmov.f32 s19, s5 ; CHECK-NEXT: vmov.f32 s23, s3 +; CHECK-NEXT: str r7, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: strd r2, r3, [sp, #32] @ 8-byte Folded Spill +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI12_1 -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: vmov r7, r3, d0 -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: strd r2, r3, [sp, #32] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vmov r8, r1, d11 -; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: vmov r6, r9, d10 -; CHECK-NEXT: csel r0, r0, r11, ne ; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: vmov r6, r11, d10 +; CHECK-NEXT: csel r0, r0, r10, ne +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: vmov r2, r1, d9 -; CHECK-NEXT: strd r2, r1, [sp, #16] @ 8-byte Folded Spill +; CHECK-NEXT: strd r2, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: str r0, [r4, #16] +; CHECK-NEXT: str.w r0, [r9, #16] ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: ldr r5, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r1, r9 -; CHECK-NEXT: ldr.w r10, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: ldr r7, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: ldr r5, [sp, #36] @ 4-byte Reload ; 
CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r9 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: csel r0, r0, r4, ne -; CHECK-NEXT: cmp.w r11, #0 +; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r11, r10 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: mov r5, r6 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r9, r7 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: csel r0, r0, r6, ne -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: csel r11, r0, r6, ne +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r8, r4 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: ldr.w r8, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: movne.w r11, #-1 +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: ldr.w r10, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: ldr.w r9, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r4, r5 ; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: mov r11, r10 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: csel r4, r0, r5, ne ; CHECK-NEXT: vmov r5, r6, d8 ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r7, r0 @@ -798,14 +800,13 @@ define arm_aapcs_vfpcc <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) { ; CHECK-NEXT: 
bl __aeabi_d2ulz ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: csel r0, r0, r7, ne -; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vmov q0[3], q0[1], r11, r0 ; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -832,14 +833,14 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vldr d0, .LCPI13_0 ; CHECK-NEXT: vmov r5, r6, d5 -; CHECK-NEXT: vmov r11, r3, d0 +; CHECK-NEXT: vmov r11, r4, d0 ; CHECK-NEXT: vmov.f32 s22, s8 ; CHECK-NEXT: vmov.f32 s20, s6 ; CHECK-NEXT: vmov.f32 s18, s4 @@ -848,48 +849,51 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: vmov.f32 s21, s7 ; CHECK-NEXT: vmov.f32 s19, s5 ; CHECK-NEXT: vmov.f32 s25, s3 -; CHECK-NEXT: str r3, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: str.w r11, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: str.w r11, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: str r4, [sp, #40] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI13_1 ; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: vmov r4, r9, d0 -; CHECK-NEXT: str r4, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: vmov r9, r3, d0 +; CHECK-NEXT: str r3, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r10, r1, d10 +; CHECK-NEXT: vmov r2, r1, d10 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: vmov r5, r6, d11 ; CHECK-NEXT: csel r0, r0, r8, ne ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r7, r10 +; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: vmov r2, r1, d12 -; CHECK-NEXT: strd r2, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: strd r2, r1, [sp, #20] @ 8-byte Folded Spill ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r7, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: str.w r10, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: str r0, [r7, #20] +; CHECK-NEXT: str.w r0, [r10, #20] ; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: ldr.w r8, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: ldr.w r11, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r10, r9 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl 
__aeabi_dcmpge ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r5 @@ -898,92 +902,95 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: vmov r2, r1, d9 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: csel r0, r0, r4, ne -; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: strd r2, r1, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: strd r2, r1, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 ; CHECK-NEXT: str r0, [r7, #16] -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: ldr.w r11, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: ldr.w r8, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r5, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: mov r8, r9 -; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: csel r0, r0, r7, ne ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: ldr.w r9, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r6, r7 -; CHECK-NEXT: mov r10, r5 +; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: mov r11, r5 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: ldr r4, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r5, r7 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: str.w r10, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r9 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: csel r9, r0, r7, ne -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: csel r10, r0, r7, ne +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r9, #-1 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: movne.w r10, #-1 +; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mov r0, r7 ; 
CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r11, r7 +; CHECK-NEXT: mov r8, r4 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vmov r5, r6, d8 ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: csel r4, r0, r7, ne -; CHECK-NEXT: cmp.w r11, #0 +; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #36] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: mov r3, r8 @@ -994,15 +1001,15 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) { ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: csel r0, r0, r7, ne -; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov q0[3], q0[1], r9, r0 +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: vmov q0[3], q0[1], r10, r0 ; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1053,18 +1060,18 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f16_v2i32(<2 x half> %f) { ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcvtb.f32.f16 s16, s16 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vldr s20, .LCPI15_0 ; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r5, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -1078,20 +1085,20 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f16_v2i32(<2 x half> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r4, #0 +; CHECK-NEXT: movgt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 @@ 
-1632,63 +1639,63 @@ define arm_aapcs_vfpcc <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) { ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov r0, s19 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vldr s20, .LCPI29_0 ; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: movlt.w r10, #0 ; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r11, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vmov r9, s17 ; CHECK-NEXT: vmov r8, s16 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r11, #-1 +; CHECK-NEXT: movgt.w r10, #-1 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r7, #-1 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: movlt.w r11, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: mov r7, r1 ; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r10, #-1 +; CHECK-NEXT: movgt.w r11, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: movgt.w r7, #-1 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r5, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r7, r11 +; CHECK-NEXT: vmov q1[2], q1[0], r6, r10 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt @@ -1700,21 +1707,21 @@ define arm_aapcs_vfpcc <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r4, #-1 +; CHECK-NEXT: movgt.w r5, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r6, r10 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 +; CHECK-NEXT: vmov q1[3], q1[1], r7, r11 ; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -2145,7 +2152,7 @@ define arm_aapcs_vfpcc <2 x i1> @test_unsigned_v2f64_v2i1(<2 x double> %f) { ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 ; 
CHECK-NEXT: bl __aeabi_d2uiz -; CHECK-NEXT: vmov r6, r5, d9 +; CHECK-NEXT: vmov r5, r6, d9 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: csel r0, r0, r8, ne ; CHECK-NEXT: cmp r7, #0 @@ -2157,18 +2164,18 @@ define arm_aapcs_vfpcc <2 x i1> @test_unsigned_v2f64_v2i1(<2 x double> %f) { ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bfi r7, r0, #0, #1 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_d2uiz ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: csel r0, r0, r4, ne @@ -2220,9 +2227,9 @@ define arm_aapcs_vfpcc <2 x i8> @test_unsigned_v2f64_v2i8(<2 x double> %f) { ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vldr d0, .LCPI33_1 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r9, r8, d0 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: clz r0, r0 @@ -2231,26 +2238,26 @@ define arm_aapcs_vfpcc <2 x i8> @test_unsigned_v2f64_v2i8(<2 x double> %f) { ; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #255 +; CHECK-NEXT: movne.w r10, #255 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: movne r6, #1 ; CHECK-NEXT: ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 @@ -2264,24 +2271,24 @@ define arm_aapcs_vfpcc <2 x i8> @test_unsigned_v2f64_v2i8(<2 x double> %f) { ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #255 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r10 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r10 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -2322,9 +2329,9 @@ define arm_aapcs_vfpcc <2 x i13> @test_unsigned_v2f64_v2i13(<2 x double> %f) { ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vldr d0, .LCPI34_1 -; CHECK-NEXT: mov 
r5, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r9, r8, d0 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: clz r0, r0 @@ -2333,26 +2340,26 @@ define arm_aapcs_vfpcc <2 x i13> @test_unsigned_v2f64_v2i13(<2 x double> %f) { ; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movwne r5, #8191 +; CHECK-NEXT: movwne r10, #8191 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: movne r6, #1 ; CHECK-NEXT: ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 @@ -2366,24 +2373,24 @@ define arm_aapcs_vfpcc <2 x i13> @test_unsigned_v2f64_v2i13(<2 x double> %f) { ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movwne r0, #8191 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r10 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r10 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -2424,9 +2431,9 @@ define arm_aapcs_vfpcc <2 x i16> @test_unsigned_v2f64_v2i16(<2 x double> %f) { ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vldr d0, .LCPI35_1 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r9, r8, d0 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: clz r0, r0 @@ -2435,26 +2442,26 @@ define arm_aapcs_vfpcc <2 x i16> @test_unsigned_v2f64_v2i16(<2 x double> %f) { ; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movwne r5, #65535 +; CHECK-NEXT: movwne r10, #65535 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: movne r6, #1 ; 
CHECK-NEXT: ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 @@ -2468,24 +2475,24 @@ define arm_aapcs_vfpcc <2 x i16> @test_unsigned_v2f64_v2i16(<2 x double> %f) { ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movwne r0, #65535 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r10 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r10 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -2546,12 +2553,12 @@ define arm_aapcs_vfpcc <2 x i19> @test_unsigned_v2f64_v2i19(<2 x double> %f) { ; CHECK-NEXT: lsr.w r9, r0, #5 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne r7, #0 ; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_dcmpgt @@ -2563,8 +2570,8 @@ define arm_aapcs_vfpcc <2 x i19> @test_unsigned_v2f64_v2i19(<2 x double> %f) { ; CHECK-NEXT: movne r6, #1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: itt ne -; CHECK-NEXT: movwne r5, #65535 -; CHECK-NEXT: movtne r5, #7 +; CHECK-NEXT: movwne r7, #65535 +; CHECK-NEXT: movtne r7, #7 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: cmp r2, #0 @@ -2576,18 +2583,18 @@ define arm_aapcs_vfpcc <2 x i19> @test_unsigned_v2f64_v2i19(<2 x double> %f) { ; CHECK-NEXT: movtne r0, #7 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r7 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r7 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -2628,9 +2635,9 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f64_v2i32_duplicate(<2 x doubl ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vldr d0, .LCPI37_1 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r9, r8, d0 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: clz r0, r0 @@ -2639,26 +2646,26 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f64_v2i32_duplicate(<2 x doubl ; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: 
mov r7, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r5, #-1 +; CHECK-NEXT: movne.w r10, #-1 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: movne r6, #1 ; CHECK-NEXT: ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 @@ -2672,24 +2679,24 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f64_v2i32_duplicate(<2 x doubl ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r10 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r10 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -2832,9 +2839,9 @@ define arm_aapcs_vfpcc <2 x i64> @test_unsigned_v2f64_v2i64(<2 x double> %f) { ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vldr d0, .LCPI39_1 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r9, r8, d0 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: clz r0, r0 @@ -2843,26 +2850,26 @@ define arm_aapcs_vfpcc <2 x i64> @test_unsigned_v2f64_v2i64(<2 x double> %f) { ; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #1 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r5, #-1 +; CHECK-NEXT: movne.w r10, #-1 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #1 +; CHECK-NEXT: movne r6, #1 ; CHECK-NEXT: ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r4 @@ -2876,24 +2883,24 @@ define arm_aapcs_vfpcc <2 x i64> @test_unsigned_v2f64_v2i64(<2 x double> %f) { ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r10 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; 
CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #-1 +; CHECK-NEXT: movne.w r5, #-1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r10 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -2923,195 +2930,196 @@ define arm_aapcs_vfpcc <2 x i100> @test_unsigned_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI40_0 -; CHECK-NEXT: vmov r9, r5, d8 -; CHECK-NEXT: str r0, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: vmov r7, r5, d8 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: str r3, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r7, r2 -; CHECK-NEXT: mov r6, r3 +; CHECK-NEXT: mov r9, r2 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI40_1 ; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: str r2, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: mov r10, r3 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __fixunsdfti ; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r1, r0, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: csel r0, r2, r8, ne -; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: cmp.w r11, #0 +; CHECK-NEXT: str r3, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r4, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: mov r11, r7 ; CHECK-NEXT: str r0, [r4, #8] -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: str r5, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: str r7, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: str r6, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: mov r11, r5 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: ldr r5, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: str.w r9, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r7, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: str.w r9, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: str r7, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: mov r2, r6 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: csel r0, r1, r0, ne ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 ; CHECK-NEXT: str r0, [r4, #4] -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: str r4, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: mov r9, r5 +; CHECK-NEXT: str.w r11, [sp, #20] @ 4-byte Spill ; 
CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: mov r2, r6 ; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r9, r7 -; CHECK-NEXT: str.w r10, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: mov r7, r6 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: vmov r8, r11, d9 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 ; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: ldr r5, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: ldr r6, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: mov r2, r6 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r9, r7 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: strd r2, r0, [sp, #4] @ 8-byte Folded Spill -; CHECK-NEXT: csel r7, r1, r10, ne -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: strd r2, r0, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: csel r4, r1, r7, ne +; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 -; CHECK-NEXT: mov r4, r6 +; CHECK-NEXT: movne.w r4, #-1 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr.w r10, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r6, r9 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: csel r9, r1, r0, ne -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r9, #-1 -; CHECK-NEXT: ldr r6, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: lsrs r0, r7, #28 -; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: lsrs r0, r4, #28 ; CHECK-NEXT: orr.w r0, r0, r9, lsl #4 -; CHECK-NEXT: str r0, [r6, #20] +; CHECK-NEXT: str r0, [r1, #20] ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #40] @ 4-byte Reload ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldr r7, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r2, r6 ; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov 
r1, r11 ; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r5, r10 +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csel r4, r1, r0, ne -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: csel r10, r1, r0, ne +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: lsrs r0, r4, #28 -; CHECK-NEXT: orr.w r0, r0, r7, lsl #4 -; CHECK-NEXT: str r0, [r6, #16] -; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: movne.w r10, #-1 +; CHECK-NEXT: lsr.w r0, r10, #28 +; CHECK-NEXT: orr.w r0, r0, r4, lsl #4 +; CHECK-NEXT: ldr r4, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: str r0, [r4, #16] ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: ldr.w r10, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: ldr.w r11, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: ldr.w r11, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r7, r6 ; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #15 ; CHECK-NEXT: lsr.w r1, r9, #28 -; CHECK-NEXT: ldr.w r9, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: strb.w r0, [r9, #24] -; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: strb r0, [r4, #24] +; CHECK-NEXT: ldr r6, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldrd r2, r3, [sp, #40] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csel r0, r1, r0, ne ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #15 ; CHECK-NEXT: and r0, r0, #15 -; CHECK-NEXT: orr.w r0, r0, r4, lsl #4 -; CHECK-NEXT: str.w r0, [r9, #12] +; CHECK-NEXT: orr.w r0, r0, r10, lsl #4 +; CHECK-NEXT: str r0, [r4, #12] ; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3263,18 +3271,18 @@ define arm_aapcs_vfpcc <2 x i128> @test_unsigned_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: mov r7, r11 +; CHECK-NEXT: mov r10, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r10, r4 -; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr.w r8, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: ldr r1, 
[sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: csel r0, r1, r0, ne ; CHECK-NEXT: cmp.w r11, #0 @@ -3287,12 +3295,12 @@ define arm_aapcs_vfpcc <2 x i128> @test_unsigned_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: csel r0, r1, r0, ne ; CHECK-NEXT: cmp.w r11, #0 @@ -3302,16 +3310,16 @@ define arm_aapcs_vfpcc <2 x i128> @test_unsigned_v2f64_v2i128(<2 x double> %f) { ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 ; CHECK-NEXT: str r0, [r7] @@ -3773,18 +3781,18 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s26, #0 -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s26, s20 -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r10, #65535 -; CHECK-NEXT: movtgt r10, #3 +; CHECK-NEXT: movwgt r6, #65535 +; CHECK-NEXT: movtgt r6, #3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r7, #-1 @@ -3803,14 +3811,14 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: str r0, [r4] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt.w r9, #0 ; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r6, #-1 -; CHECK-NEXT: lsl.w r0, r10, #22 -; CHECK-NEXT: str r6, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: orr.w r6, r0, r6, lsr #10 +; CHECK-NEXT: movgt.w r9, #-1 +; CHECK-NEXT: lsls r0, r6, #22 +; CHECK-NEXT: str.w r9, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: orr.w r9, r0, r9, lsr #10 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s18, #0 @@ -3826,7 +3834,7 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: movwgt r5, #65535 ; CHECK-NEXT: movtgt r5, #3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r6, [r4, #45] +; CHECK-NEXT: str.w r9, [r4, #45] ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r7, #0 @@ -3842,26 +3850,26 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r1, #0 ; 
CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r9, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcvtt.f32.f16 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r9, #65535 ; CHECK-NEXT: movtgt r9, #3 ; CHECK-NEXT: lsl.w r0, r9, #22 ; CHECK-NEXT: orr.w r0, r0, r1, lsr #10 -; CHECK-NEXT: vcvtt.f32.f16 s16, s16 ; CHECK-NEXT: str r0, [r4, #20] ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: bl __aeabi_f2ulz @@ -3888,7 +3896,7 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: vcvtb.f32.f16 s16, s19 ; CHECK-NEXT: orr.w r0, r0, r11, lsl #18 ; CHECK-NEXT: str r0, [r4, #8] -; CHECK-NEXT: lsr.w r0, r10, #10 +; CHECK-NEXT: lsrs r0, r6, #10 ; CHECK-NEXT: strb.w r0, [r4, #49] ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: bl __aeabi_f2ulz @@ -4003,7 +4011,7 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: vmov r0, s20 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcvtb.f32.f16 s22, s19 -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: vmov r0, s22 ; CHECK-NEXT: vldr s28, .LCPI49_0 ; CHECK-NEXT: vcmp.f32 s20, #0 @@ -4011,42 +4019,42 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcvtb.f32.f16 s16, s16 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r9, #0 +; CHECK-NEXT: movlt.w r8, #0 ; CHECK-NEXT: vcmp.f32 s20, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: vmov r5, s24 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r9, #-1 +; CHECK-NEXT: movgt.w r8, #-1 ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s22, #0 -; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s22, s28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: movlt.w r10, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s20, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r11, #-1 +; CHECK-NEXT: movgt.w r10, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s20, s28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r8, #0 +; CHECK-NEXT: movlt.w r9, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r11, r1 ; CHECK-NEXT: vcmp.f32 s22, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r8, #-1 +; CHECK-NEXT: movgt.w r9, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: movlt.w r11, #0 ; CHECK-NEXT: vcmp.f32 s22, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r10, #-1 +; CHECK-NEXT: movgt.w r11, #-1 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vcmp.f32 s24, #0 @@ -4119,15 +4127,15 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s17, #0 -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s30, #0 ; 
CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: movgt.w r5, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s30, s28 ; CHECK-NEXT: it lt @@ -4145,11 +4153,11 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: vmov q6[3], q6[1], r7, r4 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r11, r9 +; CHECK-NEXT: vmov q3[2], q3[0], r10, r8 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s28 ; CHECK-NEXT: it lt @@ -4161,21 +4169,21 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r5, #-1 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vcmp.f32 s16, s28 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r6 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r5 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r5 -; CHECK-NEXT: vmov q3[3], q3[1], r10, r8 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r6 +; CHECK-NEXT: vmov q3[3], q3[1], r11, r9 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vmov q1, q6 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -4331,6 +4339,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vcvtt.f32.f16 s18, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt @@ -4362,18 +4371,17 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: vcmp.f32 s30, #0 ; CHECK-NEXT: str.w r1, [r4, #91] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: lsrs r1, r2, #28 -; CHECK-NEXT: vcvtt.f32.f16 s30, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt r3, #15 ; CHECK-NEXT: orr.w r2, r1, r3, lsl #4 -; CHECK-NEXT: vmov r1, s30 -; CHECK-NEXT: strb.w r2, [r4, #99] +; CHECK-NEXT: vmov r1, s18 ; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: strb.w r2, [r4, #99] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r5, #0 @@ -4386,23 +4394,23 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: str.w r0, [r4, #87] ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s30, #0 -; CHECK-NEXT: vcvtt.f32.f16 s18, s17 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: vcvtt.f32.f16 s28, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: lsr.w r6, r1, #28 -; 
CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: orr.w r6, r6, r2, lsl #4 @@ -4410,18 +4418,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: str.w r6, [r4, #70] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 ; CHECK-NEXT: lsrs r6, r0, #28 ; CHECK-NEXT: orr.w r1, r6, r1, lsl #4 ; CHECK-NEXT: str.w r1, [r4, #66] -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vmov r1, s28 +; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: lsrs r2, r2, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -4443,22 +4451,22 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: str.w r0, [r4, #62] ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: vcmp.f32 s28, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: vcmp.f32 s28, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: lsr.w r7, r1, #28 -; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: vcmp.f32 s28, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 @@ -4466,16 +4474,16 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: str.w r7, [r4, #45] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 ; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: vcmp.f32 s28, #0 ; CHECK-NEXT: orr.w r7, r7, r1, lsl #4 ; CHECK-NEXT: vmov r1, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: str.w r7, [r4, #41] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 4ab569777b2adb..f13e151ccb2e23 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -41,26 +41,26 @@ define arm_aapcs_vfpcc <8 x i16> @gather_inc_mini_8i16(ptr noalias nocapture rea ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q1, q1, r12 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, lr, d3 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: vadd.i32 q0, q0, r12 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: vmov r2, r4, d2 -; CHECK-NEXT: ldrh r6, [r1] -; CHECK-NEXT: vmov r1, r5, d0 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r4, lr, d3 +; CHECK-NEXT: ldrh r5, [r2] +; CHECK-NEXT: ldrh r6, [r3] +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh.w r12, [lr] -; 
CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r3 ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r4 -; CHECK-NEXT: vmov.16 q0[6], r6 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r5 +; CHECK-NEXT: vmov.16 q0[5], r6 +; CHECK-NEXT: vmov.16 q0[6], r4 ; CHECK-NEXT: vmov.16 q0[7], r12 ; CHECK-NEXT: pop {r4, r5, r6, pc} %1 = add <8 x i32> %offs, @@ -103,53 +103,53 @@ define arm_aapcs_vfpcc <16 x i8> @gather_inc_mini_16i8(ptr noalias nocapture rea ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: movs r5, #16 +; CHECK-NEXT: movs r2, #16 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vadd.i32 q3, q3, r5 +; CHECK-NEXT: vadd.i32 q3, q3, r2 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, r2, d7 +; CHECK-NEXT: vmov r1, r3, d7 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, r4, d6 -; CHECK-NEXT: vadd.i32 q3, q0, r5 +; CHECK-NEXT: vmov r4, r5, d6 +; CHECK-NEXT: vadd.i32 q3, q0, r2 +; CHECK-NEXT: vmov r6, r7, d7 ; CHECK-NEXT: vadd.i32 q0, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r5 -; CHECK-NEXT: vadd.i32 q2, q0, r5 +; CHECK-NEXT: vadd.i32 q2, q0, r2 +; CHECK-NEXT: vadd.i32 q1, q1, r2 ; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: ldrb.w lr, [r3] ; CHECK-NEXT: ldrb r3, [r4] -; CHECK-NEXT: vmov r2, r4, d6 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q0[0], r2 -; CHECK-NEXT: vmov r2, r6, d5 -; CHECK-NEXT: vmov.8 q0[1], r4 -; CHECK-NEXT: ldrb r4, [r2] -; CHECK-NEXT: ldrb r2, [r6] -; CHECK-NEXT: vmov r6, r7, d7 +; CHECK-NEXT: ldrb r1, [r5] +; CHECK-NEXT: vmov r4, r5, d6 ; CHECK-NEXT: ldrb r0, [r6] ; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r4, r5, d5 ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, r5, d2 +; CHECK-NEXT: vmov r0, r2, d2 ; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.8 q0[5], r5 -; CHECK-NEXT: vmov r0, r5, d3 -; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[5], r2 +; CHECK-NEXT: vmov r0, r2, d3 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.8 q0[7], r5 -; CHECK-NEXT: vmov r0, r5, d4 +; CHECK-NEXT: vmov.8 q0[7], r2 +; CHECK-NEXT: vmov r0, r2, d4 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[9], r2 ; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: vmov.8 q0[11], r2 -; CHECK-NEXT: vmov.8 q0[12], r1 -; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r3 +; CHECK-NEXT: vmov.8 q0[13], r1 ; CHECK-NEXT: vmov.8 q0[14], r12 ; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} @@ -165,35 +165,35 @@ define arm_aapcs_vfpcc <16 x i8> @gather_inc_minipred_16i8(ptr noalias nocapture ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: movs r1, #16 +; 
CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vadd.i32 q1, q1, r1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vadd.i32 q2, q2, r1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vadd.i32 q1, q3, r0 ; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vadd.i32 q1, q1, r1 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r4 ; CHECK-NEXT: vmov.8 q0[2], r5 ; CHECK-NEXT: vmov.8 q0[4], r12 -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vadd.i32 q1, q3, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.8 q0[6], r3 ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: vmov.8 q0[6], r2 ; CHECK-NEXT: vmov.8 q0[8], lr ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[10], r3 +; CHECK-NEXT: vmov.8 q0[10], r2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[12], r0 ; CHECK-NEXT: vmov.8 q0[14], r1 @@ -577,22 +577,22 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado ; CHECK-NEXT: vldrh.s32 q2, [r2] ; CHECK-NEXT: vshl.i32 q2, q2, #1 ; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r2, r11, d4 ; CHECK-NEXT: vmov r1, r10, d5 ; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh.w r2, [r10] -; CHECK-NEXT: ldrh.w r10, [r3] -; CHECK-NEXT: vmov r3, r11, d4 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh.w r11, [r11] -; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q2[1], r11 +; CHECK-NEXT: ldrh.w r10, [r10] ; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.16 q2[3], r2 ; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: vmov.16 q2[4], r10 +; CHECK-NEXT: vmov.16 q2[3], r10 +; CHECK-NEXT: vmov.16 q2[4], r3 ; CHECK-NEXT: vmov.16 q2[5], r4 ; CHECK-NEXT: vmov.16 q2[6], r7 ; CHECK-NEXT: vmov.16 q2[7], r5 @@ -658,8 +658,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #136 -; CHECK-NEXT: sub sp, #136 +; CHECK-NEXT: .pad #144 +; CHECK-NEXT: sub sp, #144 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: str r1, [sp, #64] @ 4-byte Spill ; CHECK-NEXT: mov r1, r2 @@ -674,131 +674,135 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: subs r1, #8 ; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q2, #0x18 +; CHECK-NEXT: add r3, sp, #128 +; CHECK-NEXT: add r7, sp, #96 ; CHECK-NEXT: add.w r1, r2, r1, lsr #3 ; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill ; CHECK-NEXT: adr r1, .LCPI12_0 ; CHECK-NEXT: adr r2, .LCPI12_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov.i16 q2, #0x18 ; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill ; 
CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: add r2, sp, #120 +; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill ; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: add.w r10, sp, #104 ; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload +; CHECK-NEXT: ldr.w r12, [sp, #64] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload ; CHECK-NEXT: .LBB12_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q5, [r2] -; CHECK-NEXT: mov r8, r2 -; CHECK-NEXT: vldrh.s32 q0, [r2, #8] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vldrh.s32 q0, [r2] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: vmov r6, r2, d4 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh.w r12, [r4] -; CHECK-NEXT: add r4, sp, #88 -; CHECK-NEXT: ldrh.w r11, [r5] +; CHECK-NEXT: vstrw.32 q5, [r3] +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: vldrh.s32 q1, [r3, #8] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r4 +; CHECK-NEXT: vmov r1, r2, d2 +; CHECK-NEXT: vmov r5, r6, d3 +; CHECK-NEXT: vldrh.s32 q1, [r3] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q2, q1, r4 +; CHECK-NEXT: vmov r3, r9, d4 +; CHECK-NEXT: ldrh r0, [r2] +; CHECK-NEXT: ldrh.w r11, [r1] +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: str r0, [sp, #92] @ 4-byte Spill +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: ldrh.w r10, [r5] +; CHECK-NEXT: ldrh r2, [r6] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r5, [r6] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vstrw.32 q6, [r4] -; CHECK-NEXT: vldrh.s32 q0, [r4] -; CHECK-NEXT: vmov.16 q7[0], r5 -; CHECK-NEXT: vmov.16 q7[1], r2 -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, r9, d0 -; CHECK-NEXT: vmov r2, r5, d1 -; CHECK-NEXT: vldrh.s32 q0, [r4, #8] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: ldrh.w r5, [r9] +; CHECK-NEXT: vstrw.32 q6, [r7] +; CHECK-NEXT: vldrh.s32 q1, [r7] +; CHECK-NEXT: vmov.16 q7[0], r3 +; CHECK-NEXT: vmov.16 q7[1], r5 +; CHECK-NEXT: vldrh.s32 q3, [r1, #8] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r4 +; CHECK-NEXT: vshl.i32 q3, q3, #1 +; CHECK-NEXT: vmov r6, r7, d2 +; CHECK-NEXT: vadd.i32 q3, q3, r4 +; CHECK-NEXT: vmov r3, r5, d3 ; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: vmov.16 q1[0], r6 -; CHECK-NEXT: ldrh.w r6, [r9] +; CHECK-NEXT: ldrh r6, [r7] ; CHECK-NEXT: ldrh r5, [r5] ; CHECK-NEXT: vmov.16 q1[1], r6 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmov r3, r6, d6 ; CHECK-NEXT: vmov.16 q1[3], r5 -; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, r5, d1 +; CHECK-NEXT: vmov.16 q1[4], r3 +; CHECK-NEXT: vmov r3, r5, d7 ; CHECK-NEXT: vmov.16 q1[5], r6 -; CHECK-NEXT: mov r6, r10 -; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vstrw.32 q4, [r10] -; CHECK-NEXT: 
vldrh.s32 q0, [r6] -; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vstrw.32 q4, [r0] +; CHECK-NEXT: vldrh.s32 q3, [r0] +; CHECK-NEXT: vmov.16 q1[6], r3 ; CHECK-NEXT: vmov.16 q1[7], r5 +; CHECK-NEXT: vshl.i32 q3, q3, #1 +; CHECK-NEXT: vadd.i32 q0, q3, r4 +; CHECK-NEXT: vmov r3, r5, d0 +; CHECK-NEXT: vmov r6, r7, d1 +; CHECK-NEXT: vldrh.s32 q0, [r0, #8] +; CHECK-NEXT: ldr r0, [sp, #92] @ 4-byte Reload ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, r5, d0 -; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vadd.i32 q0, q0, r4 +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.16 q3[0], r3 +; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: vmov.16 q3[1], r5 -; CHECK-NEXT: vmov r2, r5, d5 +; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: vmov.16 q3[2], r6 +; CHECK-NEXT: vmov r3, r5, d5 +; CHECK-NEXT: vmov.16 q3[3], r7 +; CHECK-NEXT: vmov r6, r7, d0 ; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload ; CHECK-NEXT: vadd.i16 q6, q6, q2 ; CHECK-NEXT: vadd.i16 q5, q5, q2 ; CHECK-NEXT: vadd.i16 q4, q4, q2 -; CHECK-NEXT: ldrh.w r9, [r2] -; CHECK-NEXT: vmov r2, r4, d1 -; CHECK-NEXT: vldrh.s32 q0, [r6, #8] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: vmov.16 q7[2], r3 +; CHECK-NEXT: vmov.16 q3[4], r6 ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q7[2], r9 -; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov.16 q3[5], r7 +; CHECK-NEXT: vmov r6, r7, d1 ; CHECK-NEXT: vmov.16 q7[3], r5 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.16 q7[4], r1 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov.16 q7[6], r12 -; CHECK-NEXT: vmov.16 q7[7], r11 -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.16 q3[3], r4 -; CHECK-NEXT: vmov r2, r4, d0 -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.16 q3[5], r4 -; CHECK-NEXT: vmov r2, r4, d1 -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: vmov.16 q3[7], r4 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: vmov.16 q7[4], r11 +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmov.16 q7[6], r10 +; CHECK-NEXT: vmov.16 q7[7], r2 +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: vmov.16 q3[6], r6 +; CHECK-NEXT: vmov.16 q3[7], r7 +; CHECK-NEXT: mov r7, r1 ; CHECK-NEXT: vadd.i16 q0, q3, q1 ; CHECK-NEXT: vadd.i16 q0, q0, q7 -; CHECK-NEXT: vstrb.8 q0, [r7], #16 +; CHECK-NEXT: vstrb.8 q0, [r12], #16 ; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r3 +; CHECK-NEXT: ldr r2, [sp, #68] @ 4-byte Reload +; CHECK-NEXT: cmp r1, r2 ; CHECK-NEXT: bne.w .LBB12_2 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #136 +; CHECK-NEXT: add sp, #144 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -886,167 +890,159 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB13_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r1, .LCPI13_0 -; CHECK-NEXT: adr r6, .LCPI13_8 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr 
r1, .LCPI13_1 -; CHECK-NEXT: adr r7, .LCPI13_7 -; CHECK-NEXT: adr r3, .LCPI13_6 +; CHECK-NEXT: adr r3, .LCPI13_0 +; CHECK-NEXT: adr r7, .LCPI13_8 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: adr r3, .LCPI13_1 +; CHECK-NEXT: adr r6, .LCPI13_7 +; CHECK-NEXT: adr r5, .LCPI13_6 ; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI13_5 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: adr r3, .LCPI13_5 ; CHECK-NEXT: bic r10, r2, #7 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI13_9 -; CHECK-NEXT: vmov.i32 q2, #0x30 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: adr r7, .LCPI13_9 +; CHECK-NEXT: vmov.i32 q6, #0x30 +; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vstrw.32 q6, [sp, #120] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB13_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: adr r1, .LCPI13_3 -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI13_4 -; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vstrw.32 q7, [sp, #296] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: adr r1, .LCPI13_2 -; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vstrw.32 q7, [sp, #280] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [r1] ; CHECK-NEXT: adr r1, .LCPI13_10 -; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vstrw.32 q7, [sp, #248] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: adr r1, .LCPI13_11 -; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [r1] -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill +; CHECK-NEXT: ldr.w r9, [sp, #116] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q7, [sp, #232] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload ; CHECK-NEXT: mov r11, r10 -; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill +; 
CHECK-NEXT: vstrw.32 q7, [sp, #216] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill ; CHECK-NEXT: .LBB13_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill -; CHECK-NEXT: vmov r1, lr, d8 -; CHECK-NEXT: vadd.i32 q7, q7, r0 -; CHECK-NEXT: vmov r5, r4, d15 +; CHECK-NEXT: vadd.i32 q7, q5, r0 ; CHECK-NEXT: vadd.i32 q6, q0, r0 -; CHECK-NEXT: vmov r6, r7, d13 -; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov q3, q5 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill +; CHECK-NEXT: vmov r1, r3, d14 +; CHECK-NEXT: vstrw.32 q2, [sp, #168] @ 16-byte Spill +; CHECK-NEXT: vmov r4, r12, d15 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r5, r6, d13 +; CHECK-NEXT: vstrw.32 q0, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: vmov lr, r8, d4 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vadd.i32 q1, q4, r0 +; CHECK-NEXT: vstrw.32 q4, [sp, #152] @ 16-byte Spill ; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vldrw.u32 q4, [sp, #232] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #136] @ 16-byte Spill ; CHECK-NEXT: subs.w r11, r11, #16 -; CHECK-NEXT: ldrb.w r9, [r1] -; CHECK-NEXT: vmov r1, r3, d14 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: vmov.8 q7[0], r1 ; CHECK-NEXT: ldrb r1, [r3] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q7[1], r1 ; CHECK-NEXT: vmov r1, r3, d12 -; CHECK-NEXT: vmov.8 q7[2], r5 -; CHECK-NEXT: ldrb r5, [r6] -; CHECK-NEXT: ldrb r6, [r4] -; CHECK-NEXT: vmov.8 q7[3], r6 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q7[2], r4 +; CHECK-NEXT: ldrb.w r7, [lr] +; CHECK-NEXT: ldrb.w r4, [r12] +; CHECK-NEXT: vmov.8 q7[3], r4 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q6[0], r1 -; CHECK-NEXT: vmov r6, r1, d2 +; CHECK-NEXT: vmov r1, r4, d2 ; CHECK-NEXT: vmov.8 q6[1], r3 ; CHECK-NEXT: vmov.8 q6[2], r5 -; CHECK-NEXT: vmov.8 q6[3], r7 -; CHECK-NEXT: ldrb.w r7, [lr] -; CHECK-NEXT: vmov.8 q6[4], r9 -; CHECK-NEXT: vmov.8 q6[5], r7 -; CHECK-NEXT: ldrb r4, [r1] -; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, r3, d9 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[6], r1 -; CHECK-NEXT: vmov r1, r7, d0 -; CHECK-NEXT: vmov.8 q6[7], r3 +; CHECK-NEXT: ldrb.w r5, [r8] +; CHECK-NEXT: vmov.8 q6[3], r6 +; CHECK-NEXT: vmov.8 q6[4], r7 +; CHECK-NEXT: vmov r7, r6, d5 +; CHECK-NEXT: vmov.8 q6[5], r5 +; CHECK-NEXT: vldrw.u32 q2, [sp, #248] @ 16-byte Reload +; CHECK-NEXT: ldrb r3, [r4] +; CHECK-NEXT: vmov r4, r12, d3 ; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vldrw.u32 q1, [sp, #216] @ 16-byte Reload ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q7[4], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[5], r7 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; 
CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q7[6], r1 -; CHECK-NEXT: ldrb r1, [r6] -; CHECK-NEXT: vmov r7, r6, d0 -; CHECK-NEXT: vmov.8 q7[7], r3 -; CHECK-NEXT: vmov r3, lr, d1 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q6[6], r7 +; CHECK-NEXT: vmov r7, r5, d0 +; CHECK-NEXT: vmov.8 q6[7], r6 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q7[4], r7 +; CHECK-NEXT: vmov r7, r6, d1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[8], r1 +; CHECK-NEXT: vmov.8 q7[5], r5 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.8 q7[9], r4 -; CHECK-NEXT: vmov r4, r1, d0 -; CHECK-NEXT: vmov.8 q7[10], r12 -; CHECK-NEXT: vmov.8 q7[11], r5 ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q7[6], r7 +; CHECK-NEXT: vmov r7, r5, d0 +; CHECK-NEXT: vmov.8 q7[7], r6 +; CHECK-NEXT: ldrb.w r6, [r12] +; CHECK-NEXT: vmov.8 q7[8], r1 +; CHECK-NEXT: vmov r1, lr, d1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #296] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q7[9], r3 +; CHECK-NEXT: vmov.8 q7[10], r4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov.8 q7[11], r6 +; CHECK-NEXT: ldrb r3, [r5] +; CHECK-NEXT: vmov r5, r4, d0 +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[8], r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov.8 q6[9], r1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q6[10], r5 -; CHECK-NEXT: vmov.8 q6[11], r4 +; CHECK-NEXT: vmov.8 q6[8], r5 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: vmov.8 q6[9], r4 +; CHECK-NEXT: vadd.i32 q0, q3, r0 +; CHECK-NEXT: vldrw.u32 q3, [sp, #200] @ 16-byte Reload +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q6[10], r6 +; CHECK-NEXT: vmov.8 q6[11], r5 ; CHECK-NEXT: vmov.8 q6[12], r7 -; CHECK-NEXT: vmov.8 q6[13], r6 -; CHECK-NEXT: vmov.8 q6[14], r3 +; CHECK-NEXT: vmov.8 q6[13], r3 +; CHECK-NEXT: vmov.8 q6[14], r1 ; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[12], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[13], r1 ; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vadd.i32 q0, q2, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[14], r1 ; CHECK-NEXT: ldrb r1, [r3] @@ -1060,18 +1056,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: vmov.8 q7[0], r1 ; CHECK-NEXT: vmov.8 q7[1], r3 ; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q3, r0 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: 
vstrw.32 q3, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q4, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[2], r1 ; CHECK-NEXT: ldrb r1, [r3] @@ -1082,11 +1067,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[5], r1 ; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vadd.i32 q5, q5, q2 -; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q5, q5, q2 +; CHECK-NEXT: vadd.i32 q0, q1, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[6], r1 ; CHECK-NEXT: ldrb r1, [r3] @@ -1097,9 +1078,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[9], r1 ; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q4, r0 -; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q3, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[10], r1 ; CHECK-NEXT: ldrb r1, [r3] @@ -1115,11 +1094,34 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[15], r1 ; CHECK-NEXT: vadd.i8 q0, q6, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload -; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q7, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vldrw.u32 q6, [sp, #120] @ 16-byte Reload +; CHECK-NEXT: vstrb.8 q0, [r9], #16 +; CHECK-NEXT: vldrw.u32 q0, [sp, #184] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q2, q2, q6 +; CHECK-NEXT: vadd.i32 q1, q1, q6 +; CHECK-NEXT: vstrw.32 q2, [sp, #248] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #264] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #216] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q2, q2, q6 +; CHECK-NEXT: vadd.i32 q4, q4, q6 +; CHECK-NEXT: vadd.i32 q1, q1, q6 +; CHECK-NEXT: vadd.i32 q3, q3, q6 +; CHECK-NEXT: vstrw.32 q2, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #168] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #232] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [sp, #152] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #296] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #136] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q5, q5, q6 +; CHECK-NEXT: vadd.i32 q0, q0, q6 +; CHECK-NEXT: vadd.i32 q3, q3, q6 +; CHECK-NEXT: vadd.i32 q2, q2, q6 +; CHECK-NEXT: vadd.i32 q4, q4, q6 +; CHECK-NEXT: vadd.i32 q1, q1, q6 +; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill ; CHECK-NEXT: bne.w .LBB13_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 @@ -1249,89 +1251,91 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: blt.w .LBB14_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader ; CHECK-NEXT: adr r5, .LCPI14_3 -; CHECK-NEXT: adr r7, .LCPI14_1 +; CHECK-NEXT: adr r4, .LCPI14_1 ; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: adr r3, .LCPI14_0 -; CHECK-NEXT: adr r6, .LCPI14_2 +; CHECK-NEXT: adr.w lr, .LCPI14_0 +; CHECK-NEXT: adr r3, .LCPI14_2 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; 
CHECK-NEXT: bic r9, r1, #7 -; CHECK-NEXT: vldrw.u32 q3, [r3] +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: bic r1, r1, #7 +; CHECK-NEXT: vldrw.u32 q3, [lr] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: mov.w lr, #16 -; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #52] @ 4-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB14_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 -; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: .LBB14_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vadd.i32 q1, q5, r0 ; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r7, r3, d3 +; CHECK-NEXT: vmov r7, r6, d3 ; CHECK-NEXT: vadd.i32 q6, q0, lr -; CHECK-NEXT: vmov r5, r6, d5 -; CHECK-NEXT: subs.w r9, r9, #16 -; CHECK-NEXT: vmov r4, r10, d2 +; CHECK-NEXT: vmov r9, r10, d2 ; CHECK-NEXT: vadd.i32 q1, q7, lr +; CHECK-NEXT: vmov r5, r8, d5 +; CHECK-NEXT: subs r4, #16 ; CHECK-NEXT: vadd.i32 q4, q4, lr ; CHECK-NEXT: vadd.i32 q5, q5, lr -; CHECK-NEXT: ldrb.w r11, [r3] -; CHECK-NEXT: ldrb r3, [r7] -; CHECK-NEXT: vmov r7, r12, d4 +; CHECK-NEXT: ldrb.w r11, [r6] +; CHECK-NEXT: ldrb.w r6, [r9] +; CHECK-NEXT: ldrb.w r9, [r10] +; CHECK-NEXT: vmov r10, r12, d4 ; CHECK-NEXT: vadd.i32 q2, q7, r0 ; CHECK-NEXT: vadd.i32 q7, q0, r0 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w r10, [r10] +; CHECK-NEXT: ldrb.w r8, [r8] ; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: ldrb.w r2, [r10] ; CHECK-NEXT: ldrb.w r1, [r12] -; CHECK-NEXT: vmov.8 q0[0], r7 +; CHECK-NEXT: vmov.8 q0[0], r2 +; CHECK-NEXT: vmov r10, r2, d15 ; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d15 ; CHECK-NEXT: vmov.8 q0[2], r5 -; CHECK-NEXT: vmov.8 q0[3], r6 -; CHECK-NEXT: vmov.8 q0[4], r4 -; CHECK-NEXT: vmov r4, r2, d4 -; CHECK-NEXT: vmov.8 q0[5], r10 -; CHECK-NEXT: vmov.8 q0[6], r3 +; CHECK-NEXT: vmov.8 q0[3], r8 +; CHECK-NEXT: vmov.8 q0[4], r6 +; CHECK-NEXT: vmov.8 q0[5], r9 +; CHECK-NEXT: vmov.8 q0[6], r7 +; CHECK-NEXT: vmov r7, r1, d14 ; CHECK-NEXT: vmov.8 q0[7], r11 -; CHECK-NEXT: ldrb r6, [r7] -; CHECK-NEXT: vmov r5, r7, d5 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r3, [r5] -; CHECK-NEXT: ldrb.w r12, [r7] -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d14 ; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r5, r2, d5 ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q0[8], r4 -; CHECK-NEXT: vmov.8 q0[9], r7 -; CHECK-NEXT: vmov.8 q0[10], r1 -; CHECK-NEXT: vmov.8 q0[11], r6 -; CHECK-NEXT: vmov.8 q0[12], r5 -; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r12 -; CHECK-NEXT: vstrb.8 q0, [r8], #16 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q0[8], r7 +; CHECK-NEXT: vmov.8 q0[9], r1 +; CHECK-NEXT: ldrb.w r8, [r2] +; CHECK-NEXT: vmov r6, r2, d4 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w r9, [r2] +; CHECK-NEXT: ldrb.w r2, [r10] +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q0[10], r2 +; 
CHECK-NEXT: vmov.8 q0[11], r12 +; CHECK-NEXT: vmov.8 q0[12], r6 +; CHECK-NEXT: vmov.8 q0[13], r9 +; CHECK-NEXT: vmov.8 q0[14], r5 +; CHECK-NEXT: vmov.8 q0[15], r8 +; CHECK-NEXT: vstrb.8 q0, [r3], #16 ; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: bne .LBB14_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 +; CHECK-NEXT: ldr r2, [sp, #52] @ 4-byte Reload ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload -; CHECK-NEXT: cmp r9, r1 +; CHECK-NEXT: cmp r2, r1 +; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: bne .LBB14_2 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #64 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll index b4aefeaa931157..d389aa6ba0e9be 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -52,16 +52,16 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(ptr %base, ptr %offptr) { ; CHECK-NEXT: vldrh.s32 q0, [r1, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, r12, d0 -; CHECK-NEXT: vmov r3, lr, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, lr, d1 ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh.w lr, [lr] ; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] @@ -72,8 +72,8 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(ptr %base, ptr %offptr) { ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.16 q0[3], r1 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r12 -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov.16 q0[6], r12 ; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -249,38 +249,38 @@ entry: define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep(ptr %base, ptr %offptr) { ; CHECK-LABEL: scaled_v8i16_i16_2gep: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrh.s32 q0, [r1, #8] ; CHECK-NEXT: mov.w r12, #40 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vadd.i32 q0, q0, r12 -; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: vmov r2, r4, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: vmov lr, r4, d1 ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vadd.i32 q0, q0, r12 ; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: ldrh r6, [r3] +; CHECK-NEXT: ldrh r5, [r2] +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldrh.w r12, [lr] -; CHECK-NEXT: ldrh.w lr, [r4] -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r3 ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov.16 q0[4], r3 -; CHECK-NEXT: vmov.16 q0[5], r12 -; CHECK-NEXT: vmov.16 
q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], lr -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov.16 q0[4], r6 +; CHECK-NEXT: vmov.16 q0[5], r5 +; CHECK-NEXT: vmov.16 q0[6], r12 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i16>, ptr %offptr, align 2 %ptrs = getelementptr inbounds i16, ptr %base, <8 x i16> %offs @@ -470,26 +470,26 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep5(<8 x ptr> %base) { ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: mov.w r12, #131072 -; CHECK-NEXT: vadd.i32 q0, q0, r12 ; CHECK-NEXT: vadd.i32 q1, q1, r12 -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: vmov r0, r12, d1 +; CHECK-NEXT: vadd.i32 q0, q0, r12 ; CHECK-NEXT: vmov r3, lr, d3 ; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov r0, r12, d1 ; CHECK-NEXT: ldrh r6, [r3] -; CHECK-NEXT: ldrh.w r3, [r12] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: ldrh r5, [r2] +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r4, [r1] +; CHECK-NEXT: ldrh.w r1, [r12] ; CHECK-NEXT: ldrh.w lr, [lr] -; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r4 +; CHECK-NEXT: vmov.16 q0[5], r5 ; CHECK-NEXT: vmov.16 q0[6], r6 ; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -609,16 +609,16 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_basei32(ptr %base, ptr %offpt ; CHECK-NEXT: vldrh.u32 q0, [r1, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, r12, d0 -; CHECK-NEXT: vmov r3, lr, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, lr, d1 ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh.w lr, [lr] ; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] @@ -629,8 +629,8 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_basei32(ptr %base, ptr %offpt ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.16 q0[3], r1 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r12 -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov.16 q0[6], r12 ; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll index 7580da3b89d26f..57bfbcc7bd361a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll @@ -23,26 +23,26 @@ define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16_noext(ptr %base, ptr %off ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrb.s32 q0, [r1, #4] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, lr, d1 ; CHECK-NEXT: vmov r12, r3, d0 +; CHECK-NEXT: vmov r2, lr, d1 ; CHECK-NEXT: vldrb.s32 q0, [r1] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov 
r4, r5, d0 ; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: ldrb r5, [r3] +; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: ldrb r6, [r2] ; CHECK-NEXT: ldrb.w r2, [r12] -; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: ldrb.w lr, [lr] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.16 q0[1], r5 ; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[1], r4 ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.16 q0[3], r1 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov.16 q0[5], r5 ; CHECK-NEXT: vmov.16 q0[6], r6 ; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: vmovlb.u8 q0, q0 @@ -63,16 +63,16 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(ptr %base, ptr %offptr) { ; CHECK-NEXT: vldrb.s32 q0, [r1, #4] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, r12, d0 -; CHECK-NEXT: vmov r3, lr, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, lr, d1 ; CHECK-NEXT: vldrb.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh.w lr, [lr] ; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] @@ -83,8 +83,8 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(ptr %base, ptr %offptr) { ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.16 q0[3], r1 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r12 -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov.16 q0[6], r12 ; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -103,16 +103,16 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_zext(ptr %base, ptr %offptr) { ; CHECK-NEXT: vldrb.u32 q0, [r1, #4] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, r12, d0 -; CHECK-NEXT: vmov r3, lr, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, lr, d1 ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh.w lr, [lr] ; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] @@ -123,8 +123,8 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_zext(ptr %base, ptr %offptr) { ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.16 q0[3], r1 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r12 -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov.16 q0[6], r12 ; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll index b4a2aa7a1ed1b4..e79df4531eaa32 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -63,30 +63,30 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_sext(ptr %base, ptr %offptr) { ; CHECK-NEXT: vldrb.s32 q0, [r1] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vmov r6, r7, d5 ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w 
r12, [r2] ; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: ldrb r3, [r4] -; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r4 -; CHECK-NEXT: vmov r4, r6, d3 ; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: ldrb r4, [r6] -; CHECK-NEXT: vmov r6, r7, d5 -; CHECK-NEXT: vldrb.s32 q2, [r1, #4] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: ldrb r0, [r6] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov r4, r5, d3 ; CHECK-NEXT: vmov.8 q0[2], r0 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[5], r1 ; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] @@ -98,10 +98,10 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_sext(ptr %base, ptr %offptr) { ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r1 -; CHECK-NEXT: vmov.8 q0[10], r5 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[13], r3 ; CHECK-NEXT: vmov.8 q0[14], r12 ; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} @@ -125,30 +125,30 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i16(ptr %base, ptr %offptr) { ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: vldrh.s32 q0, [r1, #16] +; CHECK-NEXT: vmov r6, r7, d5 ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w r12, [r2] ; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: ldrb r3, [r4] -; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r4 -; CHECK-NEXT: vmov r4, r6, d3 ; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: ldrb r4, [r6] -; CHECK-NEXT: vmov r6, r7, d5 -; CHECK-NEXT: vldrh.s32 q2, [r1, #8] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: ldrb r0, [r6] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov r4, r5, d3 ; CHECK-NEXT: vmov.8 q0[2], r0 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[5], r1 ; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] @@ -160,10 +160,10 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i16(ptr %base, ptr %offptr) { ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r1 -; CHECK-NEXT: vmov.8 q0[10], r5 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[13], r3 ; CHECK-NEXT: vmov.8 q0[14], r12 ; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} @@ -189,32 +189,32 @@ define arm_aapcs_vfpcc <16 x i8> 
@unscaled_v16i8_scaled(ptr %base, ptr %offptr) ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vmov r6, r7, d5 ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w r12, [r2] ; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: ldrb r3, [r4] -; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[0], r4 -; CHECK-NEXT: vmov r4, r6, d3 -; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: ldrb r4, [r6] -; CHECK-NEXT: vmov r6, r7, d5 ; CHECK-NEXT: vldrb.u32 q2, [r1, #4] +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vshl.i32 q2, q2, #2 ; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: ldrb r0, [r6] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r4, r5, d3 ; CHECK-NEXT: vmov.8 q0[2], r0 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[5], r1 ; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] @@ -226,10 +226,10 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_scaled(ptr %base, ptr %offptr) ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r1 -; CHECK-NEXT: vmov.8 q0[10], r5 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[13], r3 ; CHECK-NEXT: vmov.8 q0[14], r12 ; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} @@ -254,30 +254,30 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_next(ptr %base, ptr %offptr) ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: vldrw.u32 q0, [r1, #32] +; CHECK-NEXT: vmov r6, r7, d5 ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w r12, [r2] ; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: ldrb r3, [r4] -; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vldrw.u32 q2, [r1, #16] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r4 -; CHECK-NEXT: vmov r4, r6, d3 ; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: ldrb r4, [r6] -; CHECK-NEXT: vmov r6, r7, d5 -; CHECK-NEXT: vldrw.u32 q2, [r1, #16] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: ldrb r0, [r6] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov r4, r5, d3 ; CHECK-NEXT: vmov.8 q0[2], r0 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[5], r1 ; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] @@ -289,10 +289,10 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_next(ptr %base, ptr %offptr) ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r1 -; CHECK-NEXT: vmov.8 q0[10], r5 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 
q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[13], r3 ; CHECK-NEXT: vmov.8 q0[14], r12 ; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} @@ -309,41 +309,41 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep(ptr %base, ptr %offptr) ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vldrb.s32 q0, [r1, #12] -; CHECK-NEXT: movs r6, #5 +; CHECK-NEXT: movs r4, #5 ; CHECK-NEXT: vldrb.s32 q1, [r1, #8] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q0, q0, r6 +; CHECK-NEXT: vadd.i32 q0, q0, r4 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vadd.i32 q1, q1, r6 -; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vadd.i32 q1, q1, r4 +; CHECK-NEXT: vmov r5, r6, d0 ; CHECK-NEXT: vldrb.s32 q0, [r1] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q2, q0, r6 +; CHECK-NEXT: vadd.i32 q2, q0, r4 ; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: ldrb r3, [r4] ; CHECK-NEXT: ldrb.w r8, [r5] -; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: ldrb r3, [r6] +; CHECK-NEXT: vmov r5, r6, d4 ; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q0[0], r4 -; CHECK-NEXT: ldrb r4, [r5] -; CHECK-NEXT: vmov.8 q0[1], r4 -; CHECK-NEXT: vmov r4, r7, d3 -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: ldrb r4, [r7] ; CHECK-NEXT: vmov r7, r2, d5 ; CHECK-NEXT: vldrb.s32 q2, [r1, #4] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r6 +; CHECK-NEXT: vadd.i32 q2, q2, r4 +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r0, [r7] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: ldrb r5, [r6] ; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, r6, d3 ; CHECK-NEXT: vmov.8 q0[2], r0 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov.8 q0[3], r2 +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: vmov.8 q0[5], r1 ; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] @@ -356,9 +356,9 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep(ptr %base, ptr %offptr) ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r1 ; CHECK-NEXT: vmov.8 q0[10], r5 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r8 +; CHECK-NEXT: vmov.8 q0[11], r6 +; CHECK-NEXT: vmov.8 q0[12], r8 +; CHECK-NEXT: vmov.8 q0[13], r3 ; CHECK-NEXT: vmov.8 q0[14], r12 ; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} @@ -487,16 +487,16 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep3(ptr %base) { ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: vmov r4, r6, d3 -; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: ldrb.w r12, [r1] ; CHECK-NEXT: adr r1, .LCPI11_2 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb.w lr, [r2] ; CHECK-NEXT: ldrb r1, [r5] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r4, [r6] ; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r5 ; CHECK-NEXT: ldrb r5, [r6] @@ -523,12 +523,12 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep3(ptr %base) { ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 
q0[9], r5 -; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.8 q0[10], r2 +; CHECK-NEXT: vmov.8 q0[11], r4 ; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: vmov.8 q0[14], lr -; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: @@ -575,16 +575,16 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep4(ptr %base) { ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: vmov r4, r6, d3 -; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: ldrb.w r12, [r1] ; CHECK-NEXT: adr r1, .LCPI12_2 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb.w lr, [r2] ; CHECK-NEXT: ldrb r1, [r5] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r4, [r6] ; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r5 ; CHECK-NEXT: ldrb r5, [r6] @@ -611,12 +611,12 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep4(ptr %base) { ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r5 -; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.8 q0[10], r2 +; CHECK-NEXT: vmov.8 q0[11], r4 ; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: vmov.8 q0[14], lr -; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: @@ -650,54 +650,54 @@ entry: define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep5(<16 x ptr> %base) { ; CHECK-LABEL: unscaled_v16i8_i8_biggep5: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: mov.w r4, #256 -; CHECK-NEXT: vadd.i32 q3, q3, r4 -; CHECK-NEXT: vadd.i32 q2, q2, r4 -; CHECK-NEXT: vmov r3, r2, d7 -; CHECK-NEXT: vadd.i32 q1, q1, r4 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vadd.i32 q3, q0, r4 -; CHECK-NEXT: vmov r6, r7, d5 -; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: mov.w lr, #256 +; CHECK-NEXT: vadd.i32 q3, q3, lr +; CHECK-NEXT: vadd.i32 q2, q2, lr +; CHECK-NEXT: vmov r2, r0, d7 +; CHECK-NEXT: vadd.i32 q1, q1, lr +; CHECK-NEXT: vmov r4, r5, d5 +; CHECK-NEXT: vmov r1, r3, d6 +; CHECK-NEXT: vadd.i32 q3, q0, lr ; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: ldrb r1, [r6] -; CHECK-NEXT: vmov r2, r6, d6 -; CHECK-NEXT: ldrb r5, [r0] -; CHECK-NEXT: ldrb r0, [r7] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: vmov.8 q0[0], r2 -; CHECK-NEXT: ldrb r2, [r6] -; CHECK-NEXT: vmov.8 q0[1], r2 -; CHECK-NEXT: vmov r2, r6, d7 -; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb r0, [r4] +; CHECK-NEXT: ldrb r4, [r5] +; CHECK-NEXT: vmov r5, r6, d6 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: ldrb r5, [r6] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, r6, d7 +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vmov.8 q0[2], r2 -; CHECK-NEXT: vmov r2, r4, d2 +; CHECK-NEXT: vmov.8 q0[2], r5 ; CHECK-NEXT: vmov.8 q0[3], r6 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, 
[r4] -; CHECK-NEXT: vmov.8 q0[4], r2 -; CHECK-NEXT: vmov.8 q0[5], r4 -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q0[6], r2 -; CHECK-NEXT: vmov.8 q0[7], r4 -; CHECK-NEXT: vmov r2, r4, d4 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q0[8], r2 -; CHECK-NEXT: vmov.8 q0[9], r4 -; CHECK-NEXT: vmov.8 q0[10], r1 -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.8 q0[12], r5 +; CHECK-NEXT: vmov r6, r5, d2 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[4], r6 +; CHECK-NEXT: vmov.8 q0[5], r5 +; CHECK-NEXT: vmov r6, r5, d3 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[6], r6 +; CHECK-NEXT: vmov.8 q0[7], r5 +; CHECK-NEXT: vmov r6, r5, d4 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[10], r0 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], r1 ; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], lr -; CHECK-NEXT: vmov.8 q0[15], r12 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %ptrs2 = getelementptr inbounds i8, <16 x ptr> %base, i32 256 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) @@ -720,16 +720,16 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep6(ptr %base) { ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: vmov r4, r6, d3 -; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: ldrb.w r12, [r1] ; CHECK-NEXT: adr r1, .LCPI14_2 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb.w lr, [r2] ; CHECK-NEXT: ldrb r1, [r5] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r4, [r6] ; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r5 ; CHECK-NEXT: ldrb r5, [r6] @@ -756,12 +756,12 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep6(ptr %base) { ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r5 -; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.8 q0[10], r2 +; CHECK-NEXT: vmov.8 q0[11], r4 ; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: vmov.8 q0[14], lr -; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: @@ -808,16 +808,16 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep7(ptr %base) { ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: vmov r4, r6, d3 -; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: ldrb.w r12, [r1] ; CHECK-NEXT: adr r1, .LCPI15_2 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb.w lr, [r2] ; CHECK-NEXT: ldrb r1, [r5] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r4, [r6] ; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r5 ; CHECK-NEXT: ldrb r5, [r6] @@ -844,12 +844,12 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep7(ptr 
%base) { ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r5 -; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.8 q0[10], r2 +; CHECK-NEXT: vmov.8 q0[11], r4 ; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: vmov.8 q0[14], lr -; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: @@ -893,30 +893,30 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2(ptr %base, ptr %offptr) { ; CHECK-NEXT: vldrb.s32 q0, [r1] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vmov r6, r7, d5 ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w r12, [r2] ; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: ldrb r3, [r4] -; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r4 -; CHECK-NEXT: vmov r4, r6, d3 ; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: ldrb r4, [r6] -; CHECK-NEXT: vmov r6, r7, d5 -; CHECK-NEXT: vldrb.s32 q2, [r1, #4] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: ldrb r0, [r6] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov r4, r5, d3 ; CHECK-NEXT: vmov.8 q0[2], r0 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[5], r1 ; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] @@ -928,10 +928,10 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2(ptr %base, ptr %offptr) { ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r1 -; CHECK-NEXT: vmov.8 q0[10], r5 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[13], r3 ; CHECK-NEXT: vmov.8 q0[14], r12 ; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} @@ -989,32 +989,32 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_basei16(ptr %base, ptr %offptr) ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vmov r6, r7, d5 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w r12, [r2] ; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: ldrb r3, [r4] -; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[0], r4 -; CHECK-NEXT: vmov r4, r6, d3 -; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: ldrb r4, [r6] -; CHECK-NEXT: vmov r6, r7, d5 ; CHECK-NEXT: vldrb.u32 q2, [r1, #4] +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vshl.i32 q2, q2, #1 ; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: ldrb r0, [r6] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r4, r5, d3 ; CHECK-NEXT: vmov.8 q0[2], r0 ; 
CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[5], r1 ; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] @@ -1026,10 +1026,10 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_basei16(ptr %base, ptr %offptr) ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r1 -; CHECK-NEXT: vmov.8 q0[10], r5 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[13], r3 ; CHECK-NEXT: vmov.8 q0[14], r12 ; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} @@ -1090,16 +1090,16 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_rangebad(ptr %base, ptr %off ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: vmov r4, r6, d3 -; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: ldrb.w r12, [r1] ; CHECK-NEXT: adr r1, .LCPI20_2 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb.w lr, [r2] ; CHECK-NEXT: ldrb r1, [r5] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: ldrb r2, [r4] +; CHECK-NEXT: ldrb r4, [r6] ; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r5 ; CHECK-NEXT: ldrb r5, [r6] @@ -1126,12 +1126,12 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_rangebad(ptr %base, ptr %off ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[8], r0 ; CHECK-NEXT: vmov.8 q0[9], r5 -; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.8 q0[10], r2 +; CHECK-NEXT: vmov.8 q0[11], r4 ; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: vmov.8 q0[14], lr -; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll index b45cca7e1b4c58..629f4c24c2cca1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -38,20 +38,20 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(ptr %offptr) { ; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: vmov r3, r12, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov r0, lr, d1 -; CHECK-NEXT: ldr r7, [r2] -; CHECK-NEXT: vmov r2, r4, d0 -; CHECK-NEXT: ldr r6, [r1] +; CHECK-NEXT: ldr r7, [r1] ; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr.w r1, [r12] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r6 -; CHECK-NEXT: ldr.w r5, [lr] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r7 ; CHECK-NEXT: ldr r2, [r2] ; CHECK-NEXT: ldr r4, [r4] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r7 +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: ldr.w r1, [r12] +; CHECK-NEXT: ldr.w r6, [lr] +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: ldr r5, [r5] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x ptr>, ptr %offptr, align 4 @@ -62,45 +62,45 @@ entry: define arm_aapcs_vfpcc <16 x i32> 
@ptr_v16i32(ptr %offptr) { ; CHECK-LABEL: ptr_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: vmov r3, r12, d0 +; CHECK-NEXT: vmov r1, lr, d1 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: ldr r7, [r2] -; CHECK-NEXT: vmov r2, r6, d0 -; CHECK-NEXT: ldr.w r12, [r1] +; CHECK-NEXT: vmov r2, r4, d1 +; CHECK-NEXT: vmov r6, r7, d0 ; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: ldr r4, [r4] -; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r12 +; CHECK-NEXT: ldr.w r8, [r1] ; CHECK-NEXT: ldr.w r1, [lr] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r7 ; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r8 ; CHECK-NEXT: ldr r6, [r6] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 -; CHECK-NEXT: vmov r6, r5, d2 +; CHECK-NEXT: ldr r4, [r4] +; CHECK-NEXT: ldr r7, [r7] +; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 +; CHECK-NEXT: vmov r2, r6, d3 +; CHECK-NEXT: ldr.w r5, [r12] +; CHECK-NEXT: vmov q0[3], q0[1], r7, r4 +; CHECK-NEXT: vmov r7, r4, d2 +; CHECK-NEXT: vmov q3[3], q3[1], r5, r1 ; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: ldr r6, [r6] -; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q1[2], q1[0], r6, r2 +; CHECK-NEXT: ldr r7, [r7] +; CHECK-NEXT: vmov q1[2], q1[0], r7, r2 +; CHECK-NEXT: ldr r7, [r6] ; CHECK-NEXT: ldr r6, [r4] ; CHECK-NEXT: vmov r0, r2, d5 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r6 -; CHECK-NEXT: vmov r6, r5, d4 +; CHECK-NEXT: vmov q1[3], q1[1], r6, r7 +; CHECK-NEXT: vmov r7, r6, d4 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r6, [r6] ; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q2[2], q2[0], r6, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r2 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: ldr r7, [r7] +; CHECK-NEXT: ldr r6, [r6] +; CHECK-NEXT: vmov q2[2], q2[0], r7, r0 +; CHECK-NEXT: vmov q2[3], q2[1], r6, r2 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %offs = load <16 x ptr>, ptr %offptr, align 4 %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %offs, i32 4, <16 x i1> , <16 x i32> undef) @@ -140,17 +140,17 @@ define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(ptr %offptr) { ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r12, r2, d1 -; CHECK-NEXT: vmov lr, r1, d0 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r12, r3, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: vmov lr, r0, d1 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vldr s3, [r2] -; CHECK-NEXT: vldr s2, [r12] -; CHECK-NEXT: vldr s1, [r1] -; CHECK-NEXT: vldr s0, [lr] -; CHECK-NEXT: vldr s7, [r3] -; CHECK-NEXT: vldr s6, [r0] +; CHECK-NEXT: vldr s2, [r1] +; CHECK-NEXT: vldr s1, [r3] +; CHECK-NEXT: vldr s0, [r12] +; CHECK-NEXT: vldr s7, [r0] +; CHECK-NEXT: vldr s6, [lr] ; CHECK-NEXT: vldr s5, [r5] ; CHECK-NEXT: vldr s4, [r4] ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -168,25 +168,25 @@ define arm_aapcs_vfpcc <8 x i16> @ptr_i16(ptr %offptr) { ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: vmov r3, 
r12, d1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov r0, lr, d1 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r6, [r3] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrh r5, [r2] +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: ldrh r4, [r1] ; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh.w r3, [lr] -; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: ldrh.w r1, [lr] ; CHECK-NEXT: ldrh.w r12, [r12] -; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r4 +; CHECK-NEXT: vmov.16 q0[5], r5 ; CHECK-NEXT: vmov.16 q0[6], r6 ; CHECK-NEXT: vmov.16 q0[7], r12 ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -284,27 +284,27 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(ptr %offptr) { ; CHECK-NEXT: vmov r12, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov lr, r0, d1 +; CHECK-NEXT: ldrh r6, [r3] +; CHECK-NEXT: ldrh r5, [r2] +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldrh r7, [r1] +; CHECK-NEXT: ldrh.w r4, [lr] +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh.w r1, [r12] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r4, [r0] -; CHECK-NEXT: vmov r0, r5, d0 -; CHECK-NEXT: ldrh.w r6, [lr] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r6 -; CHECK-NEXT: vmov.16 q0[3], r4 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov.16 q0[2], r4 +; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[5], r5 +; CHECK-NEXT: vmov.16 q0[6], r6 ; CHECK-NEXT: vmov.16 q0[7], r7 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrh.s32 q0, [r0] -; CHECK-NEXT: vldrh.s32 q1, [r0, #8] +; CHECK-NEXT: vstrw.32 q0, [r2] +; CHECK-NEXT: vldrh.s32 q0, [r2] +; CHECK-NEXT: vldrh.s32 q1, [r2, #8] ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: @@ -326,27 +326,27 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(ptr %offptr) { ; CHECK-NEXT: vmov r12, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov lr, r0, d1 +; CHECK-NEXT: ldrh r6, [r3] +; CHECK-NEXT: ldrh r5, [r2] +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldrh r7, [r1] +; CHECK-NEXT: ldrh.w r4, [lr] +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh.w r1, [r12] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r4, [r0] -; CHECK-NEXT: vmov r0, r5, d0 -; CHECK-NEXT: ldrh.w r6, [lr] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r6 -; CHECK-NEXT: vmov.16 q0[3], r4 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov.16 q0[2], r4 +; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[5], r5 +; CHECK-NEXT: vmov.16 q0[6], r6 ; CHECK-NEXT: vmov.16 q0[7], r7 -; CHECK-NEXT: 
vstrw.32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vstrw.32 q0, [r2] +; CHECK-NEXT: vldrh.u32 q0, [r2] +; CHECK-NEXT: vldrh.u32 q1, [r2, #8] ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: @@ -417,45 +417,45 @@ define arm_aapcs_vfpcc <16 x i8> @ptr_i8(ptr %offptr) { ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: vmov r6, r7, d4 -; CHECK-NEXT: vmov r4, r3, d1 +; CHECK-NEXT: vmov r3, r4, d1 ; CHECK-NEXT: ldrb r5, [r1] ; CHECK-NEXT: ldrb r1, [r2] ; CHECK-NEXT: ldrb r2, [r6] -; CHECK-NEXT: ldrb.w r12, [r3] +; CHECK-NEXT: ldrb.w lr, [r4] +; CHECK-NEXT: ldrb r4, [r7] ; CHECK-NEXT: vmov.8 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r3] ; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: ldrb.w lr, [r4] -; CHECK-NEXT: ldrb r4, [r2] -; CHECK-NEXT: ldrb r2, [r3] -; CHECK-NEXT: ldrb r3, [r7] -; CHECK-NEXT: vmov.8 q0[1], r3 -; CHECK-NEXT: vmov r3, r6, d5 +; CHECK-NEXT: vmov.8 q0[1], r4 +; CHECK-NEXT: vmov r4, r6, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vmov.8 q0[2], r3 -; CHECK-NEXT: vmov r0, r3, d4 +; CHECK-NEXT: vmov.8 q0[2], r4 +; CHECK-NEXT: vmov r0, r4, d4 ; CHECK-NEXT: vmov.8 q0[3], r6 -; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.8 q0[5], r3 -; CHECK-NEXT: vmov r0, r3, d5 +; CHECK-NEXT: vmov.8 q0[5], r4 +; CHECK-NEXT: vmov r0, r4, d5 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.8 q0[7], r3 -; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: vmov.8 q0[7], r4 +; CHECK-NEXT: vmov r0, r4, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.8 q0[9], r3 -; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.8 q0[9], r4 +; CHECK-NEXT: vmov.8 q0[10], r2 +; CHECK-NEXT: vmov.8 q0[11], r3 ; CHECK-NEXT: vmov.8 q0[12], r5 ; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: vmov.8 q0[14], lr -; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x ptr>, ptr %offptr, align 4 @@ -472,23 +472,23 @@ define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_sext16(ptr %offptr) { ; CHECK-NEXT: vmov r3, r1, d1 ; CHECK-NEXT: vmov r12, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov lr, r0, d1 +; CHECK-NEXT: ldrb r6, [r3] +; CHECK-NEXT: ldrb r5, [r2] +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldrb r7, [r1] +; CHECK-NEXT: ldrb.w r4, [lr] +; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb.w r1, [r12] ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: ldrb.w r6, [lr] -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.16 q0[2], r6 ; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov.16 q0[2], r4 ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[5], r5 +; CHECK-NEXT: vmov.16 q0[6], r6 ; CHECK-NEXT: vmov.16 q0[7], r7 ; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: pop {r4, r5, 
r6, r7, pc} @@ -508,23 +508,23 @@ define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_zext16(ptr %offptr) { ; CHECK-NEXT: vmov r3, r1, d1 ; CHECK-NEXT: vmov r12, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov lr, r0, d1 +; CHECK-NEXT: ldrb r6, [r3] +; CHECK-NEXT: ldrb r5, [r2] +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldrb r7, [r1] +; CHECK-NEXT: ldrb.w r4, [lr] +; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb.w r1, [r12] ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: ldrb.w r6, [lr] -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.16 q0[2], r6 ; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov.16 q0[2], r4 ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[5], r5 +; CHECK-NEXT: vmov.16 q0[6], r6 ; CHECK-NEXT: vmov.16 q0[7], r7 ; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} @@ -541,25 +541,25 @@ define arm_aapcs_vfpcc <8 x i8> @ptr_v8i8(ptr %offptr) { ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: vmov r3, r12, d1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov r0, lr, d1 -; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r6, [r3] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrb r5, [r2] +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: ldrb r4, [r1] ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrb.w r3, [lr] -; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: ldrb.w r1, [lr] ; CHECK-NEXT: ldrb.w r12, [r12] -; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r4 +; CHECK-NEXT: vmov.16 q0[5], r5 ; CHECK-NEXT: vmov.16 q0[6], r6 ; CHECK-NEXT: vmov.16 q0[7], r12 ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -619,22 +619,22 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(ptr %offptr) { ; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: vmov r3, r12, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov r0, lr, d1 -; CHECK-NEXT: ldrb r7, [r2] -; CHECK-NEXT: vmov r2, r4, d0 -; CHECK-NEXT: ldrb r6, [r1] +; CHECK-NEXT: ldrb r7, [r1] ; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r7 ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb.w r1, [r12] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r6 -; CHECK-NEXT: ldrb.w r5, [lr] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r7 +; CHECK-NEXT: ldrb.w r6, [lr] +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r2 ; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 ; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: pop {r4, r5, 
r6, r7, pc} @@ -655,20 +655,20 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(ptr %offptr) { ; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: vmov r12, r3, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vmov r0, lr, d1 ; CHECK-NEXT: ldrb r7, [r2] -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb.w r2, [r12] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: ldrb r6, [r1] +; CHECK-NEXT: ldrb.w r1, [r12] ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb.w r6, [lr] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov q2[2], q2[0], r1, r6 +; CHECK-NEXT: ldrb.w r5, [lr] ; CHECK-NEXT: vmov q2[3], q2[1], r3, r7 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vand q1, q2, q1 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll index 8013e1d639715f..c7b3d105cb6138 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll @@ -47,15 +47,15 @@ define arm_aapcs_vfpcc void @unscaled_i32_i8_scatter(ptr %base, ptr %offptr, <4 ; NOGATSCAT-NEXT: .save {r4, r5, r7, lr} ; NOGATSCAT-NEXT: push {r4, r5, r7, lr} ; NOGATSCAT-NEXT: vldrb.u32 q1, [r1] -; NOGATSCAT-NEXT: vmov r1, r3, d0 +; NOGATSCAT-NEXT: vmov r3, r1, d0 ; NOGATSCAT-NEXT: vmov r4, r5, d1 ; NOGATSCAT-NEXT: vadd.i32 q1, q1, r0 -; NOGATSCAT-NEXT: vmov r0, r12, d2 -; NOGATSCAT-NEXT: vmov r2, lr, d3 -; NOGATSCAT-NEXT: str r1, [r0] -; NOGATSCAT-NEXT: str.w r3, [r12] +; NOGATSCAT-NEXT: vmov r0, lr, d2 +; NOGATSCAT-NEXT: vmov r2, r12, d3 +; NOGATSCAT-NEXT: str r3, [r0] +; NOGATSCAT-NEXT: str.w r1, [lr] ; NOGATSCAT-NEXT: str r4, [r2] -; NOGATSCAT-NEXT: str.w r5, [lr] +; NOGATSCAT-NEXT: str.w r5, [r12] ; NOGATSCAT-NEXT: pop {r4, r5, r7, pc} ; ; NOMVE-LABEL: unscaled_i32_i8_scatter: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index a89d3522ca5f27..4affb19ced08d8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %d ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: movs r5, #18 -; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w r12, #9 ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vdup.32 q2, r5 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vadd.i32 q3, q2, r4 -; CHECK-NEXT: vmla.i32 q4, q2, lr +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vadd.i32 q3, q0, r4 +; CHECK-NEXT: vmla.i32 q4, q0, lr ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrw.u32 q5, [q4, #24] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmla.i32 q4, q2, r12 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmla.i32 q4, q0, r12 +; CHECK-NEXT: vmov q0, q3 ; CHECK-NEXT: vstrb.8 q5, [r1], #16 ; CHECK-NEXT: vstrw.32 q4, [r3] ; CHECK-NEXT: bne .LBB8_1 @@ 
-608,11 +608,11 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r0, r3 ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne r0, [sp, #136] -; CHECK-NEXT: cmpne r0, #0 +; CHECK-NEXT: ldrne r7, [sp, #136] +; CHECK-NEXT: cmpne r7, #0 ; CHECK-NEXT: bne .LBB10_2 ; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #32 @@ -620,24 +620,24 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader -; CHECK-NEXT: ldr.w r12, [sp, #140] -; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r2, r12, #3 -; CHECK-NEXT: subs r3, r2, #4 -; CHECK-NEXT: add.w r0, r7, r3, lsr #2 -; CHECK-NEXT: ldr r7, [sp, #136] -; CHECK-NEXT: adr r3, .LCPI10_0 -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: lsl.w r0, r12, #1 +; CHECK-NEXT: ldr.w r9, [sp, #140] +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vdup.32 q1, r7 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: lsls r6, r7, #1 +; CHECK-NEXT: bic r0, r9, #3 +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vshl.i32 q3, q1, #2 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: add.w r0, r3, r0, lsr #2 +; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: lsl.w r0, r9, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: adr r0, .LCPI10_0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: b .LBB10_5 ; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader @@ -648,15 +648,15 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* ; CHECK-NEXT: b .LBB10_15 ; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: add r12, r9 +; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add r2, r0 +; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r11, r12 -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add r3, r0 -; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: adds r3, #1 -; CHECK-NEXT: cmp r3, r0 +; CHECK-NEXT: adds r2, #1 +; CHECK-NEXT: cmp r2, r0 ; CHECK-NEXT: beq .LBB10_1 ; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -664,9 +664,9 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* ; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_15 Depth 2 -; CHECK-NEXT: mul r5, r3, r7 -; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: mul r5, r2, r7 +; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: str r2, [sp, #8] @ 
4-byte Spill ; CHECK-NEXT: beq .LBB10_3 ; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 @@ -685,54 +685,55 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 -; CHECK-NEXT: cmp.w r12, #3 +; CHECK-NEXT: cmp.w r9, #3 ; CHECK-NEXT: bhi .LBB10_10 ; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: b .LBB10_13 ; CHECK-NEXT: .LBB10_10: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov q5, q1 ; CHECK-NEXT: vmov.i32 q4, #0x0 ; CHECK-NEXT: vmlas.i32 q5, q2, r8 ; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: .LBB10_11: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 ; CHECK-NEXT: vadd.i32 q6, q5, q3 ; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] -; CHECK-NEXT: vldrh.s32 q5, [r3], #8 +; CHECK-NEXT: vldrh.s32 q5, [r0], #8 ; CHECK-NEXT: vmul.i32 q5, q7, q5 ; CHECK-NEXT: vadd.i32 q4, q5, q4 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vaddv.u32 r10, q4 -; CHECK-NEXT: cmp r2, r12 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: cmp r0, r9 ; CHECK-NEXT: beq .LBB10_7 ; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: mla r3, r7, r4, r8 -; CHECK-NEXT: add.w r0, r11, r4 +; CHECK-NEXT: mla r3, r7, r0, r8 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: add.w r4, r12, r0 ; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: sub.w lr, r12, r4 -; CHECK-NEXT: add.w r9, r7, r0, lsl #1 -; CHECK-NEXT: ldr r7, [sp, #136] +; CHECK-NEXT: sub.w lr, r9, r0 +; CHECK-NEXT: add.w r4, r7, r4, lsl #1 +; CHECK-NEXT: mov r7, r2 ; CHECK-NEXT: add.w r3, r1, r3, lsl #1 ; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: ldrsh.w r4, [r3] +; CHECK-NEXT: ldrsh.w r11, [r3] ; CHECK-NEXT: add r3, r6 -; CHECK-NEXT: ldrsh r0, [r9], #2 -; CHECK-NEXT: smlabb r10, r4, r0, r10 +; CHECK-NEXT: ldrsh r0, [r4], #2 +; CHECK-NEXT: smlabb r10, r11, r0, r10 ; CHECK-NEXT: le lr, .LBB10_14 ; CHECK-NEXT: b .LBB10_7 ; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll index 66c41bb32dee6a..4e6714d442a488 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -238,28 +238,28 @@ define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocaptur ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u32 q4, [r0, q0] -; CHECK-NEXT: vldrb.u32 q3, [r0, q1] -; CHECK-NEXT: vldrb.u32 q5, [r0, q2] +; CHECK-NEXT: vldrb.u32 q3, [r0, q0] +; CHECK-NEXT: vldrb.u32 q4, [r0, q1] +; 
CHECK-NEXT: vldrb.u32 q6, [r0, q2] ; CHECK-NEXT: adds r0, #12 -; CHECK-NEXT: vmul.i32 q6, q4, r11 -; CHECK-NEXT: vmla.i32 q6, q3, r8 -; CHECK-NEXT: vmla.i32 q6, q5, r12 -; CHECK-NEXT: vadd.i32 q6, q6, r3 -; CHECK-NEXT: vshr.u32 q6, q6, #16 -; CHECK-NEXT: vstrb.32 q6, [r1, q1] -; CHECK-NEXT: vmul.i32 q6, q4, r4 -; CHECK-NEXT: vmul.i32 q4, q4, r10 -; CHECK-NEXT: vmla.i32 q6, q3, r5 -; CHECK-NEXT: vmla.i32 q4, q3, r7 -; CHECK-NEXT: vmla.i32 q6, q5, r6 -; CHECK-NEXT: vmla.i32 q4, q5, r9 -; CHECK-NEXT: vadd.i32 q6, q6, r3 -; CHECK-NEXT: vadd.i32 q3, q4, r3 -; CHECK-NEXT: vshr.u32 q6, q6, #16 +; CHECK-NEXT: vmul.i32 q5, q3, r11 +; CHECK-NEXT: vmla.i32 q5, q4, r8 +; CHECK-NEXT: vmla.i32 q5, q6, r12 +; CHECK-NEXT: vadd.i32 q5, q5, r3 +; CHECK-NEXT: vshr.u32 q5, q5, #16 +; CHECK-NEXT: vstrb.32 q5, [r1, q1] +; CHECK-NEXT: vmul.i32 q5, q3, r4 +; CHECK-NEXT: vmul.i32 q3, q3, r10 +; CHECK-NEXT: vmla.i32 q5, q4, r5 +; CHECK-NEXT: vmla.i32 q3, q4, r7 +; CHECK-NEXT: vmla.i32 q5, q6, r6 +; CHECK-NEXT: vmla.i32 q3, q6, r9 +; CHECK-NEXT: vadd.i32 q5, q5, r3 +; CHECK-NEXT: vadd.i32 q3, q3, r3 +; CHECK-NEXT: vshr.u32 q5, q5, #16 ; CHECK-NEXT: vshr.u32 q3, q3, #16 ; CHECK-NEXT: vstrb.32 q3, [r1, q0] -; CHECK-NEXT: vstrb.32 q6, [r1, q2] +; CHECK-NEXT: vstrb.32 q5, [r1, q2] ; CHECK-NEXT: adds r1, #12 ; CHECK-NEXT: letp lr, .LBB3_2 ; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll index c95fe2296e0998..3a0d87b352982c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -4,8 +4,10 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: loads_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q1, [r1] @@ -14,48 +16,49 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: vmov.f32 s2, s7 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: vmov r3, r1, d1 +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r12, lr, d1 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vand q3, q1, q2 ; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: vmov lr, r12, d7 +; CHECK-NEXT: vmov r4, r6, d7 +; CHECK-NEXT: vmov r5, r7, d6 ; CHECK-NEXT: vmov.f32 s16, s6 ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vand q2, q4, q2 ; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: adcs r5, r2 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: asrl r0, r5, r2 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: asrl r0, r1, r2 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: asrs r4, r2, #31 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adcs r1, r4 +; CHECK-NEXT: adds.w r2, r1, r12 +; CHECK-NEXT: asr.w r3, r1, #31 +; CHECK-NEXT: adc.w r1, r3, lr ; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: asrl r2, r1, r3 -; CHECK-NEXT: vmov r4, r5, d6 ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: adds.w r6, r1, lr +; CHECK-NEXT: adds r4, r4, r1 ; CHECK-NEXT: asr.w r3, r1, #31 -; CHECK-NEXT: adc.w r1, r3, 
r12 +; CHECK-NEXT: adc.w r1, r3, r6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: asrl r6, r1, r3 +; CHECK-NEXT: asrl r4, r1, r3 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: adds r4, r4, r1 +; CHECK-NEXT: adds r6, r1, r5 ; CHECK-NEXT: asr.w r3, r1, #31 -; CHECK-NEXT: adc.w r1, r3, r5 +; CHECK-NEXT: adc.w r1, r3, r7 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: asrl r4, r1, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: asrl r6, r1, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r2 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %a = load <4 x i32>, ptr %A, align 4 %b = load <4 x i32>, ptr %B, align 4 @@ -138,8 +141,8 @@ entry: define arm_aapcs_vfpcc void @load_store_i32(ptr %A, ptr %B, ptr %C, ptr %D) { ; CHECK-LABEL: load_store_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} @@ -152,48 +155,48 @@ define arm_aapcs_vfpcc void @load_store_i32(ptr %A, ptr %B, ptr %C, ptr %D) { ; CHECK-NEXT: vand q2, q0, q4 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vmov r5, r1, d3 +; CHECK-NEXT: vmov r1, r12, d3 ; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmov r0, r12, d2 +; CHECK-NEXT: vmov r9, r8, d2 ; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: vmov r4, lr, d5 +; CHECK-NEXT: vmov lr, r4, d5 ; CHECK-NEXT: vmov.f32 s20, s6 ; CHECK-NEXT: vmov.f32 s6, s1 ; CHECK-NEXT: vmov.f32 s22, s7 ; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: adds r2, r6, r5 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: asr.w r7, r6, #31 -; CHECK-NEXT: adcs r1, r7 -; CHECK-NEXT: asrl r2, r1, r5 -; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r12 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: asrl r0, r1, r2 +; CHECK-NEXT: vmov r5, r12, d4 ; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: adds r4, r4, r1 -; CHECK-NEXT: asr.w r5, r1, #31 -; CHECK-NEXT: adc.w r1, r5, lr -; CHECK-NEXT: asrl r4, r1, r7 -; CHECK-NEXT: vmov r6, r5, d4 +; CHECK-NEXT: adds.w r2, r1, lr +; CHECK-NEXT: asr.w r6, r1, #31 +; CHECK-NEXT: adc.w r1, r6, r4 +; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: asrl r2, r1, r6 ; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: asr.w r7, r1, #31 -; CHECK-NEXT: adc.w r1, r7, r12 -; CHECK-NEXT: vmov r7, s16 -; CHECK-NEXT: asrl r0, r1, r7 +; CHECK-NEXT: adds.w r4, r1, r9 +; CHECK-NEXT: asr.w r6, r1, #31 +; CHECK-NEXT: adc.w r1, r6, r8 +; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: asrl r4, r1, r6 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: adds r6, r6, r1 +; CHECK-NEXT: adds r6, r1, r5 ; CHECK-NEXT: asr.w r7, r1, #31 -; CHECK-NEXT: adc.w r1, r7, r5 +; CHECK-NEXT: adc.w r1, r7, r12 ; CHECK-NEXT: vmov r7, s4 ; CHECK-NEXT: asrl r6, r1, r7 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 ; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %a = load <4 x i32>, ptr 
%A, align 4 %b = load <4 x i32>, ptr %B, align 4 @@ -276,8 +279,8 @@ entry: define arm_aapcs_vfpcc void @load_one_store_i32(ptr %A, ptr %D) { ; CHECK-LABEL: load_one_store_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r6, r7, lr} +; CHECK-NEXT: push {r4, r6, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s2, s3 @@ -287,25 +290,25 @@ define arm_aapcs_vfpcc void @load_one_store_i32(ptr %A, ptr %D) { ; CHECK-NEXT: asr.w r3, r2, #31 ; CHECK-NEXT: adc.w r3, r3, r2, asr #31 ; CHECK-NEXT: asrl r12, r3, r2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r2, r3, r3 -; CHECK-NEXT: asr.w r0, r3, #31 -; CHECK-NEXT: adc.w r5, r0, r3, asr #31 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: asrl r2, r5, r3 -; CHECK-NEXT: adds r4, r0, r0 -; CHECK-NEXT: asr.w r3, r0, #31 -; CHECK-NEXT: adc.w r3, r3, r0, asr #31 -; CHECK-NEXT: asrl r4, r3, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: adds r6, r0, r0 -; CHECK-NEXT: asr.w r3, r0, #31 -; CHECK-NEXT: adc.w r3, r3, r0, asr #31 -; CHECK-NEXT: asrl r6, r3, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds r0, r2, r2 +; CHECK-NEXT: asr.w r3, r2, #31 +; CHECK-NEXT: adc.w r3, r3, r2, asr #31 +; CHECK-NEXT: asrl r0, r3, r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: adds r4, r2, r2 +; CHECK-NEXT: asr.w r3, r2, #31 +; CHECK-NEXT: adc.w r3, r3, r2, asr #31 +; CHECK-NEXT: asrl r4, r3, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds r6, r2, r2 +; CHECK-NEXT: asr.w r3, r2, #31 +; CHECK-NEXT: adc.w r3, r3, r2, asr #31 +; CHECK-NEXT: asrl r6, r3, r2 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r6, r7, pc} entry: %a = load <4 x i32>, ptr %A, align 4 %sa = sext <4 x i32> %a to <4 x i64> @@ -368,34 +371,34 @@ entry: define arm_aapcs_vfpcc void @mul_i32(ptr %A, ptr %B, i64 %C, ptr %D) { ; CHECK-LABEL: mul_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: ldr.w lr, [sp, #20] -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: smull r12, r3, r1, r0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: ldr.w r12, [sp, #24] +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s14, s9 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov.f32 s0, s2 ; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: smull r8, r5, r1, r0 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vmullb.s32 q2, q1, q0 -; CHECK-NEXT: asrl r12, r3, r2 -; CHECK-NEXT: vmov r6, r1, d4 -; CHECK-NEXT: vmov r4, r7, d5 -; CHECK-NEXT: asrl r6, r1, r2 -; CHECK-NEXT: asrl r4, r7, r2 -; CHECK-NEXT: smull r0, r5, r5, r0 -; CHECK-NEXT: asrl r0, r5, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 -; CHECK-NEXT: vmov q0[3], q0[1], r12, r4 -; CHECK-NEXT: vstrw.32 q0, [lr] -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vmov r0, r7, d4 +; CHECK-NEXT: asrl r0, r7, r2 +; CHECK-NEXT: asrl r8, r5, r2 +; CHECK-NEXT: 
smull r4, r1, r1, r3 +; CHECK-NEXT: vmov r6, r3, d5 +; CHECK-NEXT: asrl r6, r3, r2 +; CHECK-NEXT: asrl r4, r1, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r8, r6 +; CHECK-NEXT: vstrw.32 q0, [r12] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %a = load <4 x i32>, ptr %A, align 4 %b = load <4 x i32>, ptr %B, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll index cfdb20d15e938b..9634d403ba8644 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll @@ -126,36 +126,36 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef ; CHECK-NEXT: ldr r1, [sp, #48] ; CHECK-NEXT: add.w r12, r2, #3 ; CHECK-NEXT: ldr.w r11, [sp] @ 4-byte Reload -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: mov r10, r2 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: uxth r3, r1 +; CHECK-NEXT: uxth.w r8, r1 ; CHECK-NEXT: b .LBB4_4 ; CHECK-NEXT: .LBB4_2: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: .LBB4_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: lsrs r2, r6, #16 ; CHECK-NEXT: sub.w r12, r12, #1 ; CHECK-NEXT: add.w r11, r11, #2 -; CHECK-NEXT: sub.w r8, r8, #1 -; CHECK-NEXT: strh.w r2, [r7, r10, lsl #1] -; CHECK-NEXT: add.w r10, r10, #1 +; CHECK-NEXT: sub.w r10, r10, #1 +; CHECK-NEXT: strh.w r2, [r3, r4, lsl #1] +; CHECK-NEXT: adds r4, #1 ; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: cmp r10, r2 +; CHECK-NEXT: cmp r4, r2 ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: beq .LBB4_12 ; CHECK-NEXT: .LBB4_4: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB4_11 Depth 2 -; CHECK-NEXT: cmp r2, r10 +; CHECK-NEXT: cmp r2, r4 ; CHECK-NEXT: ble .LBB4_2 ; CHECK-NEXT: @ %bb.5: @ %vector.main.loop.iter.check ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: sub.w r4, r2, r10 -; CHECK-NEXT: cmp r4, #8 +; CHECK-NEXT: subs r2, r2, r4 +; CHECK-NEXT: cmp r2, #8 ; CHECK-NEXT: bhs .LBB4_7 ; CHECK-NEXT: @ %bb.6: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: movs r6, #0 @@ -163,20 +163,20 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef ; CHECK-NEXT: b .LBB4_10 ; CHECK-NEXT: .LBB4_7: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: bic r2, r8, #7 +; CHECK-NEXT: bic r3, r10, #7 ; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: bic r9, r4, #7 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: bic r9, r2, #7 ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r5, r11 -; CHECK-NEXT: add.w lr, r7, r2, lsr #3 -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w lr, r7, r3, lsr #3 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload ; CHECK-NEXT: .LBB4_8: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrh.u16 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r3], #16 ; CHECK-NEXT: vldrh.u16 q1, [r5], #16 -; CHECK-NEXT: rsbs r7, r3, #0 +; CHECK-NEXT: rsb.w r7, r8, #0 ; CHECK-NEXT: vmullb.s16 q2, q1, q0 ; CHECK-NEXT: vmullt.s16 q0, q1, q0 ; CHECK-NEXT: vshl.s32 q2, r7 @@ -186,24 +186,24 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef ; CHECK-NEXT: le lr, .LBB4_8 ; CHECK-NEXT: @ 
%bb.9: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: cmp r4, r9 +; CHECK-NEXT: cmp r2, r9 ; CHECK-NEXT: beq .LBB4_3 ; CHECK-NEXT: .LBB4_10: @ %vec.epilog.ph ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r2, r9, r10 -; CHECK-NEXT: sub.w r5, r8, r9 +; CHECK-NEXT: add.w r3, r9, r4 +; CHECK-NEXT: sub.w r5, r10, r9 ; CHECK-NEXT: add.w r7, r1, r9, lsl #1 -; CHECK-NEXT: add.w r2, r1, r2, lsl #1 +; CHECK-NEXT: add.w r3, r1, r3, lsl #1 ; CHECK-NEXT: dlstp.32 lr, r5 ; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: rsbs r4, r3, #0 +; CHECK-NEXT: rsb.w r2, r8, #0 ; CHECK-NEXT: vldrh.s32 q0, [r7], #8 -; CHECK-NEXT: vldrh.s32 q1, [r2], #8 +; CHECK-NEXT: vldrh.s32 q1, [r3], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vshl.s32 q0, r4 +; CHECK-NEXT: vshl.s32 q0, r2 ; CHECK-NEXT: vaddva.u32 r6, q0 ; CHECK-NEXT: letp lr, .LBB4_11 ; CHECK-NEXT: b .LBB4_3 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll index af0920475dbf4e..2babf210e959ef 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -180,44 +180,44 @@ entry: define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_add_ashr_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff ; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r3, r7, d2 +; CHECK-NEXT: vmov r12, lr, d2 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov r0, r1, d6 ; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmov lr, r12, d7 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: asrs r5, r2, #31 -; CHECK-NEXT: adds r2, r2, r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: asr.w r5, r4, #31 ; CHECK-NEXT: adcs r1, r5 -; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: lsrl r0, r1, #1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: adds r2, r2, r1 +; CHECK-NEXT: asr.w r4, r1, #31 +; CHECK-NEXT: adc.w r1, r4, r3 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: lsrl r2, r1, #1 +; CHECK-NEXT: adds.w r4, r3, r12 +; CHECK-NEXT: asr.w r5, r3, #31 +; CHECK-NEXT: vmov r3, r12, d3 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: adc.w r5, r5, lr +; CHECK-NEXT: lsrl r4, r5, #1 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: asrs r4, r5, #31 -; CHECK-NEXT: adds r6, r5, r3 -; CHECK-NEXT: vmov r3, r5, d3 -; CHECK-NEXT: vmov.f32 s6, s1 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: adcs r7, r4 -; CHECK-NEXT: lsrl r6, r7, #1 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: adds r6, r1, r3 -; CHECK-NEXT: asr.w r2, r1, #31 -; CHECK-NEXT: adc.w r1, r2, r5 -; CHECK-NEXT: lsrl r6, r1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r0 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext 
<4 x i32> %b to <4 x i64> @@ -331,86 +331,86 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov r10, s8 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov r6, s10 ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: asr.w r0, r10, #31 -; CHECK-NEXT: asrs r5, r7, #31 -; CHECK-NEXT: adds.w r4, r10, r2 -; CHECK-NEXT: eor.w r6, r10, r2 -; CHECK-NEXT: adc r3, r0, #0 -; CHECK-NEXT: asrl r4, r3, r2 -; CHECK-NEXT: subs r0, r4, r2 +; CHECK-NEXT: asrs r3, r1, #31 +; CHECK-NEXT: adds r0, r1, r2 +; CHECK-NEXT: eor.w r7, r1, r2 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: asrl r0, r3, r2 +; CHECK-NEXT: subs r5, r0, r2 ; CHECK-NEXT: sbc lr, r3, #0 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: umull r0, r8, r0, r2 -; CHECK-NEXT: adds r4, r7, r3 -; CHECK-NEXT: eor.w r1, r7, r3 -; CHECK-NEXT: adc r5, r5, #0 -; CHECK-NEXT: asrl r4, r5, r3 -; CHECK-NEXT: subs r4, r4, r3 -; CHECK-NEXT: sbc r5, r5, #0 -; CHECK-NEXT: orrs.w r6, r6, r10, asr #31 -; CHECK-NEXT: umull r4, r12, r4, r3 +; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: adds r0, r4, r6 +; CHECK-NEXT: umull r12, r8, r5, r2 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: asrl r0, r3, r6 +; CHECK-NEXT: subs r0, r0, r6 +; CHECK-NEXT: sbc r3, r3, #0 +; CHECK-NEXT: orrs.w r7, r7, r1, asr #31 +; CHECK-NEXT: umull r0, r5, r0, r6 ; CHECK-NEXT: csetm r9, eq -; CHECK-NEXT: orrs.w r1, r1, r7, asr #31 -; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: bfi r6, r9, #0, #8 -; CHECK-NEXT: mla r5, r5, r3, r12 -; CHECK-NEXT: bfi r6, r1, #8, #8 -; CHECK-NEXT: rsbs r1, r7, #0 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: bfi r7, r9, #0, #8 +; CHECK-NEXT: mla r5, r3, r6, r5 +; CHECK-NEXT: eor.w r3, r4, r6 +; CHECK-NEXT: orrs.w r3, r3, r4, asr #31 +; CHECK-NEXT: csetm r3, eq +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r7, r3, #8, #8 +; CHECK-NEXT: rsbs r3, r4, #0 +; CHECK-NEXT: vmsr p0, r7 ; CHECK-NEXT: mla r7, lr, r2, r8 -; CHECK-NEXT: lsll r4, r5, r1 -; CHECK-NEXT: rsb.w r1, r10, #0 -; CHECK-NEXT: lsll r0, r7, r1 -; CHECK-NEXT: vmov lr, s2 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: lsll r0, r5, r3 +; CHECK-NEXT: lsll r12, r7, r1 ; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: lsll r0, r7, r2 -; CHECK-NEXT: lsll r4, r5, r3 -; CHECK-NEXT: vmsr p0, r6 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r4 -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: lsll r12, r7, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r7, s0 +; CHECK-NEXT: lsll r0, r5, r6 +; CHECK-NEXT: vmov q3[2], q3[0], r12, r0 ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: adds.w r2, lr, r1 -; CHECK-NEXT: asr.w r0, lr, #31 +; CHECK-NEXT: adds r6, r2, r1 +; CHECK-NEXT: asr.w r0, r2, #31 ; CHECK-NEXT: adc r3, r0, #0 -; CHECK-NEXT: asrl r2, r3, r1 -; CHECK-NEXT: subs r0, r2, r1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: sbc r7, r3, #0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: umull r0, r6, r0, r1 -; CHECK-NEXT: asrs r5, r2, #31 -; CHECK-NEXT: adds r4, r2, r3 -; CHECK-NEXT: adc r5, r5, #0 -; CHECK-NEXT: asrl r4, r5, r3 -; CHECK-NEXT: subs r4, r4, r3 -; CHECK-NEXT: sbc r8, r5, #0 -; CHECK-NEXT: mla r5, r7, r1, r6 -; CHECK-NEXT: eor.w r6, lr, r1 -; CHECK-NEXT: orrs.w r6, r6, lr, asr #31 -; CHECK-NEXT: eor.w r7, r2, r3 +; 
CHECK-NEXT: asrl r6, r3, r1 +; CHECK-NEXT: subs r0, r6, r1 +; CHECK-NEXT: sbc r12, r3, #0 +; CHECK-NEXT: asrs r3, r7, #31 +; CHECK-NEXT: adds r6, r7, r4 +; CHECK-NEXT: umull r0, r5, r0, r1 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: asrl r6, r3, r4 +; CHECK-NEXT: subs.w r8, r6, r4 +; CHECK-NEXT: eor.w r6, r7, r4 +; CHECK-NEXT: sbc lr, r3, #0 +; CHECK-NEXT: eor.w r3, r2, r1 +; CHECK-NEXT: orrs.w r3, r3, r2, asr #31 +; CHECK-NEXT: mla r5, r12, r1, r5 +; CHECK-NEXT: csetm r3, eq +; CHECK-NEXT: orrs.w r6, r6, r7, asr #31 ; CHECK-NEXT: csetm r6, eq -; CHECK-NEXT: orrs.w r7, r7, r2, asr #31 -; CHECK-NEXT: csetm r7, eq -; CHECK-NEXT: rsb.w lr, lr, #0 -; CHECK-NEXT: bfi r12, r7, #0, #8 -; CHECK-NEXT: lsll r0, r5, lr -; CHECK-NEXT: bfi r12, r6, #8, #8 -; CHECK-NEXT: umull r4, r6, r4, r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r10, r6, #0, #8 +; CHECK-NEXT: lsll r0, r5, r2 +; CHECK-NEXT: bfi r10, r3, #8, #8 +; CHECK-NEXT: umull r6, r3, r8, r4 ; CHECK-NEXT: lsll r0, r5, r1 -; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: vmsr p0, r12 -; CHECK-NEXT: mla r7, r8, r3, r6 -; CHECK-NEXT: lsll r4, r7, r1 -; CHECK-NEXT: lsll r4, r7, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: rsbs r1, r7, #0 +; CHECK-NEXT: vmsr p0, r10 +; CHECK-NEXT: mla r3, lr, r4, r3 +; CHECK-NEXT: lsll r6, r3, r1 +; CHECK-NEXT: lsll r6, r3, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r0 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s8 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll index b0a3a6354daa70..0d69fd374244f3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -556,35 +556,35 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(ptr %dest, <8 x i16> ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: and r2, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #12, #1 ; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs 
r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #6, #1 ; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: bfi r3, r1, #7, #1 +; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: uxtb r1, r3 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] ; CHECK-LE-NEXT: vmovne.16 q0[0], r2 @@ -624,36 +624,36 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(ptr %dest, <8 x i16> ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 ; CHECK-BE-NEXT: vmrs r1, p0 ; CHECK-BE-NEXT: ubfx r2, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: bfi r2, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #2, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #3, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #4, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #5, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #2, #1 ; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 ; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r2, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r1, r2 -; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: bfi r3, r1, #7, #1 +; CHECK-BE-NEXT: lsls r2, r3, #24 +; CHECK-BE-NEXT: uxtb r1, r3 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: ldrhmi r2, [r0] ; CHECK-BE-NEXT: vmovmi.16 q1[0], r2 @@ -1420,35 +1420,35 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16 ; CHECK-LE-NEXT: .pad #36 ; CHECK-LE-NEXT: sub sp, #36 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: and r2, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, 
r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #12, #1 ; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #6, #1 ; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: bfi r3, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r3 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: bne .LBB45_9 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1538,36 +1538,36 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16 ; CHECK-BE-NEXT: .pad #36 ; CHECK-BE-NEXT: sub sp, #36 ; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 ; CHECK-BE-NEXT: vmrs r1, p0 ; CHECK-BE-NEXT: ubfx r2, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: bfi r2, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #2, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #3, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #4, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #5, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #2, #1 ; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 ; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r2, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r1, r2 -; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: bfi r3, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r3 +; CHECK-BE-NEXT: lsls r2, r3, #24 ; CHECK-BE-NEXT: bmi .LBB45_10 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #25 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll 
b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll index 9012fada2bee24..f08d195c0d2a05 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -184,34 +184,34 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(ptr %dest, <8 x i16> %a) { ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: and r2, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #12, #1 ; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #6, #1 ; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: bfi r3, r1, #7, #1 +; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: uxtb r1, r3 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne.u16 r2, q0[0] ; CHECK-LE-NEXT: strhne r2, [r0] @@ -251,35 +251,35 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(ptr %dest, <8 x i16> %a) { ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: vmrs r1, p0 ; CHECK-BE-NEXT: ubfx r2, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: bfi r2, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #2, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #3, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #4, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #5, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #10, #1 +; 
CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #2, #1 ; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 ; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r2, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r1, r2 -; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: bfi r3, r1, #7, #1 +; CHECK-BE-NEXT: lsls r2, r3, #24 +; CHECK-BE-NEXT: uxtb r1, r3 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi.u16 r2, q1[0] ; CHECK-BE-NEXT: strhmi r2, [r0] @@ -744,36 +744,36 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x ; CHECK-BE-NEXT: .pad #36 ; CHECK-BE-NEXT: sub sp, #36 ; CHECK-BE-NEXT: vrev64.16 q2, q1 -; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr +; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vmrs r1, p0 ; CHECK-BE-NEXT: ubfx r2, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r3, r2, #0 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: bfi r2, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #2, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #3, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #4, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #5, #1 -; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #2, #1 ; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 ; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r2, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r1, r2 -; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: bfi r3, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r3 +; CHECK-BE-NEXT: lsls r2, r3, #24 ; CHECK-BE-NEXT: bmi .LBB16_9 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #25 diff --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll index da59cb259db616..71723a60faeee8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll @@ -554,13 +554,13 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: movw r0, :lower16:arr_21 ; 
CHECK-NEXT: movt r0, :upper16:arr_21 -; CHECK-NEXT: add.w r5, r0, #36 +; CHECK-NEXT: add.w r4, r0, #36 ; CHECK-NEXT: add.w r11, r6, #128 ; CHECK-NEXT: add.w r7, r6, #112 ; CHECK-NEXT: add.w r2, r6, #96 -; CHECK-NEXT: add.w r4, r6, #64 -; CHECK-NEXT: add.w r0, r6, #48 -; CHECK-NEXT: add.w r1, r6, #32 +; CHECK-NEXT: add.w r5, r6, #64 +; CHECK-NEXT: add.w r1, r6, #48 +; CHECK-NEXT: add.w r0, r6, #32 ; CHECK-NEXT: add.w r12, r6, #16 ; CHECK-NEXT: adr r6, .LCPI19_0 ; CHECK-NEXT: vldrw.u32 q0, [r6] @@ -572,21 +572,21 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: movt r6, :upper16:arr_20 ; CHECK-NEXT: .LBB19_3: @ %for.cond8.preheader ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r8, [r5, #-4] -; CHECK-NEXT: vstrh.16 q1, [r5, #-36] -; CHECK-NEXT: strh.w r9, [r5] -; CHECK-NEXT: vstrh.16 q1, [r5, #-20] +; CHECK-NEXT: str r8, [r4, #-4] +; CHECK-NEXT: vstrh.16 q1, [r4, #-36] +; CHECK-NEXT: strh.w r9, [r4] +; CHECK-NEXT: vstrh.16 q1, [r4, #-20] ; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; CHECK-NEXT: vstrh.16 q0, [r6], #152 -; CHECK-NEXT: vstrh.16 q0, [r1], #152 ; CHECK-NEXT: vstrh.16 q0, [r0], #152 -; CHECK-NEXT: vstrh.16 q0, [r4], #152 +; CHECK-NEXT: vstrh.16 q0, [r1], #152 +; CHECK-NEXT: vstrh.16 q0, [r5], #152 ; CHECK-NEXT: vstrh.16 q0, [r2], #152 ; CHECK-NEXT: vstrh.16 q0, [r7], #152 ; CHECK-NEXT: vstrh.16 q0, [r11], #152 ; CHECK-NEXT: strd r9, r10, [r3, #64] -; CHECK-NEXT: adds r5, #38 +; CHECK-NEXT: adds r4, #38 ; CHECK-NEXT: adds r3, #152 ; CHECK-NEXT: le lr, .LBB19_3 ; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup6 @@ -601,48 +601,48 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrb.8 q1, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %for.cond.cleanup6 -; CHECK-NEXT: movw r6, :lower16:arr_20 +; CHECK-NEXT: movw r2, :lower16:arr_20 ; CHECK-NEXT: movw r0, #7376 -; CHECK-NEXT: movt r6, :upper16:arr_20 -; CHECK-NEXT: adds r3, r6, r0 +; CHECK-NEXT: movt r2, :upper16:arr_20 +; CHECK-NEXT: adds r4, r2, r0 ; CHECK-NEXT: movw r0, #7408 -; CHECK-NEXT: add.w r12, r6, r0 +; CHECK-NEXT: add.w r12, r2, r0 ; CHECK-NEXT: movw r0, #7344 -; CHECK-NEXT: add.w r9, r6, r0 +; CHECK-NEXT: add.w r8, r2, r0 ; CHECK-NEXT: movw r0, #7312 -; CHECK-NEXT: adds r2, r6, r0 +; CHECK-NEXT: add.w r9, r2, r0 ; CHECK-NEXT: movw r0, :lower16:arr_21 -; CHECK-NEXT: add.w r1, r6, #7424 -; CHECK-NEXT: add.w r7, r6, #7392 -; CHECK-NEXT: add.w r4, r6, #7360 -; CHECK-NEXT: add.w r5, r6, #7328 -; CHECK-NEXT: add.w r8, r6, #7296 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: add.w r1, r2, #7424 ; CHECK-NEXT: movt r0, :upper16:arr_21 -; CHECK-NEXT: addw r0, r0, #1860 +; CHECK-NEXT: addw r5, r0, #1860 +; CHECK-NEXT: add.w r0, r2, #7392 +; CHECK-NEXT: add.w r7, r2, #7360 +; CHECK-NEXT: add.w r3, r2, #7328 +; CHECK-NEXT: add.w r6, r2, #7296 +; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov.w r10, #5 -; CHECK-NEXT: dls lr, r6 -; CHECK-NEXT: mov.w r6, #327685 ; CHECK-NEXT: vmov.i16 q1, #0x5 ; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: dls lr, r2 +; CHECK-NEXT: mov.w r2, #327685 ; CHECK-NEXT: .LBB19_7: @ %for.cond8.preheader.1 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r6, [r0, #-4] -; CHECK-NEXT: vstrh.16 q1, [r0, #-36] -; CHECK-NEXT: strh.w r10, [r0] -; CHECK-NEXT: vstrh.16 q1, [r0, #-20] -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: vstrh.16 q0, [r2], #152 -; CHECK-NEXT: vstrh.16 q0, [r8], #152 -; CHECK-NEXT: vstrh.16 q0, [r5], #152 +; CHECK-NEXT: str r2, [r5, #-4] +; CHECK-NEXT: vstrh.16 q1, [r5, 
#-36] +; CHECK-NEXT: strh.w r10, [r5] +; CHECK-NEXT: vstrh.16 q1, [r5, #-20] +; CHECK-NEXT: vstrw.32 q0, [r4] ; CHECK-NEXT: vstrh.16 q0, [r9], #152 -; CHECK-NEXT: vstrh.16 q0, [r4], #152 +; CHECK-NEXT: vstrh.16 q0, [r6], #152 +; CHECK-NEXT: vstrh.16 q0, [r3], #152 +; CHECK-NEXT: vstrh.16 q0, [r8], #152 ; CHECK-NEXT: vstrh.16 q0, [r7], #152 +; CHECK-NEXT: vstrh.16 q0, [r0], #152 ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; CHECK-NEXT: vstrh.16 q0, [r1], #152 -; CHECK-NEXT: strd r10, r11, [r3, #64] -; CHECK-NEXT: adds r0, #38 -; CHECK-NEXT: adds r3, #152 +; CHECK-NEXT: strd r10, r11, [r4, #64] +; CHECK-NEXT: adds r5, #38 +; CHECK-NEXT: adds r4, #152 ; CHECK-NEXT: le lr, .LBB19_7 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup6.1 ; CHECK-NEXT: movw r0, :lower16:arr_22 @@ -659,7 +659,7 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: movw r7, :lower16:arr_20 ; CHECK-NEXT: movw r0, #14672 ; CHECK-NEXT: movt r7, :upper16:arr_20 -; CHECK-NEXT: adds r3, r7, r0 +; CHECK-NEXT: adds r4, r7, r0 ; CHECK-NEXT: movw r0, #14704 ; CHECK-NEXT: add.w r12, r7, r0 ; CHECK-NEXT: movw r0, #14688 @@ -669,14 +669,14 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: movw r0, #14624 ; CHECK-NEXT: adds r2, r7, r0 ; CHECK-NEXT: movw r0, #14608 -; CHECK-NEXT: movw r1, :lower16:arr_21 -; CHECK-NEXT: add r0, r7 -; CHECK-NEXT: add.w r4, r7, #14720 -; CHECK-NEXT: add.w r5, r7, #14656 +; CHECK-NEXT: adds r1, r7, r0 +; CHECK-NEXT: movw r0, :lower16:arr_21 +; CHECK-NEXT: add.w r5, r7, #14720 +; CHECK-NEXT: add.w r3, r7, #14656 ; CHECK-NEXT: add.w r6, r7, #14592 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: movt r1, :upper16:arr_21 -; CHECK-NEXT: addw r1, r1, #3684 +; CHECK-NEXT: movt r0, :upper16:arr_21 +; CHECK-NEXT: addw r0, r0, #3684 ; CHECK-NEXT: mov.w r10, #5 ; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: mov.w r7, #327685 @@ -684,22 +684,22 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: .LBB19_11: @ %for.cond8.preheader.2 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r7, [r1, #-4] -; CHECK-NEXT: vstrh.16 q1, [r1, #-36] -; CHECK-NEXT: strh.w r10, [r1] -; CHECK-NEXT: vstrh.16 q1, [r1, #-20] -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: vstrh.16 q0, [r0], #152 +; CHECK-NEXT: str r7, [r0, #-4] +; CHECK-NEXT: vstrh.16 q1, [r0, #-36] +; CHECK-NEXT: strh.w r10, [r0] +; CHECK-NEXT: vstrh.16 q1, [r0, #-20] +; CHECK-NEXT: vstrw.32 q0, [r4] +; CHECK-NEXT: vstrh.16 q0, [r1], #152 ; CHECK-NEXT: vstrh.16 q0, [r6], #152 ; CHECK-NEXT: vstrh.16 q0, [r2], #152 ; CHECK-NEXT: vstrh.16 q0, [r9], #152 -; CHECK-NEXT: vstrh.16 q0, [r5], #152 +; CHECK-NEXT: vstrh.16 q0, [r3], #152 ; CHECK-NEXT: vstrh.16 q0, [r8], #152 ; CHECK-NEXT: vstrh.16 q0, [r12], #152 -; CHECK-NEXT: vstrh.16 q0, [r4], #152 -; CHECK-NEXT: strd r10, r11, [r3, #64] -; CHECK-NEXT: adds r1, #38 -; CHECK-NEXT: adds r3, #152 +; CHECK-NEXT: vstrh.16 q0, [r5], #152 +; CHECK-NEXT: strd r10, r11, [r4, #64] +; CHECK-NEXT: adds r0, #38 +; CHECK-NEXT: adds r4, #152 ; CHECK-NEXT: le lr, .LBB19_11 ; CHECK-NEXT: @ %bb.12: @ %for.cond.cleanup6.2 ; CHECK-NEXT: movw r0, :lower16:arr_22 @@ -712,27 +712,27 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrb.8 q1, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_13 ; CHECK-NEXT: .LBB19_14: @ %for.cond.cleanup6.2 -; CHECK-NEXT: movw r2, :lower16:arr_21 -; CHECK-NEXT: movw r1, #5508 -; CHECK-NEXT: movt r2, :upper16:arr_21 ; CHECK-NEXT: movw r7, :lower16:arr_20 -; CHECK-NEXT: add r2, r1 -; CHECK-NEXT: movw r1, #22000 -; CHECK-NEXT: movt r7, :upper16:arr_20 -; CHECK-NEXT: add.w 
r12, r7, r1 -; CHECK-NEXT: movw r1, #21984 -; CHECK-NEXT: add.w r8, r7, r1 -; CHECK-NEXT: movw r1, #21952 -; CHECK-NEXT: add.w r9, r7, r1 -; CHECK-NEXT: movw r1, #21936 +; CHECK-NEXT: movw r2, :lower16:arr_21 ; CHECK-NEXT: movw r0, #21968 -; CHECK-NEXT: adds r5, r7, r1 -; CHECK-NEXT: movw r1, #21920 -; CHECK-NEXT: movw r3, #21904 -; CHECK-NEXT: adds r4, r7, r3 -; CHECK-NEXT: add r0, r7 -; CHECK-NEXT: add r1, r7 +; CHECK-NEXT: movt r7, :upper16:arr_20 +; CHECK-NEXT: adds r1, r7, r0 +; CHECK-NEXT: movw r0, #5508 +; CHECK-NEXT: movt r2, :upper16:arr_21 ; CHECK-NEXT: add.w r3, r7, #22016 +; CHECK-NEXT: add r2, r0 +; CHECK-NEXT: movw r0, #22000 +; CHECK-NEXT: add.w r12, r7, r0 +; CHECK-NEXT: movw r0, #21984 +; CHECK-NEXT: add.w r8, r7, r0 +; CHECK-NEXT: movw r0, #21952 +; CHECK-NEXT: add.w r9, r7, r0 +; CHECK-NEXT: movw r0, #21936 +; CHECK-NEXT: adds r4, r7, r0 +; CHECK-NEXT: movw r0, #21920 +; CHECK-NEXT: adds r5, r7, r0 +; CHECK-NEXT: movw r0, #21904 +; CHECK-NEXT: add r0, r7 ; CHECK-NEXT: add.w r6, r7, #21888 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov.w r10, #5 @@ -746,18 +746,18 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrh.16 q1, [r2, #-36] ; CHECK-NEXT: strh.w r10, [r2] ; CHECK-NEXT: vstrh.16 q1, [r2, #-20] -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vstrh.16 q0, [r4], #152 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0], #152 ; CHECK-NEXT: vstrh.16 q0, [r6], #152 -; CHECK-NEXT: vstrh.16 q0, [r1], #152 ; CHECK-NEXT: vstrh.16 q0, [r5], #152 +; CHECK-NEXT: vstrh.16 q0, [r4], #152 ; CHECK-NEXT: vstrh.16 q0, [r9], #152 ; CHECK-NEXT: vstrh.16 q0, [r8], #152 ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; CHECK-NEXT: vstrh.16 q0, [r3], #152 -; CHECK-NEXT: strd r10, r11, [r0, #64] +; CHECK-NEXT: strd r10, r11, [r1, #64] ; CHECK-NEXT: adds r2, #38 -; CHECK-NEXT: adds r0, #152 +; CHECK-NEXT: adds r1, #152 ; CHECK-NEXT: le lr, .LBB19_15 ; CHECK-NEXT: @ %bb.16: @ %for.cond.cleanup6.3 ; CHECK-NEXT: add sp, #12 diff --git a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll index f2c8440b177d81..26bf7965b02624 100644 --- a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll @@ -255,30 +255,30 @@ define arm_aapcs_vfpcc void @smax4i64(<4 x i64> %a, <4 x i64> %b, ptr %p) { ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: vmov r2, r4, d7 -; CHECK-NEXT: subs.w r2, r2, lr -; CHECK-NEXT: sbcs.w r2, r4, r12 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: bfi r3, r2, #8, #8 -; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: bfi r3, r1, #0, #8 +; CHECK-NEXT: vmov r1, r4, d7 +; CHECK-NEXT: subs.w r1, r1, lr +; CHECK-NEXT: sbcs.w r1, r4, r12 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: vmov r1, r12, d0 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vmov r4, r3, d4 ; CHECK-NEXT: vpsel q1, q1, q3 ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: subs r1, r4, r1 +; CHECK-NEXT: sbcs.w r1, r3, r12 ; CHECK-NEXT: vmov r4, r3, d5 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: bfi r1, r2, #0, #8 -; CHECK-NEXT: vmov r2, r12, d1 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sbcs.w r2, r3, r12 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: bfi r1, r2, #8, #8 -; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r2, r1, 
#0, #8 +; CHECK-NEXT: vmov r1, r12, d1 +; CHECK-NEXT: subs r1, r4, r1 +; CHECK-NEXT: sbcs.w r1, r3, r12 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: pop {r4, pc} @@ -534,30 +534,30 @@ define arm_aapcs_vfpcc void @umax4i64(<4 x i64> %a, <4 x i64> %b, ptr %p) { ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: vmov r2, r4, d7 -; CHECK-NEXT: subs.w r2, r2, lr -; CHECK-NEXT: sbcs.w r2, r4, r12 -; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: bfi r3, r2, #8, #8 -; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: bfi r3, r1, #0, #8 +; CHECK-NEXT: vmov r1, r4, d7 +; CHECK-NEXT: subs.w r1, r1, lr +; CHECK-NEXT: sbcs.w r1, r4, r12 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: vmov r1, r12, d0 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vmov r4, r3, d4 ; CHECK-NEXT: vpsel q1, q1, q3 ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: subs r1, r4, r1 +; CHECK-NEXT: sbcs.w r1, r3, r12 ; CHECK-NEXT: vmov r4, r3, d5 -; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: bfi r1, r2, #0, #8 -; CHECK-NEXT: vmov r2, r12, d1 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sbcs.w r2, r3, r12 -; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: bfi r1, r2, #8, #8 -; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: vmov r1, r12, d1 +; CHECK-NEXT: subs r1, r4, r1 +; CHECK-NEXT: sbcs.w r1, r3, r12 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: pop {r4, pc} @@ -820,30 +820,30 @@ define arm_aapcs_vfpcc void @smin4i64(<4 x i64> %a, <4 x i64> %b, ptr %p) { ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: vmov lr, r12, d7 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: subs.w r2, r2, lr -; CHECK-NEXT: sbcs.w r2, r4, r12 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: bfi r3, r2, #8, #8 -; CHECK-NEXT: vmov r2, r12, d4 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: bfi r3, r1, #0, #8 +; CHECK-NEXT: vmov r1, r4, d3 +; CHECK-NEXT: subs.w r1, r1, lr +; CHECK-NEXT: sbcs.w r1, r4, r12 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: vmov r1, r12, d4 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vmov r4, r3, d0 ; CHECK-NEXT: vpsel q1, q1, q3 ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: subs r1, r4, r1 +; CHECK-NEXT: sbcs.w r1, r3, r12 ; CHECK-NEXT: vmov r4, r3, d1 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: bfi r1, r2, #0, #8 -; CHECK-NEXT: vmov r2, r12, d5 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sbcs.w r2, r3, r12 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: bfi r1, r2, #8, #8 -; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: vmov r1, r12, d5 +; CHECK-NEXT: subs r1, r4, r1 +; CHECK-NEXT: sbcs.w r1, r3, r12 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: pop {r4, pc} @@ -1099,30 +1099,30 @@ 
define arm_aapcs_vfpcc void @umin4i64(<4 x i64> %a, <4 x i64> %b, ptr %p) { ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: vmov lr, r12, d7 -; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: subs.w r2, r2, lr -; CHECK-NEXT: sbcs.w r2, r4, r12 -; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: bfi r3, r2, #8, #8 -; CHECK-NEXT: vmov r2, r12, d4 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: bfi r3, r1, #0, #8 +; CHECK-NEXT: vmov r1, r4, d3 +; CHECK-NEXT: subs.w r1, r1, lr +; CHECK-NEXT: sbcs.w r1, r4, r12 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: vmov r1, r12, d4 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vmov r4, r3, d0 ; CHECK-NEXT: vpsel q1, q1, q3 ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sbcs.w r2, r3, r12 +; CHECK-NEXT: subs r1, r4, r1 +; CHECK-NEXT: sbcs.w r1, r3, r12 ; CHECK-NEXT: vmov r4, r3, d1 -; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: bfi r1, r2, #0, #8 -; CHECK-NEXT: vmov r2, r12, d5 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: sbcs.w r2, r3, r12 -; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: bfi r1, r2, #8, #8 -; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: bfi r2, r1, #0, #8 +; CHECK-NEXT: vmov r1, r12, d5 +; CHECK-NEXT: subs r1, r4, r1 +; CHECK-NEXT: sbcs.w r1, r3, r12 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: pop {r4, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll index 72df912b25a9fc..741292c7549f6a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -152,52 +152,52 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: .pad #408 ; CHECK-NEXT: sub sp, #408 ; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals -; CHECK-NEXT: vldr s15, .LCPI1_1 +; CHECK-NEXT: vldr s7, .LCPI1_1 ; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals ; CHECK-NEXT: movw r2, :lower16:e -; CHECK-NEXT: mov r4, r7 ; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: ldr r6, [r4, #8]! -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: mov r4, r7 ; CHECK-NEXT: ldr r0, [r3, #4]! +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: ldr r6, [r4, #8]! 
+; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov s5, r3 +; CHECK-NEXT: vldr s4, .LCPI1_0 +; CHECK-NEXT: vdup.32 q7, r3 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: movt r2, :upper16:e -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r4 -; CHECK-NEXT: vmov s13, r3 -; CHECK-NEXT: vldr s12, .LCPI1_0 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r2 -; CHECK-NEXT: vdup.32 q7, r3 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r5 -; CHECK-NEXT: vstrw.32 q0, [sp, #92] ; CHECK-NEXT: vmov q0, q7 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r4 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vmov q4, q7 ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q7[1], r2 ; CHECK-NEXT: vmov s21, r2 +; CHECK-NEXT: vmov q2[3], q2[1], r5, r2 +; CHECK-NEXT: vmov.f32 s20, s4 ; CHECK-NEXT: movs r1, #64 -; CHECK-NEXT: vmov.f32 s20, s12 +; CHECK-NEXT: vmov.f32 s22, s5 ; CHECK-NEXT: str r0, [sp, #40] -; CHECK-NEXT: vmov.f32 s22, s13 +; CHECK-NEXT: vmov.f32 s23, s7 ; CHECK-NEXT: str r6, [r0] -; CHECK-NEXT: vmov.f32 s23, s15 ; CHECK-NEXT: str r0, [r0] ; CHECK-NEXT: vstrw.32 q5, [r0] ; CHECK-NEXT: vstrw.32 q7, [r0] ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q6, [r0] ; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r3 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r3 +; CHECK-NEXT: vmov q3[2], q3[0], r3, r3 +; CHECK-NEXT: vstrw.32 q2, [sp, #92] +; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 ; CHECK-NEXT: mov.w r12, #4 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r4 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 ; CHECK-NEXT: vmov.32 q4[0], r8 ; CHECK-NEXT: @ implicit-def: $r2 ; CHECK-NEXT: str.w r8, [sp, #44] -; CHECK-NEXT: vstrw.32 q3, [sp, #60] +; CHECK-NEXT: vstrw.32 q1, [sp, #60] ; CHECK-NEXT: strh.w r12, [sp, #406] ; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2 ; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1 @@ -205,10 +205,10 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: .LBB1_2: @ %entry -; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: vstrw.32 q2, [r0] ; CHECK-NEXT: str.w r8, [r7] ; CHECK-NEXT: vstrw.32 q4, [r0] -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vstrw.32 q3, [r0] ; CHECK-NEXT: str.w r12, [sp, #324] ; CHECK-NEXT: .LBB1_3: @ %for.cond ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index 70957ca950d71f..01417bba0d1a8e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll @@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: csel r7, r6, r5, hs ; CHECK-NEXT: add.w lr, r7, #1 ; CHECK-NEXT: mov r4, r5 -; CHECK-NEXT: vldrh.u16 q0, [r0], #32 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r8, r5 -; CHECK-NEXT: vldrh.u16 q1, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 +; CHECK-NEXT: vldrh.u16 q2, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2 ; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q3 +; CHECK-NEXT: vldrh.u16 q1, [r1], 
#32 ; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 +; CHECK-NEXT: vldrh.u16 q2, [r0], #32 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q3 ; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q1 +; CHECK-NEXT: vldrh.u16 q2, [r0], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q3 +; CHECK-NEXT: vldrh.u16 q1, [r1], #32 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q3 ; CHECK-NEXT: movs r6, #14 ; CHECK-NEXT: and.w r2, r6, r2, lsl #1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q1 +; CHECK-NEXT: vldrh.u16 q1, [r1, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q1, [r0] +; CHECK-NEXT: vldrht.u16 q2, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 +; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 ; CHECK-NEXT: subs r2, #8 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll index 45bb70ec44b737..b63ac9896d83af 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -602,7 +602,7 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r8, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -617,18 +617,18 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: add.w r10, r0, #2 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: add.w r11, r0, #1 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: dlstp.32 lr, r7 ; 
CHECK-NEXT: .LBB4_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 @@ -636,18 +636,18 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: add.w r9, r3, r5 ; CHECK-NEXT: vldrw.u32 q5, [r4], #16 ; CHECK-NEXT: vldrw.u32 q6, [r3], #16 -; CHECK-NEXT: add.w r12, r9, r5 +; CHECK-NEXT: add.w r8, r9, r5 ; CHECK-NEXT: vfma.f32 q3, q6, q5 ; CHECK-NEXT: vldrw.u32 q6, [r9] -; CHECK-NEXT: add.w r6, r12, r5 +; CHECK-NEXT: add.w r6, r8, r5 ; CHECK-NEXT: vfma.f32 q4, q6, q5 -; CHECK-NEXT: vldrw.u32 q6, [r12] +; CHECK-NEXT: vldrw.u32 q6, [r8] ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vfma.f32 q2, q6, q5 ; CHECK-NEXT: vldrw.u32 q6, [r6] -; CHECK-NEXT: vfma.f32 q0, q6, q5 -; CHECK-NEXT: vldrw.u32 q6, [r7] ; CHECK-NEXT: vfma.f32 q1, q6, q5 +; CHECK-NEXT: vldrw.u32 q6, [r7] +; CHECK-NEXT: vfma.f32 q0, q6, q5 ; CHECK-NEXT: letp lr, .LBB4_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 @@ -656,31 +656,31 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vadd.f32 s16, s16, s17 ; CHECK-NEXT: vadd.f32 s14, s14, s15 ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s6, s6, s7 -; CHECK-NEXT: vadd.f32 s4, s4, s5 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s1, s16, s18 -; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s6, s6, s7 +; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s12, s12, s14 -; CHECK-NEXT: vadd.f32 s4, s4, s6 -; CHECK-NEXT: vadd.f32 s6, s8, s10 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s2, s8, s10 ; CHECK-NEXT: vstr s1, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: adds r0, #5 +; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: vstr s12, [r1] ; CHECK-NEXT: add.w r1, r2, r10, lsl #2 -; CHECK-NEXT: vstr s6, [r1] +; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s0, [r1] +; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s4, [r1] +; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r8, r1 +; CHECK-NEXT: add r12, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB4_2 @@ -811,7 +811,7 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r8, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -830,39 +830,39 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r11, r0, #2 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #1 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: mov r3, r12 +; 
CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB5_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r12, r3, r5 +; CHECK-NEXT: add.w r9, r3, r5 ; CHECK-NEXT: vldrw.u32 q6, [r1], #16 ; CHECK-NEXT: vldrw.u32 q7, [r3], #16 -; CHECK-NEXT: add.w r10, r12, r5 +; CHECK-NEXT: add.w r10, r9, r5 ; CHECK-NEXT: vfma.f32 q4, q7, q6 -; CHECK-NEXT: vldrw.u32 q7, [r12] +; CHECK-NEXT: vldrw.u32 q7, [r9] ; CHECK-NEXT: add.w r6, r10, r5 ; CHECK-NEXT: vfma.f32 q5, q7, q6 ; CHECK-NEXT: vldrw.u32 q7, [r10] ; CHECK-NEXT: adds r7, r6, r5 -; CHECK-NEXT: vfma.f32 q2, q7, q6 +; CHECK-NEXT: vfma.f32 q3, q7, q6 ; CHECK-NEXT: vldrw.u32 q7, [r6] ; CHECK-NEXT: adds r6, r7, r5 -; CHECK-NEXT: vfma.f32 q0, q7, q6 +; CHECK-NEXT: vfma.f32 q1, q7, q6 ; CHECK-NEXT: vldrw.u32 q7, [r7] -; CHECK-NEXT: vfma.f32 q3, q7, q6 +; CHECK-NEXT: vfma.f32 q2, q7, q6 ; CHECK-NEXT: vldrw.u32 q7, [r6] -; CHECK-NEXT: vfma.f32 q1, q7, q6 +; CHECK-NEXT: vfma.f32 q0, q7, q6 ; CHECK-NEXT: letp lr, .LBB5_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1 @@ -871,37 +871,37 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vadd.f32 s20, s20, s21 ; CHECK-NEXT: vadd.f32 s18, s18, s19 ; CHECK-NEXT: vadd.f32 s16, s16, s17 +; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vadd.f32 s1, s20, s22 ; CHECK-NEXT: vadd.f32 s6, s6, s7 -; CHECK-NEXT: vadd.f32 s3, s16, s18 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s8, s8, s10 -; CHECK-NEXT: vadd.f32 s14, s14, s15 -; CHECK-NEXT: vadd.f32 s12, s12, s13 +; CHECK-NEXT: vadd.f32 s3, s16, s18 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s2, s8, s10 +; CHECK-NEXT: vadd.f32 s8, s12, s14 ; CHECK-NEXT: vstr s1, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: adds r0, #6 +; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: vstr s3, [r1] ; CHECK-NEXT: add.w r1, r2, r11, lsl #2 -; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: vstr s8, [r1] ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vadd.f32 s6, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s0, [r1] +; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s6, [r1] +; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s4, [r1] +; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: add r8, r1 +; CHECK-NEXT: add r12, r1 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB5_2 @@ -1044,7 +1044,7 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r8, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -1065,113 +1065,104 @@ define void @DCT_mve7(ptr 
nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q7, #0x0 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #2 ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: add.w r8, r0, #1 -; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmov q6, q2 -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: mov r12, r7 -; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: add.w r12, r0, #1 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: vmov q2, q7 +; CHECK-NEXT: vmov q5, q7 +; CHECK-NEXT: vmov q3, q7 +; CHECK-NEXT: vmov q6, q7 +; CHECK-NEXT: vmov q4, q7 +; CHECK-NEXT: mov r9, r7 +; CHECK-NEXT: vstrw.32 q7, [sp, #40] @ 16-byte Spill ; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB6_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vctp.32 r9 ; CHECK-NEXT: add.w r10, r3, r5 +; CHECK-NEXT: vstrw.32 q7, [sp, #56] @ 16-byte Spill ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 ; CHECK-NEXT: add.w r11, r10, r5 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: add.w r6, r11, r5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #40] @ 16-byte Reload ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q5, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r10] -; CHECK-NEXT: add.w r6, r11, r5 -; CHECK-NEXT: vpstt +; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: sub.w r9, r9, #4 +; CHECK-NEXT: vpstttt ; CHECK-NEXT: vfmat.f32 q6, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r11] -; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q1, q0, q7 -; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vpst +; CHECK-NEXT: vfmat.f32 q4, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r6] -; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload -; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q1, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r7] -; CHECK-NEXT: adds r6, r7, r5 -; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: vmov q1, q3 -; CHECK-NEXT: vmov q3, q4 +; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: vstrw.32 q1, [sp, #40] @ 16-byte Spill ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q3, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r6] -; CHECK-NEXT: vmov q4, q5 -; CHECK-NEXT: adds r7, r6, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q4, q0, q7 -; CHECK-NEXT: vldrwt.u32 q0, [r7] -; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload -; CHECK-NEXT: vpst +; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload +; CHECK-NEXT: vpsttt ; CHECK-NEXT: vfmat.f32 q2, q0, q7 +; CHECK-NEXT: vldrwt.u32 q0, [r7] +; CHECK-NEXT: vfmat.f32 q1, q0, q7 +; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #56] @ 16-byte Reload ; CHECK-NEXT: le lr, .LBB6_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1 ; CHECK-NEXT: vadd.f32 s0, s26, s27 -; CHECK-NEXT: add.w r1, r2, r8, lsl #2 +; CHECK-NEXT: vldrw.u32 q1, [sp, #40] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s2, s24, s25 +; CHECK-NEXT: add.w r1, r2, r12, lsl #2 ; CHECK-NEXT: vadd.f32 s1, s22, s23 ; CHECK-NEXT: vadd.f32 s3, s20, s21 -; CHECK-NEXT: 
vadd.f32 s6, s6, s7 -; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vadd.f32 s9, s18, s19 ; CHECK-NEXT: vadd.f32 s11, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NEXT: vadd.f32 s6, s6, s7 +; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s2, s3, s1 -; CHECK-NEXT: vadd.f32 s5, s18, s19 -; CHECK-NEXT: vadd.f32 s7, s16, s17 -; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s10, s12, s14 +; CHECK-NEXT: vadd.f32 s12, s11, s9 ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: vadd.f32 s20, s30, s31 ; CHECK-NEXT: adds r0, #7 -; CHECK-NEXT: vadd.f32 s12, s12, s13 +; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 -; CHECK-NEXT: vadd.f32 s8, s8, s10 -; CHECK-NEXT: vadd.f32 s6, s7, s5 -; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: vadd.f32 s10, s11, s9 +; CHECK-NEXT: vadd.f32 s22, s28, s29 +; CHECK-NEXT: vstr s12, [r1] ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: vadd.f32 s12, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s6, [r1] +; CHECK-NEXT: vadd.f32 s1, s22, s20 +; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s12, [r1] +; CHECK-NEXT: vstr s10, [r1] ; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s10, [r1] +; CHECK-NEXT: vstr s8, [r1] ; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s8, [r1] +; CHECK-NEXT: vstr s1, [r1] ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r9, r1 +; CHECK-NEXT: add r8, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB6_2 @@ -1326,7 +1317,7 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: add.w r9, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r6, r3, #2 @@ -1345,22 +1336,22 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: ldr.w r8, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #3 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: add.w r8, r0, #2 +; CHECK-NEXT: add.w r12, r0, #2 ; CHECK-NEXT: adds r1, r0, #1 -; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov q6, q3 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: vmov q7, q3 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmov q6, q0 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: mov r10, r7 -; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill +; 
CHECK-NEXT: vstrw.32 q0, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #72] @ 16-byte Spill ; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: .LBB7_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 @@ -1368,105 +1359,95 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vctp.32 r10 ; CHECK-NEXT: add.w r11, r3, r6 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 -; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 +; CHECK-NEXT: vldrwt.u32 q1, [r8], #16 +; CHECK-NEXT: vldrwt.u32 q2, [r3], #16 ; CHECK-NEXT: add.w r5, r11, r6 -; CHECK-NEXT: sub.w r10, r10, #4 +; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill ; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q6, q1, q0 -; CHECK-NEXT: vldrwt.u32 q1, [r11] -; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q7, q1, q0 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [r5] -; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload +; CHECK-NEXT: vfmat.f32 q6, q2, q1 +; CHECK-NEXT: vldrwt.u32 q2, [r11] +; CHECK-NEXT: vldrw.u32 q0, [sp, #56] @ 16-byte Reload ; CHECK-NEXT: adds r7, r5, r6 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q2, q1, q0 -; CHECK-NEXT: vldrwt.u32 q1, [r7] -; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload +; CHECK-NEXT: vfmat.f32 q7, q2, q1 +; CHECK-NEXT: vldrwt.u32 q2, [r5] ; CHECK-NEXT: adds r5, r7, r6 +; CHECK-NEXT: sub.w r10, r10, #4 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q2, q1, q0 -; CHECK-NEXT: vldrwt.u32 q1, [r5] +; CHECK-NEXT: vfmat.f32 q0, q2, q1 +; CHECK-NEXT: vldrwt.u32 q2, [r7] +; CHECK-NEXT: vstrw.32 q0, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #72] @ 16-byte Reload ; CHECK-NEXT: adds r7, r5, r6 -; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q2, q1, q0 -; CHECK-NEXT: vldrwt.u32 q1, [r7] +; CHECK-NEXT: vfmat.f32 q0, q2, q1 +; CHECK-NEXT: vldrwt.u32 q2, [r5] ; CHECK-NEXT: adds r5, r7, r6 -; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: vstrw.32 q0, [sp, #72] @ 16-byte Spill ; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q4, q1, q0 -; CHECK-NEXT: vldrwt.u32 q1, [r5] -; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: add r5, r6 +; CHECK-NEXT: vfmat.f32 q3, q2, q1 +; CHECK-NEXT: vldrwt.u32 q2, [r7] +; CHECK-NEXT: vldrw.u32 q0, [sp, #40] @ 16-byte Reload ; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q5, q1, q0 -; CHECK-NEXT: vldrwt.u32 q1, [r5] -; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q3, q1, q0 +; CHECK-NEXT: vfmat.f32 q4, q2, q1 +; CHECK-NEXT: vldrwt.u32 q2, [r5] +; CHECK-NEXT: add r5, r6 +; CHECK-NEXT: vpsttt +; CHECK-NEXT: vfmat.f32 q5, q2, q1 +; CHECK-NEXT: vldrwt.u32 q2, [r5] +; CHECK-NEXT: vfmat.f32 q0, q2, q1 ; CHECK-NEXT: le lr, .LBB7_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1 -; CHECK-NEXT: vadd.f32 s0, s30, s31 +; CHECK-NEXT: vadd.f32 s4, s30, s31 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vadd.f32 s2, s28, s29 -; CHECK-NEXT: vadd.f32 s4, s26, s27 -; CHECK-NEXT: vadd.f32 s6, s24, s25 -; CHECK-NEXT: vadd.f32 s5, s18, s19 -; CHECK-NEXT: vadd.f32 s7, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 s10, s10, s11 -; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s9, s18, s19 -; 
CHECK-NEXT: vadd.f32 s11, s16, s17 +; CHECK-NEXT: vadd.f32 s6, s28, s29 +; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s8, s26, s27 +; CHECK-NEXT: vadd.f32 s10, s24, s25 +; CHECK-NEXT: vadd.f32 s1, s14, s15 +; CHECK-NEXT: vadd.f32 s3, s12, s13 +; CHECK-NEXT: vldrw.u32 q3, [sp, #56] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 s5, s22, s23 +; CHECK-NEXT: vadd.f32 s7, s20, s21 +; CHECK-NEXT: vadd.f32 s20, s18, s19 +; CHECK-NEXT: vadd.f32 s22, s16, s17 ; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 s14, s14, s15 -; CHECK-NEXT: vadd.f32 s12, s12, s13 +; CHECK-NEXT: vadd.f32 s9, s14, s15 +; CHECK-NEXT: vadd.f32 s11, s12, s13 ; CHECK-NEXT: vadd.f32 s13, s18, s19 ; CHECK-NEXT: vadd.f32 s15, s16, s17 -; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vadd.f32 s2, s6, s4 -; CHECK-NEXT: vadd.f32 s8, s8, s10 -; CHECK-NEXT: vadd.f32 s10, s11, s9 -; CHECK-NEXT: vadd.f32 s6, s12, s14 -; CHECK-NEXT: vadd.f32 s1, s22, s23 +; CHECK-NEXT: vadd.f32 s4, s6, s4 +; CHECK-NEXT: vadd.f32 s6, s10, s8 +; CHECK-NEXT: vadd.f32 s10, s3, s1 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s12, s11, s9 +; CHECK-NEXT: vadd.f32 s2, s22, s20 ; CHECK-NEXT: vadd.f32 s14, s15, s13 -; CHECK-NEXT: vstr s0, [r1] +; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s3, s20, s21 +; CHECK-NEXT: vadd.f32 s8, s7, s5 ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: add.w r1, r2, r8, lsl #2 -; CHECK-NEXT: vadd.f32 s12, s7, s5 -; CHECK-NEXT: vstr s10, [r1] +; CHECK-NEXT: vstr s6, [r1] +; CHECK-NEXT: add.w r1, r2, r12, lsl #2 +; CHECK-NEXT: vstr s12, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vstr s14, [r1] ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: vadd.f32 s4, s3, s1 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s8, [r1] +; CHECK-NEXT: vstr s10, [r1] ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s12, [r1] +; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s4, [r1] +; CHECK-NEXT: vstr s8, [r1] ; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s6, [r1] +; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r12, r1 +; CHECK-NEXT: add r9, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB7_2 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll index 39dededb5973a2..9ac2999bba4799 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll @@ -8,23 +8,24 @@ define i32 @vaddv(i32* nocapture readonly %data, i32 %N) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov lr, r1 ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB0_4 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #32 -; CHECK-NEXT: vaddva.s32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #-16] -; CHECK-NEXT: vaddva.s32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #32 +; CHECK-NEXT: vaddva.s32 r2, q0 +; CHECK-NEXT: vldrw.u32 
q0, [r0, #-16] +; CHECK-NEXT: vaddva.s32 r2, q0 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} entry: %cmp11 = icmp sgt i32 %N, 0 @@ -59,14 +60,14 @@ define void @arm_cmplx_dot_prod_q15(i16* nocapture readonly %pSrcA, i16* nocaptu ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: mvn r7, #7 -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: add.w r7, r7, r2, lsl #1 ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: lsr.w r9, r7, #3 -; CHECK-NEXT: mov r7, r12 -; CHECK-NEXT: mov r11, r12 +; CHECK-NEXT: mov r7, r4 +; CHECK-NEXT: mov r11, r4 ; CHECK-NEXT: wls lr, r9, .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader ; CHECK-NEXT: add.w r8, r0, r9, lsl #5 @@ -74,31 +75,31 @@ define void @arm_cmplx_dot_prod_q15(i16* nocapture readonly %pSrcA, i16* nocaptu ; CHECK-NEXT: adds r0, #32 ; CHECK-NEXT: add.w r6, r1, #32 ; CHECK-NEXT: lsl.w r9, r9, #4 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r12, r11 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov r4, r11 ; CHECK-NEXT: .LBB1_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q2, [r6, #-16] ; CHECK-NEXT: vldrh.u16 q3, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r4, r11, q0, q1 -; CHECK-NEXT: vmlsldava.s16 r12, r7, q0, q1 +; CHECK-NEXT: vmlaldavax.s16 r12, r11, q0, q1 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #32 ; CHECK-NEXT: vldrh.u16 q1, [r6], #32 -; CHECK-NEXT: vmlaldavax.s16 r4, r11, q3, q2 -; CHECK-NEXT: vmlsldava.s16 r12, r7, q3, q2 +; CHECK-NEXT: vmlaldavax.s16 r12, r11, q3, q2 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q2 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %while.cond.while.end_crit_edge ; CHECK-NEXT: add.w r1, r1, r9, lsl #1 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: .LBB1_4: @ %while.end -; CHECK-NEXT: vmlaldavax.s16 r4, r11, q0, q1 -; CHECK-NEXT: vmlsldava.s16 r12, r7, q0, q1 -; CHECK-NEXT: mov r10, r4 +; CHECK-NEXT: vmlaldavax.s16 r12, r11, q0, q1 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mov r5, r11 ; CHECK-NEXT: lsrl r10, r5, #6 ; CHECK-NEXT: ldr.w r8, [sp, #36] -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r6, r4 ; CHECK-NEXT: mov r5, r7 ; CHECK-NEXT: and r2, r2, #3 ; CHECK-NEXT: lsrl r6, r5, #6 @@ -109,18 +110,18 @@ define void @arm_cmplx_dot_prod_q15(i16* nocapture readonly %pSrcA, i16* nocaptu ; CHECK-NEXT: ldrsh r6, [r1], #4 ; CHECK-NEXT: ldrsh r5, [r0, #-2] ; CHECK-NEXT: ldrsh r2, [r1, #-2] -; CHECK-NEXT: smlalbb r12, r7, r6, r9 -; CHECK-NEXT: smlalbb r4, r11, r6, r5 +; CHECK-NEXT: smlalbb r4, r7, r6, r9 +; CHECK-NEXT: smlalbb r12, r11, r6, r5 ; CHECK-NEXT: muls r5, r2, r5 -; CHECK-NEXT: smlalbb r4, r11, r2, r9 -; CHECK-NEXT: subs.w r12, r12, r5 +; CHECK-NEXT: smlalbb r12, r11, r2, r9 +; CHECK-NEXT: subs r4, r4, r5 ; CHECK-NEXT: sbc.w r7, r7, r5, asr #31 ; CHECK-NEXT: le lr, .LBB1_5 ; CHECK-NEXT: @ %bb.6: @ %while.end34.loopexit -; CHECK-NEXT: lsrl r12, r7, #6 -; CHECK-NEXT: lsrl r4, r11, #6 -; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: mov r10, r4 +; CHECK-NEXT: lsrl r4, r7, #6 +; CHECK-NEXT: lsrl r12, r11, #6 +; CHECK-NEXT: mov r6, r4 +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB1_7: @ 
%while.end34 ; CHECK-NEXT: str r6, [r3] ; CHECK-NEXT: str.w r10, [r8] diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll index 2aa183c31bab59..a33b30022f90f8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -212,9 +212,9 @@ define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %inpu ; CHECK-NEXT: beq .LBB2_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph ; CHECK-NEXT: ldr r3, [sp, #64] -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r11, [sp, #56] +; CHECK-NEXT: ldr.w r12, [sp, #56] ; CHECK-NEXT: add.w r0, r1, r3, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r3 @@ -223,57 +223,57 @@ define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %inpu ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: adds r0, r3, #7 -; CHECK-NEXT: lsrs r0, r0, #3 +; CHECK-NEXT: lsr.w r9, r0, #3 ; CHECK-NEXT: b .LBB2_5 ; CHECK-NEXT: .LBB2_3: @ in Loop: Header=BB2_5 Depth=1 -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: .LBB2_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #72] -; CHECK-NEXT: add.w r1, r8, r10 +; CHECK-NEXT: add.w r1, r10, r8 ; CHECK-NEXT: add r1, r6 -; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: strb.w r1, [r3, r9] -; CHECK-NEXT: add.w r9, r9, #1 -; CHECK-NEXT: cmp r9, r2 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: ldr r1, [sp, #72] +; CHECK-NEXT: strb.w r0, [r1, r11] +; CHECK-NEXT: add.w r11, r11, #1 +; CHECK-NEXT: cmp r11, r2 ; CHECK-NEXT: beq .LBB2_8 ; CHECK-NEXT: .LBB2_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_7 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #68] -; CHECK-NEXT: ldr.w r12, [r1, r9, lsl #2] -; CHECK-NEXT: subs r1, r0, r0 +; CHECK-NEXT: ldr r0, [sp, #68] +; CHECK-NEXT: subs.w r1, r9, r9 +; CHECK-NEXT: ldr.w r0, [r0, r11, lsl #2] ; CHECK-NEXT: ble .LBB2_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: ldr r7, [sp, #64] -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r7, r9, r7, r3 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mla r7, r11, r7, r3 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload -; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: .LBB2_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB2_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r11 -; CHECK-NEXT: vldrb.s16 q0, [r7], #8 -; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 -; CHECK-NEXT: vmlava.s16 r6, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r3], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 -; CHECK-NEXT: vmlava.s16 r8, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 -; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r7], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r12 +; CHECK-NEXT: vmlava.s16 r0, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r5], #8 +; CHECK-NEXT: vadd.i16 
q0, q0, r12 +; CHECK-NEXT: vmlava.s16 r6, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r3], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r12 +; CHECK-NEXT: vmlava.s16 r10, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r1], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r12 +; CHECK-NEXT: vmlava.s16 r8, q1, q0 ; CHECK-NEXT: le lr, .LBB2_7 ; CHECK-NEXT: b .LBB2_4 ; CHECK-NEXT: .LBB2_8: @ %if.end @@ -395,9 +395,9 @@ define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readon ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph ; CHECK-NEXT: ldr r3, [sp, #64] -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r11, [sp, #56] +; CHECK-NEXT: ldr.w r12, [sp, #56] ; CHECK-NEXT: add.w r0, r1, r3, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r3 @@ -406,57 +406,57 @@ define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readon ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: adds r0, r3, #7 -; CHECK-NEXT: lsrs r0, r0, #3 +; CHECK-NEXT: lsr.w r9, r0, #3 ; CHECK-NEXT: .LBB3_3: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_5 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #68] -; CHECK-NEXT: ldr.w r12, [r1, r9, lsl #2] -; CHECK-NEXT: subs r1, r0, r0 +; CHECK-NEXT: ldr r0, [sp, #68] +; CHECK-NEXT: subs.w r1, r9, r9 +; CHECK-NEXT: ldr.w r0, [r0, r11, lsl #2] ; CHECK-NEXT: ble .LBB3_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB3_3 Depth=1 ; CHECK-NEXT: ldr r7, [sp, #64] -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r7, r9, r7, r3 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mla r7, r11, r7, r3 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload -; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: .LBB3_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB3_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r11 -; CHECK-NEXT: vldrb.s16 q0, [r7], #8 -; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 -; CHECK-NEXT: vmlava.s16 r6, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r3], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 -; CHECK-NEXT: vmlava.s16 r8, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 -; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r7], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r12 +; CHECK-NEXT: vmlava.s16 r0, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r5], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r12 +; CHECK-NEXT: vmlava.s16 r6, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r3], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r12 +; CHECK-NEXT: vmlava.s16 r10, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r1], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r12 +; CHECK-NEXT: vmlava.s16 r8, q1, q0 ; CHECK-NEXT: le lr, .LBB3_5 ; CHECK-NEXT: b .LBB3_7 ; CHECK-NEXT: .LBB3_6: @ in Loop: Header=BB3_3 Depth=1 -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: .LBB3_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB3_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #72] -; CHECK-NEXT: add.w r1, r8, r10 +; CHECK-NEXT: 
add.w r1, r10, r8 ; CHECK-NEXT: add r1, r6 -; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: strb.w r1, [r3, r9] -; CHECK-NEXT: add.w r9, r9, #1 -; CHECK-NEXT: cmp r9, r2 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: ldr r1, [sp, #72] +; CHECK-NEXT: strb.w r0, [r1, r11] +; CHECK-NEXT: add.w r11, r11, #1 +; CHECK-NEXT: cmp r11, r2 ; CHECK-NEXT: bne .LBB3_3 ; CHECK-NEXT: .LBB3_8: @ %if.end ; CHECK-NEXT: ldr r0, [sp, #72] @@ -573,9 +573,9 @@ define i32 @arm_nn_mat_mul_core_4x_s8(i32 %row_elements, i32 %offset, i8* %row_b ; CHECK-NEXT: blt .LBB4_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: add.w r5, r2, r1, lsl #1 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: adds r7, r2, r1 ; CHECK-NEXT: add.w r1, r1, r1, lsl #1 @@ -585,24 +585,24 @@ define i32 @arm_nn_mat_mul_core_4x_s8(i32 %row_elements, i32 %offset, i8* %row_b ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r3], #16 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 -; CHECK-NEXT: vmlava.s8 r10, q1, q0 +; CHECK-NEXT: vmlava.s8 r8, q1, q0 ; CHECK-NEXT: vldrb.u8 q1, [r5], #16 -; CHECK-NEXT: vmlava.s8 r4, q1, q0 +; CHECK-NEXT: vmlava.s8 r10, q1, q0 ; CHECK-NEXT: vldrb.u8 q1, [r7], #16 -; CHECK-NEXT: vmlava.s8 r6, q1, q0 +; CHECK-NEXT: vmlava.s8 r4, q1, q0 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16 -; CHECK-NEXT: vmlava.s8 r8, q1, q0 +; CHECK-NEXT: vmlava.s8 r6, q1, q0 ; CHECK-NEXT: letp lr, .LBB4_2 ; CHECK-NEXT: b .LBB4_4 ; CHECK-NEXT: .LBB4_3: +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: .LBB4_4: @ %for.cond.cleanup ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: strd r8, r6, [r12] -; CHECK-NEXT: strd r4, r10, [r12, #8] +; CHECK-NEXT: strd r6, r4, [r12] +; CHECK-NEXT: strd r10, r8, [r12, #8] ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r10, pc} entry: %add = add nsw i32 %row_elements, 15 @@ -677,21 +677,21 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #28 -; CHECK-NEXT: sub sp, #28 -; CHECK-NEXT: add.w r12, sp, #12 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill +; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: bne .LBB5_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r2, [sp, #92] -; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: ldr r2, [sp, #96] +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #76] +; CHECK-NEXT: ldr.w r9, [sp, #80] ; CHECK-NEXT: add.w r0, r1, r2, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r2 @@ -703,61 +703,65 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16 ; CHECK-NEXT: lsrs r1, r0, #3 ; CHECK-NEXT: b .LBB5_5 ; CHECK-NEXT: .LBB5_3: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r6, r10 ; 
CHECK-NEXT: .LBB5_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: add.w r0, r8, r10 -; CHECK-NEXT: ldr r1, [sp, #100] -; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: strb.w r0, [r1, r11] -; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: cmp r11, r0 +; CHECK-NEXT: ldr r1, [sp, #104] +; CHECK-NEXT: add r0, r6 +; CHECK-NEXT: add r0, r10 +; CHECK-NEXT: strb.w r0, [r1, r8] +; CHECK-NEXT: add.w r8, r8, #1 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: cmp r8, r0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_7 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #96] +; CHECK-NEXT: ldr r0, [sp, #100] ; CHECK-NEXT: cmp r1, r1 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: ldr.w r10, [r0, r8, lsl #2] ; CHECK-NEXT: ble .LBB5_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: ldr.w lr, [sp, #92] +; CHECK-NEXT: ldr.w r11, [sp, #96] +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: ldr.w lr, [sp, #96] +; CHECK-NEXT: mla r3, r8, r11, r0 +; CHECK-NEXT: subs r0, r1, r1 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r3, r11, lr, r0 -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: ldm.w sp, {r0, r5, r7} @ 12-byte Folded Reload +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldm.w sp, {r4, r5, r7} @ 12-byte Folded Reload ; CHECK-NEXT: dlstp.16 lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q0, [r0], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r4 -; CHECK-NEXT: vldrb.s16 q0, [r3], #8 -; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r7], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r6, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r8, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q0, [r4], #8 +; CHECK-NEXT: vldrb.s16 q1, [r3], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r9 +; CHECK-NEXT: vmlava.s16 r10, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r9 +; CHECK-NEXT: vmlava.s16 r6, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r5], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r9 +; CHECK-NEXT: vmlava.s16 r0, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r1], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r9 +; CHECK-NEXT: vmlava.s16 r12, q1, q0 ; CHECK-NEXT: letp lr, .LBB5_7 ; CHECK-NEXT: b .LBB5_4 ; CHECK-NEXT: .LBB5_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #100] -; CHECK-NEXT: add sp, #28 +; CHECK-NEXT: ldr r0, [sp, #104] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 @@ -866,21 +870,21 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, 
r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #28 -; CHECK-NEXT: sub sp, #28 -; CHECK-NEXT: add.w r12, sp, #12 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill +; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: bne .LBB6_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r2, [sp, #92] -; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: ldr r2, [sp, #96] +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #76] +; CHECK-NEXT: ldr.w r9, [sp, #80] ; CHECK-NEXT: add.w r0, r1, r2, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r2 @@ -893,59 +897,63 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_ ; CHECK-NEXT: .LBB6_3: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_5 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #96] +; CHECK-NEXT: ldr r0, [sp, #100] ; CHECK-NEXT: cmp r1, r1 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: ldr.w r10, [r0, r8, lsl #2] ; CHECK-NEXT: ble .LBB6_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: ldr.w lr, [sp, #92] +; CHECK-NEXT: ldr.w r11, [sp, #96] +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: ldr.w lr, [sp, #96] +; CHECK-NEXT: mla r3, r8, r11, r0 +; CHECK-NEXT: subs r0, r1, r1 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r3, r11, lr, r0 -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: ldm.w sp, {r0, r5, r7} @ 12-byte Folded Reload +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldm.w sp, {r4, r5, r7} @ 12-byte Folded Reload ; CHECK-NEXT: dlstp.16 lr, lr ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q0, [r0], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r4 -; CHECK-NEXT: vldrb.s16 q0, [r3], #8 -; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r7], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r6, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r8, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q0, [r4], #8 +; CHECK-NEXT: vldrb.s16 q1, [r3], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r9 +; CHECK-NEXT: vmlava.s16 r10, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r9 +; CHECK-NEXT: vmlava.s16 r6, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r5], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r9 +; CHECK-NEXT: vmlava.s16 r0, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r1], #8 +; CHECK-NEXT: vadd.i16 q0, q0, r9 +; CHECK-NEXT: vmlava.s16 r12, q1, q0 ; CHECK-NEXT: letp lr, .LBB6_5 ; CHECK-NEXT: b .LBB6_7 ; CHECK-NEXT: .LBB6_6: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: mov r10, r12 -; 
CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: add.w r0, r8, r10 -; CHECK-NEXT: ldr r1, [sp, #100] -; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: strb.w r0, [r1, r11] -; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: cmp r11, r0 +; CHECK-NEXT: ldr r1, [sp, #104] +; CHECK-NEXT: add r0, r6 +; CHECK-NEXT: add r0, r10 +; CHECK-NEXT: strb.w r0, [r1, r8] +; CHECK-NEXT: add.w r8, r8, #1 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: cmp r8, r0 ; CHECK-NEXT: bne .LBB6_3 ; CHECK-NEXT: .LBB6_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #100] -; CHECK-NEXT: add sp, #28 +; CHECK-NEXT: ldr r0, [sp, #104] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 @@ -1125,26 +1133,26 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf ; CHECK-NEXT: .LBB7_7: @ Parent Loop BB7_3 Depth=1 ; CHECK-NEXT: @ Parent Loop BB7_6 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vldrw.u32 q3, [r9] -; CHECK-NEXT: vldrw.u32 q4, [r2] -; CHECK-NEXT: vldrw.u32 q6, [r8] -; CHECK-NEXT: vldrw.u32 q7, [r1] -; CHECK-NEXT: vsub.f32 q5, q4, q3 -; CHECK-NEXT: vsub.f32 q0, q7, q6 -; CHECK-NEXT: vcadd.f32 q1, q0, q5, #270 -; CHECK-NEXT: vcadd.f32 q2, q0, q5, #90 -; CHECK-NEXT: vadd.f32 q0, q4, q3 -; CHECK-NEXT: vadd.f32 q3, q6, q7 -; CHECK-NEXT: vsub.f32 q4, q3, q0 -; CHECK-NEXT: vadd.f32 q0, q3, q0 -; CHECK-NEXT: vstrb.8 q0, [r1], #16 -; CHECK-NEXT: vldrw.u32 q0, [r7], #16 -; CHECK-NEXT: vcmul.f32 q3, q0, q4, #0 -; CHECK-NEXT: vcmla.f32 q3, q0, q4, #90 +; CHECK-NEXT: vldrw.u32 q2, [r9] +; CHECK-NEXT: vldrw.u32 q3, [r2] +; CHECK-NEXT: vldrw.u32 q5, [r8] +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vsub.f32 q4, q3, q2 +; CHECK-NEXT: vadd.f32 q2, q3, q2 +; CHECK-NEXT: vsub.f32 q7, q6, q5 +; CHECK-NEXT: vadd.f32 q3, q5, q6 +; CHECK-NEXT: vcadd.f32 q1, q7, q4, #270 +; CHECK-NEXT: vcadd.f32 q0, q7, q4, #90 +; CHECK-NEXT: vsub.f32 q4, q3, q2 +; CHECK-NEXT: vadd.f32 q2, q3, q2 +; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r7], #16 +; CHECK-NEXT: vcmul.f32 q3, q2, q4, #0 +; CHECK-NEXT: vcmla.f32 q3, q2, q4, #90 ; CHECK-NEXT: vstrb.8 q3, [r2], #16 -; CHECK-NEXT: vldrw.u32 q0, [r3], #16 -; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0 -; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90 +; CHECK-NEXT: vldrw.u32 q2, [r3], #16 +; CHECK-NEXT: vcmul.f32 q3, q2, q0, #0 +; CHECK-NEXT: vcmla.f32 q3, q2, q0, #90 ; CHECK-NEXT: vstrb.8 q3, [r8], #16 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16 ; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll index 470007878ec842..8e1e59c4d632a6 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -62,25 +62,25 @@ define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) { ; CHECK-LE-NEXT: vmov.i8 q1, #0x0 ; CHECK-LE-NEXT: vmov.i8 q2, #0xff ; CHECK-LE-NEXT: vmsr p0, r0 -; CHECK-LE-NEXT: vpsel q2, q2, q1 -; CHECK-LE-NEXT: vmov.u8 r0, q2[0] -; CHECK-LE-NEXT: vmov.16 q1[0], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[1] -; CHECK-LE-NEXT: vmov.16 q1[1], r0 -; CHECK-LE-NEXT: vmov.u8 
r0, q2[2] -; CHECK-LE-NEXT: vmov.16 q1[2], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[3] -; CHECK-LE-NEXT: vmov.16 q1[3], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[4] -; CHECK-LE-NEXT: vmov.16 q1[4], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[5] -; CHECK-LE-NEXT: vmov.16 q1[5], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[6] -; CHECK-LE-NEXT: vmov.16 q1[6], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[7] -; CHECK-LE-NEXT: vmov.16 q1[7], r0 -; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-LE-NEXT: vpsel q1, q2, q1 +; CHECK-LE-NEXT: vmov.u8 r0, q1[0] +; CHECK-LE-NEXT: vmov.16 q2[0], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[1] +; CHECK-LE-NEXT: vmov.16 q2[1], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[2] +; CHECK-LE-NEXT: vmov.16 q2[2], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[3] +; CHECK-LE-NEXT: vmov.16 q2[3], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[4] +; CHECK-LE-NEXT: vmov.16 q2[4], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[5] +; CHECK-LE-NEXT: vmov.16 q2[5], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[6] +; CHECK-LE-NEXT: vmov.16 q2[6], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[7] +; CHECK-LE-NEXT: vmov.16 q2[7], r0 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vcmp.i16 ne, q2, zr ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr @@ -95,26 +95,26 @@ define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) { ; CHECK-BE-NEXT: vmov.i8 q2, #0xff ; CHECK-BE-NEXT: lsrs r0, r0, #24 ; CHECK-BE-NEXT: vmsr p0, r0 -; CHECK-BE-NEXT: vpsel q2, q2, q1 -; CHECK-BE-NEXT: vmov.u8 r0, q2[0] -; CHECK-BE-NEXT: vmov.16 q1[0], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[1] -; CHECK-BE-NEXT: vmov.16 q1[1], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[2] -; CHECK-BE-NEXT: vmov.16 q1[2], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[3] -; CHECK-BE-NEXT: vmov.16 q1[3], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[4] -; CHECK-BE-NEXT: vmov.16 q1[4], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[5] -; CHECK-BE-NEXT: vmov.16 q1[5], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[6] -; CHECK-BE-NEXT: vmov.16 q1[6], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[7] -; CHECK-BE-NEXT: vmov.16 q1[7], r0 -; CHECK-BE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-BE-NEXT: vpsel q1, q2, q1 +; CHECK-BE-NEXT: vmov.u8 r0, q1[0] +; CHECK-BE-NEXT: vmov.16 q2[0], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[1] +; CHECK-BE-NEXT: vmov.16 q2[1], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[2] +; CHECK-BE-NEXT: vmov.16 q2[2], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[3] +; CHECK-BE-NEXT: vmov.16 q2[3], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[4] +; CHECK-BE-NEXT: vmov.16 q2[4], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[5] +; CHECK-BE-NEXT: vmov.16 q2[5], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[6] +; CHECK-BE-NEXT: vmov.16 q2[6], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[7] +; CHECK-BE-NEXT: vmov.16 q2[7], r0 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr ; CHECK-BE-NEXT: vrev32.16 q0, q0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.16 q0, q1 @@ -261,33 +261,33 @@ define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) { ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr -; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r0, r1, #1 -; CHECK-LE-NEXT: rsbs r2, r0, #0 -; CHECK-LE-NEXT: movs r0, #0 -; CHECK-LE-NEXT: bfi r0, r2, #0, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #1, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #2, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #3, #1 -; CHECK-LE-NEXT: 
ubfx r2, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #4, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #5, #1 -; CHECK-LE-NEXT: ubfx r2, r1, #12, #1 -; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r0, r2, #6, #1 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r0, p0 +; CHECK-LE-NEXT: and r1, r0, #1 ; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r0, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r0, r0 +; CHECK-LE-NEXT: bfi r2, r1, #0, #1 +; CHECK-LE-NEXT: ubfx r1, r0, #2, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #1, #1 +; CHECK-LE-NEXT: ubfx r1, r0, #4, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #2, #1 +; CHECK-LE-NEXT: ubfx r1, r0, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #3, #1 +; CHECK-LE-NEXT: ubfx r1, r0, #8, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #4, #1 +; CHECK-LE-NEXT: ubfx r1, r0, #10, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #5, #1 +; CHECK-LE-NEXT: ubfx r1, r0, #12, #1 +; CHECK-LE-NEXT: ubfx r0, r0, #14, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r0, r0, #0 +; CHECK-LE-NEXT: bfi r2, r0, #7, #1 +; CHECK-LE-NEXT: uxtb r0, r2 ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; @@ -296,34 +296,34 @@ define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) { ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.i16 eq, q1, zr -; CHECK-BE-NEXT: vmrs r1, p0 -; CHECK-BE-NEXT: ubfx r0, r1, #14, #1 -; CHECK-BE-NEXT: rsbs r2, r0, #0 -; CHECK-BE-NEXT: movs r0, #0 -; CHECK-BE-NEXT: bfi r0, r2, #0, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #1, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #10, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #2, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #3, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #6, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #4, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #5, #1 -; CHECK-BE-NEXT: ubfx r2, r1, #2, #1 -; CHECK-BE-NEXT: and r1, r1, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r0, r2, #6, #1 +; CHECK-BE-NEXT: vmrs r0, p0 +; CHECK-BE-NEXT: ubfx r1, r0, #14, #1 ; CHECK-BE-NEXT: rsbs r1, r1, #0 -; CHECK-BE-NEXT: bfi r0, r1, #7, #1 -; CHECK-BE-NEXT: uxtb r0, r0 +; CHECK-BE-NEXT: bfi r2, r1, #0, #1 +; CHECK-BE-NEXT: ubfx r1, r0, #12, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #1, #1 +; CHECK-BE-NEXT: ubfx r1, r0, #10, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #2, #1 +; CHECK-BE-NEXT: ubfx r1, r0, #8, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #3, #1 +; CHECK-BE-NEXT: ubfx r1, r0, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #4, #1 +; CHECK-BE-NEXT: ubfx r1, r0, #4, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #5, #1 +; CHECK-BE-NEXT: ubfx r1, r0, #2, #1 +; CHECK-BE-NEXT: and r0, r0, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r0, r0, #0 +; CHECK-BE-NEXT: bfi r2, r0, #7, #1 +; CHECK-BE-NEXT: uxtb r0, r2 ; CHECK-BE-NEXT: add sp, #4 ; 
CHECK-BE-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll index a92adf6f1a067b..523301b51d0fe5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll @@ -55,25 +55,25 @@ define arm_aapcs_vfpcc <8 x i16> @load_v8i1(ptr %src, <8 x i16> %a) { ; CHECK-LE-NEXT: vmov.i8 q1, #0x0 ; CHECK-LE-NEXT: vmov.i8 q2, #0xff ; CHECK-LE-NEXT: vmsr p0, r0 -; CHECK-LE-NEXT: vpsel q2, q2, q1 -; CHECK-LE-NEXT: vmov.u8 r0, q2[0] -; CHECK-LE-NEXT: vmov.16 q1[0], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[1] -; CHECK-LE-NEXT: vmov.16 q1[1], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[2] -; CHECK-LE-NEXT: vmov.16 q1[2], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[3] -; CHECK-LE-NEXT: vmov.16 q1[3], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[4] -; CHECK-LE-NEXT: vmov.16 q1[4], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[5] -; CHECK-LE-NEXT: vmov.16 q1[5], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[6] -; CHECK-LE-NEXT: vmov.16 q1[6], r0 -; CHECK-LE-NEXT: vmov.u8 r0, q2[7] -; CHECK-LE-NEXT: vmov.16 q1[7], r0 -; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-LE-NEXT: vpsel q1, q2, q1 +; CHECK-LE-NEXT: vmov.u8 r0, q1[0] +; CHECK-LE-NEXT: vmov.16 q2[0], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[1] +; CHECK-LE-NEXT: vmov.16 q2[1], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[2] +; CHECK-LE-NEXT: vmov.16 q2[2], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[3] +; CHECK-LE-NEXT: vmov.16 q2[3], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[4] +; CHECK-LE-NEXT: vmov.16 q2[4], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[5] +; CHECK-LE-NEXT: vmov.16 q2[5], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[6] +; CHECK-LE-NEXT: vmov.16 q2[6], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[7] +; CHECK-LE-NEXT: vmov.16 q2[7], r0 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vcmp.i16 ne, q2, zr ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; @@ -85,26 +85,26 @@ define arm_aapcs_vfpcc <8 x i16> @load_v8i1(ptr %src, <8 x i16> %a) { ; CHECK-BE-NEXT: rbit r0, r0 ; CHECK-BE-NEXT: lsrs r0, r0, #24 ; CHECK-BE-NEXT: vmsr p0, r0 -; CHECK-BE-NEXT: vpsel q2, q2, q1 -; CHECK-BE-NEXT: vmov.u8 r0, q2[0] -; CHECK-BE-NEXT: vmov.16 q1[0], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[1] -; CHECK-BE-NEXT: vmov.16 q1[1], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[2] -; CHECK-BE-NEXT: vmov.16 q1[2], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[3] -; CHECK-BE-NEXT: vmov.16 q1[3], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[4] -; CHECK-BE-NEXT: vmov.16 q1[4], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[5] -; CHECK-BE-NEXT: vmov.16 q1[5], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[6] -; CHECK-BE-NEXT: vmov.16 q1[6], r0 -; CHECK-BE-NEXT: vmov.u8 r0, q2[7] -; CHECK-BE-NEXT: vmov.16 q1[7], r0 -; CHECK-BE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-BE-NEXT: vpsel q1, q2, q1 +; CHECK-BE-NEXT: vmov.u8 r0, q1[0] +; CHECK-BE-NEXT: vmov.16 q2[0], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[1] +; CHECK-BE-NEXT: vmov.16 q2[1], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[2] +; CHECK-BE-NEXT: vmov.16 q2[2], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[3] +; CHECK-BE-NEXT: vmov.16 q2[3], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[4] +; CHECK-BE-NEXT: vmov.16 q2[4], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[5] +; CHECK-BE-NEXT: vmov.16 q2[5], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[6] +; CHECK-BE-NEXT: vmov.16 q2[6], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[7] +; CHECK-BE-NEXT: vmov.16 q2[7], r0 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr ; CHECK-BE-NEXT: vrev32.16 q0, q0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.16 q0, q1 @@ -234,66 +234,66 @@ define 
arm_aapcs_vfpcc void @store_v8i1(ptr %dst, <8 x i16> %a) { ; CHECK-LE-LABEL: store_v8i1: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr -; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: bfi r1, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #3, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #4, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #5, #1 -; CHECK-LE-NEXT: ubfx r3, r2, #12, #1 -; CHECK-LE-NEXT: ubfx r2, r2, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r1, r3, #6, #1 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r2, r1, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #10, #1 ; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #7, #1 -; CHECK-LE-NEXT: strb r1, [r0] +; CHECK-LE-NEXT: bfi r3, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r3, r1, #7, #1 +; CHECK-LE-NEXT: strb r3, [r0] ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: store_v8i1: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: vcmp.i16 eq, q1, zr -; CHECK-BE-NEXT: vmrs r2, p0 -; CHECK-BE-NEXT: ubfx r1, r2, #14, #1 -; CHECK-BE-NEXT: rsbs r3, r1, #0 -; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: bfi r1, r3, #0, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #1, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #10, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #2, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #3, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #6, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #4, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #5, #1 -; CHECK-BE-NEXT: ubfx r3, r2, #2, #1 -; CHECK-BE-NEXT: and r2, r2, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: bfi r1, r3, #6, #1 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r2, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #10, #1 ; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: bfi r1, r2, #7, #1 -; CHECK-BE-NEXT: strb r1, [r0] +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #8, 
#1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-BE-NEXT: and r1, r1, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r3, r1, #7, #1 +; CHECK-BE-NEXT: strb r3, [r0] ; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <8 x i16> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll index cca15165e012eb..b86a705273d2d9 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -526,27 +526,27 @@ define <8 x i16> @shuffle5_b_v8i16(<16 x i8> %src, <8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: vcmp.i8 eq, q0, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q1, q1, q0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.16 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.16 q1[7], r0 ; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vcmp.i16 ne, q1, zr ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 @@ -567,27 +567,27 @@ define <8 x i16> @shuffle5_t_v8i16(<16 x i8> %src, <8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: vcmp.i8 eq, q0, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q1, q1, q0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.16 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.16 q1[4], 
r0 +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.16 q1[7], r0 ; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vcmp.i16 ne, q1, zr ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 @@ -641,32 +641,32 @@ define <8 x i16> @shuffle6_v4i32(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> %a, ; CHECK-LABEL: shuffle6_v4i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff ; CHECK-NEXT: vmov d0, r0, r1 -; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q3, q2, q1 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r0, r1, d7 -; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q2, q1, q0 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.16 q3[1], r1 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov.16 q3[2], r0 ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vcmp.i32 eq, q3, zr -; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.16 q3[3], r1 +; CHECK-NEXT: vcmp.i32 eq, q2, zr +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.16 q3[5], r1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov.16 q3[6], r0 ; CHECK-NEXT: add r0, sp, #32 -; CHECK-NEXT: vmov.16 q0[7], r1 -; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmov.16 q3[7], r1 ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmp.i16 ne, q3, zr ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll index 747021e5c64eb3..f70af5661f4c90 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll @@ -383,27 +383,27 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1_i1(<2 x i64> %a, <2 x i64> %b, i64 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: orr.w r2, r0, r1 +; CHECK-NEXT: orr.w r3, r0, r1 ; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vmov r1, r2, d3 ; CHECK-NEXT: csetm r12, eq ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: orrs r1, r3 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: csetm r4, eq +; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: csetm lr, eq -; CHECK-NEXT: orrs r1, r3 -; CHECK-NEXT: vmov r1, r4, d1 -; CHECK-NEXT: csetm r3, eq -; CHECK-NEXT: orrs r1, r4 +; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: cbz r2, .LBB15_2 +; CHECK-NEXT: cbz r3, .LBB15_2 ; CHECK-NEXT: @ %bb.1: @ %select.false ; CHECK-NEXT: bfi r0, r12, #0, #8 -; CHECK-NEXT: bfi r0, lr, #8, #8 +; CHECK-NEXT: bfi r0, r4, #8, #8 ; CHECK-NEXT: b .LBB15_3 ; CHECK-NEXT: .LBB15_2: -; CHECK-NEXT: bfi r0, r3, #0, #8 
+; CHECK-NEXT: bfi r0, lr, #0, #8 ; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: .LBB15_3: @ %select.end ; CHECK-NEXT: vmsr p0, r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index b5d981ef340254..54087235ad3580 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -17,44 +17,45 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r8, r1 -; CHECK-NEXT: mov r10, r11 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: mov r9, r11 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: @ %vector.ph ; CHECK-NEXT: bic r2, r3, #1 -; CHECK-NEXT: adr r4, .LCPI0_0 +; CHECK-NEXT: adr r5, .LCPI0_0 ; CHECK-NEXT: subs r7, r2, #2 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r10, r11, r2, lsl #2 +; CHECK-NEXT: add.w r9, r11, r2, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r8, r1, r2, lsl #2 +; CHECK-NEXT: add.w r6, r1, r2, lsl #2 ; CHECK-NEXT: add.w r12, r0, r2, lsl #2 -; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r2, [r0], #8 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: ldrd r7, r6, [r1], #8 -; CHECK-NEXT: smull r4, r7, r7, r4 -; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 -; CHECK-NEXT: mov.w r9, #-1 -; CHECK-NEXT: sbcs.w r3, r9, r7 -; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r5, r3, #0, #8 -; CHECK-NEXT: smull r2, r3, r6, r2 -; CHECK-NEXT: asrl r2, r3, #31 -; CHECK-NEXT: rsbs.w r6, r2, #-2147483648 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: sbcs.w r6, r9, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r7, r3 -; CHECK-NEXT: csetm r6, lt -; CHECK-NEXT: bfi r5, r6, #8, #8 -; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: ldrd r7, r2, [r0], #8 +; CHECK-NEXT: mov.w r3, #-1 +; CHECK-NEXT: ldrd r5, r8, [r1], #8 +; CHECK-NEXT: smull r10, r7, r5, r7 +; CHECK-NEXT: asrl r10, r7, #31 +; CHECK-NEXT: rsbs.w r5, r10, #-2147483648 +; CHECK-NEXT: sbcs.w r5, r3, r7 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: csetm r5, lt +; CHECK-NEXT: bfi r3, r5, #0, #8 +; CHECK-NEXT: smull r4, r5, r8, r2 +; CHECK-NEXT: asrl r4, r5, #31 +; CHECK-NEXT: rsbs.w r2, r4, #-2147483648 +; CHECK-NEXT: vmov q2[2], q2[0], r10, r4 +; CHECK-NEXT: mov.w r2, #-1 +; CHECK-NEXT: vmov q2[3], q2[1], r7, r5 +; CHECK-NEXT: sbcs r2, r5 ; CHECK-NEXT: mvn r5, #-2147483648 +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q2, q2, q0 ; CHECK-NEXT: vmov r2, r3, d4 ; CHECK-NEXT: subs r2, r2, r5 @@ -85,7 +86,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r2, [r12], #4 -; CHECK-NEXT: ldr r4, [r8], #4 +; CHECK-NEXT: ldr r4, [r6], #4 ; CHECK-NEXT: smull r2, r5, r4, r2 ; CHECK-NEXT: asrl r2, r5, #31 ; CHECK-NEXT: subs r4, r1, r2 @@ -97,7 +98,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: subs r5, r2, r3 ; CHECK-NEXT: sbcs r4, r4, #0 ; CHECK-NEXT: csel r2, r2, r3, lt -; CHECK-NEXT: str r2, [r10], #4 +; CHECK-NEXT: str r2, [r9], #4 ; CHECK-NEXT: le lr, .LBB0_7 ; CHECK-NEXT: .LBB0_8: @ 
%for.cond.cleanup ; CHECK-NEXT: add sp, #8 @@ -191,112 +192,112 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB1_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r9, r5 -; CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: mov r9, r12 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r10, r2 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_3: @ %vector.ph -; CHECK-NEXT: bic r1, r3, #3 -; CHECK-NEXT: adr r4, .LCPI1_0 -; CHECK-NEXT: subs r7, r1, #4 +; CHECK-NEXT: bic r0, r3, #3 +; CHECK-NEXT: adr r5, .LCPI1_0 +; CHECK-NEXT: subs r7, r0, #4 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: adr r4, .LCPI1_1 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r10, r2, r0, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r11, r2, r1, lsl #2 -; CHECK-NEXT: add.w r9, r5, r1, lsl #2 -; CHECK-NEXT: add.w r12, r0, r1, lsl #2 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: adr r7, .LCPI1_1 +; CHECK-NEXT: add.w r11, r1, r0, lsl #2 +; CHECK-NEXT: add.w r9, r12, r0, lsl #2 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: vldrw.u32 q1, [r7] ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [r5], #16 -; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov.w r2, #-1 +; CHECK-NEXT: vldrw.u32 q3, [r12], #16 +; CHECK-NEXT: vldrw.u32 q4, [r1], #16 +; CHECK-NEXT: mov.w r6, #-1 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: vmov.f32 s20, s18 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: vmov.f32 s10, s15 ; CHECK-NEXT: vmov.f32 s22, s19 ; CHECK-NEXT: vmullb.s32 q6, q5, q2 -; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov r4, r7, d12 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 -; CHECK-NEXT: sbcs.w r5, r2, r7 +; CHECK-NEXT: sbcs.w r5, r6, r7 ; CHECK-NEXT: csetm r5, lt ; CHECK-NEXT: bfi r8, r5, #0, #8 -; CHECK-NEXT: vmov r10, r5, d13 -; CHECK-NEXT: asrl r10, r5, #31 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r10 -; CHECK-NEXT: sbcs.w r3, r2, r5 +; CHECK-NEXT: vmov r12, r5, d13 +; CHECK-NEXT: asrl r12, r5, #31 +; CHECK-NEXT: rsbs.w r3, r12, #-2147483648 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r12 +; CHECK-NEXT: sbcs.w r3, r6, r5 ; CHECK-NEXT: vmov q2[3], q2[1], r7, r5 ; CHECK-NEXT: csetm r3, lt +; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: bfi r8, r3, #8, #8 +; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmsr p0, r8 ; CHECK-NEXT: mvn r8, #-2147483648 ; CHECK-NEXT: vpsel q2, q2, q0 -; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: vmov r3, r7, d4 ; CHECK-NEXT: subs.w r3, r3, r8 -; CHECK-NEXT: sbcs r3, r4, #0 -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: sbcs r3, r7, #0 +; CHECK-NEXT: 
mov.w r7, #0 ; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r4, r3, #0, #8 +; CHECK-NEXT: bfi r7, r3, #0, #8 ; CHECK-NEXT: vmov r3, r5, d5 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: bfi r7, r3, #8, #8 ; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmsr p0, r7 +; CHECK-NEXT: vmov r7, s16 ; CHECK-NEXT: vpsel q2, q2, q1 -; CHECK-NEXT: smull r4, r7, r4, r3 +; CHECK-NEXT: smull r4, r7, r7, r3 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 -; CHECK-NEXT: sbcs.w r3, r2, r7 +; CHECK-NEXT: sbcs.w r3, r6, r7 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r5, r3, #0, #8 ; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: smull r6, r3, r6, r3 -; CHECK-NEXT: asrl r6, r3, #31 -; CHECK-NEXT: rsbs.w r1, r6, #-2147483648 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r6 -; CHECK-NEXT: sbcs.w r1, r2, r3 +; CHECK-NEXT: smull r0, r3, r0, r3 +; CHECK-NEXT: asrl r0, r3, #31 +; CHECK-NEXT: rsbs.w r6, r0, #-2147483648 +; CHECK-NEXT: vmov q3[2], q3[0], r4, r0 +; CHECK-NEXT: mov.w r6, #-1 ; CHECK-NEXT: vmov q3[3], q3[1], r7, r3 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r5, r1, #8, #8 +; CHECK-NEXT: sbcs r6, r3 +; CHECK-NEXT: csetm r6, lt +; CHECK-NEXT: bfi r5, r6, #8, #8 ; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload ; CHECK-NEXT: vpsel q3, q3, q0 -; CHECK-NEXT: vmov r1, r3, d6 -; CHECK-NEXT: subs.w r1, r1, r8 -; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: vmov r0, r3, d6 +; CHECK-NEXT: subs.w r0, r0, r8 +; CHECK-NEXT: sbcs r0, r3, #0 ; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r3, r1, #0, #8 -; CHECK-NEXT: vmov r1, r4, d7 -; CHECK-NEXT: subs.w r1, r1, r8 -; CHECK-NEXT: sbcs r1, r4, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r3, r0, #0, #8 +; CHECK-NEXT: vmov r0, r7, d7 +; CHECK-NEXT: subs.w r0, r0, r8 +; CHECK-NEXT: sbcs r0, r7, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q3, q3, q1 ; CHECK-NEXT: vmov.f32 s13, s14 @@ -305,33 +306,33 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vstrb.8 q3, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldrd r1, r3, [sp] @ 8-byte Folded Reload -; CHECK-NEXT: cmp r1, r3 +; CHECK-NEXT: ldrd r0, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: cmp r0, r3 ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r1 -; CHECK-NEXT: mov.w r0, #-1 -; CHECK-NEXT: mov.w r3, #-2147483648 +; CHECK-NEXT: sub.w lr, r3, r0 +; CHECK-NEXT: mov.w r3, #-1 +; CHECK-NEXT: mov.w r0, #-2147483648 ; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: .LBB1_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [r12], #4 -; CHECK-NEXT: ldr r4, [r9], #4 -; CHECK-NEXT: smull r4, r1, r4, r1 -; CHECK-NEXT: asrl r4, r1, #31 -; CHECK-NEXT: subs r5, r3, r4 -; CHECK-NEXT: sbcs.w r5, r0, r1 -; CHECK-NEXT: cset r5, lt -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csel r4, r4, r3, ne -; CHECK-NEXT: csel r1, r1, r0, ne -; CHECK-NEXT: subs r5, r4, r2 -; CHECK-NEXT: sbcs r1, r1, #0 -; CHECK-NEXT: csel r1, r4, r2, lt -; CHECK-NEXT: str r1, [r11], #4 +; CHECK-NEXT: ldr r7, [r9], #4 +; CHECK-NEXT: ldr r6, [r11], #4 +; CHECK-NEXT: smull r6, r5, r6, r7 +; CHECK-NEXT: asrl r6, r5, #31 +; 
CHECK-NEXT: subs r7, r0, r6 +; CHECK-NEXT: sbcs.w r7, r3, r5 +; CHECK-NEXT: cset r7, lt +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: csel r6, r6, r0, ne +; CHECK-NEXT: csel r7, r5, r3, ne +; CHECK-NEXT: subs r5, r6, r2 +; CHECK-NEXT: sbcs r7, r7, #0 +; CHECK-NEXT: csel r7, r6, r2, lt +; CHECK-NEXT: str r7, [r10], #4 ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -434,22 +435,22 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adds r6, r3, #3 -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: bic r6, r6, #3 -; CHECK-NEXT: adr r4, .LCPI2_1 -; CHECK-NEXT: subs r6, #4 -; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: adds r5, r3, #3 +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: bic r5, r5, #3 ; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 -; CHECK-NEXT: adr r5, .LCPI2_0 -; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: mvn r8, #-2147483648 +; CHECK-NEXT: add.w lr, r4, r5, lsr #2 +; CHECK-NEXT: subs r5, r3, #1 +; CHECK-NEXT: adr r4, .LCPI2_0 +; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: adr r4, .LCPI2_1 ; CHECK-NEXT: adr r5, .LCPI2_2 -; CHECK-NEXT: subs r6, r3, #1 +; CHECK-NEXT: vldrw.u32 q2, [r4] ; CHECK-NEXT: vldrw.u32 q3, [r5] -; CHECK-NEXT: vdup.32 q1, r6 -; CHECK-NEXT: mvn r8, #-2147483648 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -482,8 +483,9 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: sbcs.w r3, r12, r7 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r7 ; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: vmov r7, s22 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: vmov r7, s22 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vmov r3, r4, d0 @@ -503,22 +505,21 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: vpsel q4, q0, q3 ; CHECK-NEXT: vmov.f32 s2, s21 ; CHECK-NEXT: smull r10, r5, r4, r3 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: asrl r10, r5, #31 ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 ; CHECK-NEXT: sbcs.w r3, r12, r5 ; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r4, r3, #0, #8 +; CHECK-NEXT: bfi r6, r3, #0, #8 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: smull r6, r3, r7, r3 -; CHECK-NEXT: asrl r6, r3, #31 -; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 -; CHECK-NEXT: vmov q0[2], q0[0], r10, r6 +; CHECK-NEXT: smull r4, r3, r7, r3 +; CHECK-NEXT: asrl r4, r3, #31 +; CHECK-NEXT: rsbs.w r7, r4, #-2147483648 +; CHECK-NEXT: vmov q0[2], q0[0], r10, r4 ; CHECK-NEXT: sbcs.w r7, r12, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 ; CHECK-NEXT: csetm r7, lt -; CHECK-NEXT: bfi r4, r7, #8, #8 -; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: bfi r6, r7, #8, #8 +; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: vpsel q0, q0, q2 ; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: subs.w r3, r3, r8 @@ -612,66 +613,66 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: @ %bb.1: @ %entry -; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: mov r9, r2 ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: bne 
.LBB3_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r11, r1 -; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: mov r11, r9 ; CHECK-NEXT: b .LBB3_6 ; CHECK-NEXT: .LBB3_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #1 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: subs r7, r5, #2 -; CHECK-NEXT: str r5, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r2, r8, r5, lsl #2 -; CHECK-NEXT: add.w r11, r1, r5, lsl #2 -; CHECK-NEXT: add.w lr, r6, r7, lsr #1 -; CHECK-NEXT: add.w r12, r0, r5, lsl #2 +; CHECK-NEXT: bic r2, r3, #1 +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: subs r6, r2, #2 +; CHECK-NEXT: str r2, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r11, r9, r2, lsl #2 +; CHECK-NEXT: add.w r12, r0, r2, lsl #2 +; CHECK-NEXT: add.w lr, r4, r6, lsr #1 +; CHECK-NEXT: add.w r6, r1, r2, lsl #2 ; CHECK-NEXT: vmov.i8 q0, #0xff ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r9, [r0], #8 +; CHECK-NEXT: ldrd r7, r2, [r0], #8 ; CHECK-NEXT: ldrd r5, r10, [r1], #8 -; CHECK-NEXT: umull r4, r5, r5, r4 -; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: subs.w r6, r4, #-1 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: csetm r5, lo -; CHECK-NEXT: bfi r6, r5, #0, #8 -; CHECK-NEXT: umull r10, r5, r10, r9 -; CHECK-NEXT: lsrl r10, r5, #31 -; CHECK-NEXT: subs.w r7, r10, #-1 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r10 +; CHECK-NEXT: umull r8, r5, r5, r7 +; CHECK-NEXT: lsrl r8, r5, #31 +; CHECK-NEXT: subs.w r7, r8, #-1 ; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: mov.w r7, #0 ; CHECK-NEXT: csetm r5, lo -; CHECK-NEXT: bfi r6, r5, #8, #8 -; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: bfi r7, r5, #0, #8 +; CHECK-NEXT: umull r4, r5, r10, r2 +; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: subs.w r2, r4, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r8, r4 +; CHECK-NEXT: sbcs r2, r5, #0 +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: bfi r7, r2, #8, #8 +; CHECK-NEXT: vmsr p0, r7 ; CHECK-NEXT: vpsel q1, q1, q0 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: strd r5, r4, [r8], #8 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: strd r4, r2, [r9], #8 ; CHECK-NEXT: le lr, .LBB3_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r7, r3 +; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: .LBB3_6: @ %for.body.preheader -; CHECK-NEXT: sub.w lr, r3, r7 +; CHECK-NEXT: sub.w lr, r3, r2 ; CHECK-NEXT: .LBB3_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 -; CHECK-NEXT: ldr r1, [r11], #4 +; CHECK-NEXT: ldr r1, [r6], #4 ; CHECK-NEXT: umull r0, r1, r1, r0 ; CHECK-NEXT: lsrl r0, r1, #31 -; CHECK-NEXT: subs.w r3, r0, #-1 +; CHECK-NEXT: subs.w r2, r0, #-1 ; CHECK-NEXT: sbcs r1, r1, #0 ; CHECK-NEXT: it hs ; CHECK-NEXT: movhs.w r0, #-1 -; CHECK-NEXT: str r0, [r2], #4 +; CHECK-NEXT: str r0, [r11], #4 ; CHECK-NEXT: le lr, .LBB3_7 ; CHECK-NEXT: .LBB3_8: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #4 @@ -767,12 +768,12 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_3: @ %vector.ph ; CHECK-NEXT: bic r8, r3, #3 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: sub.w r7, r8, #4 +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: sub.w r6, r8, #4 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: add.w r11, r2, r8, lsl #2 ; CHECK-NEXT: add.w r9, r1, r8, lsl #2 
-; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: add.w lr, r4, r6, lsr #2 ; CHECK-NEXT: add.w r12, r0, r8, lsl #2 ; CHECK-NEXT: .LBB4_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -1409,27 +1410,27 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q15(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vdup.32 q6, r3 +; CHECK-NEXT: vdup.32 q5, r3 ; CHECK-NEXT: adds r3, #8 -; CHECK-NEXT: vorr q5, q6, q0 -; CHECK-NEXT: vorr q6, q6, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q5 -; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vorr q6, q5, q0 +; CHECK-NEXT: vorr q5, q5, q4 ; CHECK-NEXT: vcmp.u32 cs, q1, q6 -; CHECK-NEXT: vmov r4, r12, d14 ; CHECK-NEXT: vpsel q6, q3, q2 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov.16 q5[1], r12 -; CHECK-NEXT: vmov r4, r12, d15 -; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov.16 q5[3], r12 +; CHECK-NEXT: vcmp.u32 cs, q1, q5 ; CHECK-NEXT: vmov r4, r12, d12 -; CHECK-NEXT: vmov.16 q5[4], r4 -; CHECK-NEXT: vmov.16 q5[5], r12 +; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vmov.16 q7[0], r4 +; CHECK-NEXT: vmov.16 q7[1], r12 ; CHECK-NEXT: vmov r4, r12, d13 -; CHECK-NEXT: vmov.16 q5[6], r4 -; CHECK-NEXT: vmov.16 q5[7], r12 -; CHECK-NEXT: vptt.i16 ne, q5, zr +; CHECK-NEXT: vmov.16 q7[2], r4 +; CHECK-NEXT: vmov.16 q7[3], r12 +; CHECK-NEXT: vmov r4, r12, d10 +; CHECK-NEXT: vmov.16 q7[4], r4 +; CHECK-NEXT: vmov.16 q7[5], r12 +; CHECK-NEXT: vmov r4, r12, d11 +; CHECK-NEXT: vmov.16 q7[6], r4 +; CHECK-NEXT: vmov.16 q7[7], r12 +; CHECK-NEXT: vptt.i16 ne, q7, zr ; CHECK-NEXT: vldrht.u16 q5, [r0], #16 ; CHECK-NEXT: vldrht.u16 q6, [r1], #16 ; CHECK-NEXT: vmullt.s16 q7, q6, q5 @@ -1521,27 +1522,27 @@ define arm_aapcs_vfpcc void @ssatmul_8ti_q15(ptr nocapture readonly %pSrcA, ptr ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vdup.32 q6, r3 +; CHECK-NEXT: vdup.32 q5, r3 ; CHECK-NEXT: adds r3, #8 -; CHECK-NEXT: vorr q5, q6, q0 -; CHECK-NEXT: vorr q6, q6, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q5 -; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vorr q6, q5, q0 +; CHECK-NEXT: vorr q5, q5, q4 ; CHECK-NEXT: vcmp.u32 cs, q1, q6 -; CHECK-NEXT: vmov r4, r12, d14 ; CHECK-NEXT: vpsel q6, q3, q2 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov.16 q5[1], r12 -; CHECK-NEXT: vmov r4, r12, d15 -; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov.16 q5[3], r12 +; CHECK-NEXT: vcmp.u32 cs, q1, q5 ; CHECK-NEXT: vmov r4, r12, d12 -; CHECK-NEXT: vmov.16 q5[4], r4 -; CHECK-NEXT: vmov.16 q5[5], r12 +; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vmov.16 q7[0], r4 +; CHECK-NEXT: vmov.16 q7[1], r12 ; CHECK-NEXT: vmov r4, r12, d13 -; CHECK-NEXT: vmov.16 q5[6], r4 -; CHECK-NEXT: vmov.16 q5[7], r12 -; CHECK-NEXT: vptt.i16 ne, q5, zr +; CHECK-NEXT: vmov.16 q7[2], r4 +; CHECK-NEXT: vmov.16 q7[3], r12 +; CHECK-NEXT: vmov r4, r12, d10 +; CHECK-NEXT: vmov.16 q7[4], r4 +; CHECK-NEXT: vmov.16 q7[5], r12 +; CHECK-NEXT: vmov r4, r12, d11 +; CHECK-NEXT: vmov.16 q7[6], r4 +; CHECK-NEXT: vmov.16 q7[7], r12 +; CHECK-NEXT: vptt.i16 ne, q7, zr ; CHECK-NEXT: vldrht.u16 q5, [r0], #16 ; CHECK-NEXT: vldrht.u16 q6, [r1], #16 ; CHECK-NEXT: vmullt.s16 q7, q6, q5 @@ -2409,27 +2410,27 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q7(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: .LBB17_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: 
vdup.32 q6, r3 +; CHECK-NEXT: vdup.32 q5, r3 ; CHECK-NEXT: adds r3, #8 -; CHECK-NEXT: vorr q5, q6, q0 -; CHECK-NEXT: vorr q6, q6, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q5 -; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vorr q6, q5, q0 +; CHECK-NEXT: vorr q5, q5, q4 ; CHECK-NEXT: vcmp.u32 cs, q1, q6 -; CHECK-NEXT: vmov r4, r12, d14 ; CHECK-NEXT: vpsel q6, q3, q2 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov.16 q5[1], r12 -; CHECK-NEXT: vmov r4, r12, d15 -; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov.16 q5[3], r12 +; CHECK-NEXT: vcmp.u32 cs, q1, q5 ; CHECK-NEXT: vmov r4, r12, d12 -; CHECK-NEXT: vmov.16 q5[4], r4 -; CHECK-NEXT: vmov.16 q5[5], r12 +; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vmov.16 q7[0], r4 +; CHECK-NEXT: vmov.16 q7[1], r12 ; CHECK-NEXT: vmov r4, r12, d13 -; CHECK-NEXT: vmov.16 q5[6], r4 -; CHECK-NEXT: vmov.16 q5[7], r12 -; CHECK-NEXT: vptt.i16 ne, q5, zr +; CHECK-NEXT: vmov.16 q7[2], r4 +; CHECK-NEXT: vmov.16 q7[3], r12 +; CHECK-NEXT: vmov r4, r12, d10 +; CHECK-NEXT: vmov.16 q7[4], r4 +; CHECK-NEXT: vmov.16 q7[5], r12 +; CHECK-NEXT: vmov r4, r12, d11 +; CHECK-NEXT: vmov.16 q7[6], r4 +; CHECK-NEXT: vmov.16 q7[7], r12 +; CHECK-NEXT: vptt.i16 ne, q7, zr ; CHECK-NEXT: vldrbt.s16 q5, [r0], #8 ; CHECK-NEXT: vldrbt.s16 q6, [r1], #8 ; CHECK-NEXT: vmul.i16 q5, q6, q5 @@ -2530,27 +2531,27 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vdup.32 q0, r5 +; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: adds r5, #16 -; CHECK-NEXT: vorr q4, q0, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q4 -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r4, r3, d8 -; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: vmov.16 q7[1], r3 -; CHECK-NEXT: vmov r3, r4, d9 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q7[2], r3 -; CHECK-NEXT: vmov.16 q7[3], r4 -; CHECK-NEXT: vorr q4, q0, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q4 -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r3, r4, d8 -; CHECK-NEXT: vmov.16 q7[4], r3 -; CHECK-NEXT: vmov.16 q7[5], r4 -; CHECK-NEXT: vmov r3, r4, d9 -; CHECK-NEXT: vmov.16 q7[6], r3 -; CHECK-NEXT: vmov.16 q7[7], r4 -; CHECK-NEXT: vcmp.i16 ne, q7, zr +; CHECK-NEXT: vorr q7, q0, q4 +; CHECK-NEXT: vcmp.u32 cs, q1, q7 +; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vmov r4, r3, d14 +; CHECK-NEXT: vmov.16 q4[0], r4 +; CHECK-NEXT: vmov.16 q4[1], r3 +; CHECK-NEXT: vmov r3, r4, d15 +; CHECK-NEXT: vorr q7, q0, q5 +; CHECK-NEXT: vmov.16 q4[2], r3 +; CHECK-NEXT: vcmp.u32 cs, q1, q7 +; CHECK-NEXT: vmov.16 q4[3], r4 +; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vmov r3, r4, d14 +; CHECK-NEXT: vmov.16 q4[4], r3 +; CHECK-NEXT: vmov.16 q4[5], r4 +; CHECK-NEXT: vmov r3, r4, d15 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vmov.16 q4[7], r4 +; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q4, q3, q2 ; CHECK-NEXT: vmov.u16 r3, q4[0] ; CHECK-NEXT: vmov.8 q7[0], r3 @@ -2572,22 +2573,22 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: vorr q4, q0, q4 ; CHECK-NEXT: vorr q0, q0, q6 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 -; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vpsel q4, q3, q2 ; CHECK-NEXT: vcmp.u32 cs, q1, q0 -; CHECK-NEXT: vmov r3, r4, d10 +; CHECK-NEXT: vmov r3, r4, d8 ; CHECK-NEXT: vpsel q0, q3, q2 -; CHECK-NEXT: vmov.16 q4[0], r3 -; CHECK-NEXT: vmov.16 q4[1], r4 -; CHECK-NEXT: vmov r3, r4, d11 -; CHECK-NEXT: vmov.16 q4[2], r3 -; 
CHECK-NEXT: vmov.16 q4[3], r4 +; CHECK-NEXT: vmov.16 q5[0], r3 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov r3, r4, d9 +; CHECK-NEXT: vmov.16 q5[2], r3 +; CHECK-NEXT: vmov.16 q5[3], r4 ; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: vmov.16 q4[4], r3 -; CHECK-NEXT: vmov.16 q4[5], r4 +; CHECK-NEXT: vmov.16 q5[4], r3 +; CHECK-NEXT: vmov.16 q5[5], r4 ; CHECK-NEXT: vmov r3, r4, d1 -; CHECK-NEXT: vmov.16 q4[6], r3 -; CHECK-NEXT: vmov.16 q4[7], r4 -; CHECK-NEXT: vcmp.i16 ne, q4, zr +; CHECK-NEXT: vmov.16 q5[6], r3 +; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q0, q3, q2 ; CHECK-NEXT: vmov.u16 r3, q0[0] ; CHECK-NEXT: vmov.8 q7[8], r3 @@ -2719,27 +2720,27 @@ define arm_aapcs_vfpcc void @ssatmul_16ti_q7(ptr nocapture readonly %pSrcA, ptr ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vdup.32 q0, r5 +; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: adds r5, #16 -; CHECK-NEXT: vorr q4, q0, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q4 -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r4, r3, d8 -; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: vmov.16 q7[1], r3 -; CHECK-NEXT: vmov r3, r4, d9 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q7[2], r3 -; CHECK-NEXT: vmov.16 q7[3], r4 -; CHECK-NEXT: vorr q4, q0, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q4 -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r3, r4, d8 -; CHECK-NEXT: vmov.16 q7[4], r3 -; CHECK-NEXT: vmov.16 q7[5], r4 -; CHECK-NEXT: vmov r3, r4, d9 -; CHECK-NEXT: vmov.16 q7[6], r3 -; CHECK-NEXT: vmov.16 q7[7], r4 -; CHECK-NEXT: vcmp.i16 ne, q7, zr +; CHECK-NEXT: vorr q7, q0, q4 +; CHECK-NEXT: vcmp.u32 cs, q1, q7 +; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vmov r4, r3, d14 +; CHECK-NEXT: vmov.16 q4[0], r4 +; CHECK-NEXT: vmov.16 q4[1], r3 +; CHECK-NEXT: vmov r3, r4, d15 +; CHECK-NEXT: vorr q7, q0, q5 +; CHECK-NEXT: vmov.16 q4[2], r3 +; CHECK-NEXT: vcmp.u32 cs, q1, q7 +; CHECK-NEXT: vmov.16 q4[3], r4 +; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vmov r3, r4, d14 +; CHECK-NEXT: vmov.16 q4[4], r3 +; CHECK-NEXT: vmov.16 q4[5], r4 +; CHECK-NEXT: vmov r3, r4, d15 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vmov.16 q4[7], r4 +; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q4, q3, q2 ; CHECK-NEXT: vmov.u16 r3, q4[0] ; CHECK-NEXT: vmov.8 q7[0], r3 @@ -2761,22 +2762,22 @@ define arm_aapcs_vfpcc void @ssatmul_16ti_q7(ptr nocapture readonly %pSrcA, ptr ; CHECK-NEXT: vorr q4, q0, q4 ; CHECK-NEXT: vorr q0, q0, q6 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 -; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vpsel q4, q3, q2 ; CHECK-NEXT: vcmp.u32 cs, q1, q0 -; CHECK-NEXT: vmov r3, r4, d10 +; CHECK-NEXT: vmov r3, r4, d8 ; CHECK-NEXT: vpsel q0, q3, q2 -; CHECK-NEXT: vmov.16 q4[0], r3 -; CHECK-NEXT: vmov.16 q4[1], r4 -; CHECK-NEXT: vmov r3, r4, d11 -; CHECK-NEXT: vmov.16 q4[2], r3 -; CHECK-NEXT: vmov.16 q4[3], r4 +; CHECK-NEXT: vmov.16 q5[0], r3 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov r3, r4, d9 +; CHECK-NEXT: vmov.16 q5[2], r3 +; CHECK-NEXT: vmov.16 q5[3], r4 ; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: vmov.16 q4[4], r3 -; CHECK-NEXT: vmov.16 q4[5], r4 +; CHECK-NEXT: vmov.16 q5[4], r3 +; CHECK-NEXT: vmov.16 q5[5], r4 ; CHECK-NEXT: vmov r3, r4, d1 -; CHECK-NEXT: vmov.16 q4[6], r3 -; CHECK-NEXT: vmov.16 q4[7], r4 -; CHECK-NEXT: vcmp.i16 ne, q4, zr +; CHECK-NEXT: vmov.16 q5[6], r3 +; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q0, q3, q2 ; CHECK-NEXT: 
vmov.u16 r3, q0[0] ; CHECK-NEXT: vmov.8 q7[8], r3 diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll index bbc0ff9bd1be58..bad57457a97f3c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -34,15 +34,15 @@ entry: define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: sadd_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds.w r12, r2, r0 -; CHECK-NEXT: vmov r0, r4, d1 -; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: adc.w r6, r3, r1 ; CHECK-NEXT: subs.w r2, r12, r2 -; CHECK-NEXT: sbcs.w r2, lr, r3 +; CHECK-NEXT: sbcs.w r2, r6, r3 +; CHECK-NEXT: vmov lr, r3, d3 ; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it mi @@ -50,21 +50,21 @@ define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: bfi r2, r1, #0, #8 -; CHECK-NEXT: vmov r1, r3, d3 -; CHECK-NEXT: adds r1, r1, r0 +; CHECK-NEXT: vmov r1, r4, d1 +; CHECK-NEXT: adds.w r0, r1, lr ; CHECK-NEXT: adc.w r5, r4, r3 -; CHECK-NEXT: subs r0, r1, r0 -; CHECK-NEXT: sbcs.w r0, r5, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r12, r1 -; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: subs r1, r0, r1 +; CHECK-NEXT: sbcs.w r1, r5, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r0 +; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it mi -; CHECK-NEXT: eormi r0, r0, #1 -; CHECK-NEXT: asr.w r1, lr, #31 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q0[3], q0[1], lr, r5 -; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: eormi r1, r1, #1 ; CHECK-NEXT: asrs r0, r5, #31 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: asrs r1, r6, #31 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 @@ -72,7 +72,7 @@ define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: veor q1, q1, q2 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI3_0: @@ -118,28 +118,28 @@ entry: define arm_aapcs_vfpcc <2 x i64> @uadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: uadd_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov lr, r12, d2 ; CHECK-NEXT: adds r5, r2, r0 -; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: adc.w r6, r3, r1 ; CHECK-NEXT: subs r2, r5, r2 -; CHECK-NEXT: sbcs.w r2, lr, r3 -; CHECK-NEXT: vmov r3, r12, d2 -; CHECK-NEXT: vmov r1, r4, d0 +; CHECK-NEXT: sbcs.w r2, r6, r3 +; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: adds r3, r3, r1 -; CHECK-NEXT: adc.w r0, r4, r12 -; CHECK-NEXT: subs r1, r3, r1 -; CHECK-NEXT: sbcs.w r1, r0, r4 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r5 -; CHECK-NEXT: csetm r1, lo -; CHECK-NEXT: vmov q1[3], q1[1], r0, lr -; CHECK-NEXT: vmov q0[2], q0[0], r1, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r2 +; CHECK-NEXT: 
adds.w r0, r3, lr +; CHECK-NEXT: adc.w r1, r4, r12 +; CHECK-NEXT: subs r3, r0, r3 +; CHECK-NEXT: sbcs.w r3, r1, r4 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r5 +; CHECK-NEXT: csetm r3, lo +; CHECK-NEXT: vmov q1[3], q1[1], r1, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 @@ -181,37 +181,37 @@ define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vmov r1, r0, d0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: subs.w r12, r1, r2 -; CHECK-NEXT: sbc.w lr, r0, r3 -; CHECK-NEXT: subs.w r1, r12, r1 -; CHECK-NEXT: sbcs.w r0, lr, r0 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: sbcs.w r2, r1, r3 -; CHECK-NEXT: it lt -; CHECK-NEXT: eorlt r0, r0, #1 -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: subs.w r12, r2, r0 +; CHECK-NEXT: sbc.w lr, r3, r1 +; CHECK-NEXT: subs.w r2, r12, r2 +; CHECK-NEXT: sbcs.w r2, lr, r3 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: subs r6, r4, r2 -; CHECK-NEXT: sbc.w r7, r5, r3 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: it lt +; CHECK-NEXT: eorlt r2, r2, #1 +; CHECK-NEXT: rsbs r0, r2, #0 +; CHECK-NEXT: vmov r1, r2, d3 +; CHECK-NEXT: subs r6, r4, r1 +; CHECK-NEXT: sbc.w r7, r5, r2 ; CHECK-NEXT: subs r4, r6, r4 ; CHECK-NEXT: sbcs.w r4, r7, r5 ; CHECK-NEXT: vmov q0[2], q0[0], r12, r6 ; CHECK-NEXT: cset r4, lt -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: sbcs.w r2, r1, r3 -; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: sbcs.w r1, r3, r2 +; CHECK-NEXT: bfi r3, r0, #0, #8 ; CHECK-NEXT: it lt ; CHECK-NEXT: eorlt r4, r4, #1 ; CHECK-NEXT: rsbs r0, r4, #0 -; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: asrs r0, r7, #31 -; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: asr.w r1, lr, #31 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], lr, r7 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 @@ -265,28 +265,28 @@ entry: define arm_aapcs_vfpcc <2 x i64> @usub_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: usub_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov lr, r12, d2 ; CHECK-NEXT: subs r5, r2, r0 -; CHECK-NEXT: sbc.w lr, r3, r1 +; CHECK-NEXT: sbc.w r6, r3, r1 ; CHECK-NEXT: subs r2, r2, r5 -; CHECK-NEXT: sbcs.w r2, r3, lr -; CHECK-NEXT: vmov r3, r12, d2 -; CHECK-NEXT: vmov r1, r4, d0 +; CHECK-NEXT: sbcs.w r2, r3, r6 +; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: csetm r2, lo -; CHECK-NEXT: subs r3, r1, r3 -; CHECK-NEXT: sbc.w r0, r4, r12 -; CHECK-NEXT: subs r1, r1, r3 -; CHECK-NEXT: sbcs.w r1, r4, r0 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r5 -; CHECK-NEXT: csetm r1, lo -; CHECK-NEXT: vmov q1[3], q1[1], r0, lr -; CHECK-NEXT: vmov q0[2], q0[0], r1, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r2 +; CHECK-NEXT: subs.w r0, r3, lr +; CHECK-NEXT: sbc.w r1, r4, r12 +; CHECK-NEXT: subs r3, r3, r0 +; 
CHECK-NEXT: sbcs.w r3, r4, r1 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r5 +; CHECK-NEXT: csetm r3, lo +; CHECK-NEXT: vmov q1[3], q1[1], r1, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vbic q0, q1, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll index e845070d579045..7fa70146616f0d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll @@ -60,14 +60,12 @@ define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, ptr %dst, <8 define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, ptr %dst, <16 x i32> %offs) { ; CHECK-LABEL: scatter_inc_mini_16i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: movs r1, #16 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r1 -; CHECK-NEXT: add.w r12, sp, #32 +; CHECK-NEXT: add.w r12, sp, #24 ; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: vadd.i32 q3, q3, r0 ; CHECK-NEXT: vmov lr, r5, d3 @@ -77,32 +75,32 @@ define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, ptr %dst, <1 ; CHECK-NEXT: vmov r4, r12, d4 ; CHECK-NEXT: vmov.u8 r6, q0[0] ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, r8, d5 +; CHECK-NEXT: vmov r0, r7, d5 ; CHECK-NEXT: vadd.i32 q3, q3, r1 ; CHECK-NEXT: vadd.i32 q1, q1, r1 ; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: vmov.u8 r7, q0[6] ; CHECK-NEXT: strb r6, [r2] ; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: strb r2, [r3] ; CHECK-NEXT: vmov.u8 r6, q0[2] -; CHECK-NEXT: vmov r2, r9, d6 +; CHECK-NEXT: vmov r2, r8, d6 ; CHECK-NEXT: strb.w r6, [lr] ; CHECK-NEXT: vmov.u8 r6, q0[3] -; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov.u8 r3, q0[6] ; CHECK-NEXT: strb r6, [r5] ; CHECK-NEXT: vmov r6, r5, d7 ; CHECK-NEXT: strb r1, [r4] ; CHECK-NEXT: vmov.u8 r1, q0[5] ; CHECK-NEXT: strb.w r1, [r12] ; CHECK-NEXT: vmov r1, r4, d2 -; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: strb r3, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: strb.w r0, [r8] -; CHECK-NEXT: vmov r0, r7, d3 -; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: strb r0, [r7] +; CHECK-NEXT: vmov r0, r3, d3 +; CHECK-NEXT: vmov.u8 r7, q0[8] +; CHECK-NEXT: strb r7, [r2] ; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: strb.w r2, [r9] +; CHECK-NEXT: strb.w r2, [r8] ; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: strb r2, [r6] ; CHECK-NEXT: vmov.u8 r2, q0[11] @@ -114,9 +112,8 @@ define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, ptr %dst, <1 ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: strb r0, [r7] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: strb r0, [r3] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} %1 = add <16 x i32> %offs, %2 = getelementptr inbounds i8, ptr %dst, <16 x i32> %1 call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %data, <16 x ptr> %2, i32 2, <16 x i1> ) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll index 5db402bb4d0f83..0c5590314617a6 100644 --- 
a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -93,15 +93,15 @@ define arm_aapcs_vfpcc void @scaled_v8f16_sext(ptr %base, ptr %offptr, <8 x half ; CHECK-LABEL: scaled_v8f16_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vshl.i32 q2, q1, #1 -; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] ; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, r2, d2 ; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vstr.16 s0, [r2] -; CHECK-NEXT: vmov r1, r2, d5 +; CHECK-NEXT: vmov r1, r2, d3 +; CHECK-NEXT: vshl.i32 q1, q2, #1 ; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vstr.16 s1, [r1] @@ -183,28 +183,28 @@ define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep(ptr %base, ptr %offptr, <8 x ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r12 ; CHECK-NEXT: vmov r3, r2, d2 -; CHECK-NEXT: vmov lr, r5, d3 +; CHECK-NEXT: vmov lr, r4, d3 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r12 ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov r4, r12, d3 +; CHECK-NEXT: vmov r12, r5, d3 ; CHECK-NEXT: strh r6, [r3] ; CHECK-NEXT: vmov.u16 r3, q0[1] ; CHECK-NEXT: strh r3, [r2] ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: strh.w r2, [lr] ; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: strh r2, [r5] +; CHECK-NEXT: strh r2, [r4] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: strh r2, [r0] ; CHECK-NEXT: vmov.u16 r0, q0[5] ; CHECK-NEXT: strh r0, [r1] ; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: strh r0, [r4] -; CHECK-NEXT: vmov.u16 r0, q0[7] ; CHECK-NEXT: strh.w r0, [r12] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: strh r0, [r5] ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i16>, ptr %offptr, align 2 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll index c7fdcd83479fb3..a3d618dea43cf6 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll @@ -335,19 +335,19 @@ define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(ptr %base, ptr %offptr ; CHECK-NEXT: vmov r12, lr, d5 ; CHECK-NEXT: vldrb.s32 q2, [r1, #4] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, r6, d1 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: strh r4, [r2] -; CHECK-NEXT: vmov r2, r7, d4 +; CHECK-NEXT: vmov r2, r6, d4 ; CHECK-NEXT: strh r5, [r3] -; CHECK-NEXT: vmov r3, r5, d5 +; CHECK-NEXT: vmov r3, r7, d5 ; CHECK-NEXT: strh.w r0, [r12] -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: strh.w r6, [lr] -; CHECK-NEXT: vmov r6, r4, d3 +; CHECK-NEXT: vmov r0, r4, d2 +; CHECK-NEXT: strh.w r1, [lr] +; CHECK-NEXT: vmov r1, r5, d3 ; CHECK-NEXT: strh r0, [r2] -; CHECK-NEXT: strh r1, [r7] -; CHECK-NEXT: strh r6, [r3] -; CHECK-NEXT: strh r4, [r5] +; CHECK-NEXT: strh r4, [r6] +; CHECK-NEXT: strh r1, [r3] +; CHECK-NEXT: strh r5, [r7] ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i8>, ptr %offptr, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll index 87df13787c6c8a..9f7228863bc68d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -59,57 
+59,57 @@ entry: define arm_aapcs_vfpcc void @unscaled_v16i8_sext(ptr %base, ptr %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vldrb.s32 q1, [r1] ; CHECK-NEXT: vldrb.s32 q3, [r1, #8] ; CHECK-NEXT: vmov.u8 r6, q0[0] -; CHECK-NEXT: vmov.u8 r5, q0[4] ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 ; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vmov.u8 r7, q0[6] ; CHECK-NEXT: vmov r12, lr, d3 ; CHECK-NEXT: vldrb.s32 q1, [r1, #4] ; CHECK-NEXT: vadd.i32 q2, q1, r0 ; CHECK-NEXT: vldrb.s32 q1, [r1, #12] -; CHECK-NEXT: vmov r4, r8, d4 +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, r9, d5 +; CHECK-NEXT: vmov r0, r7, d5 ; CHECK-NEXT: strb r6, [r2] ; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: strb r2, [r3] ; CHECK-NEXT: vmov.u8 r6, q0[2] -; CHECK-NEXT: vmov r2, r10, d6 +; CHECK-NEXT: vmov r2, r8, d6 ; CHECK-NEXT: strb.w r6, [r12] ; CHECK-NEXT: vmov.u8 r6, q0[3] -; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov.u8 r3, q0[4] ; CHECK-NEXT: strb.w r6, [lr] ; CHECK-NEXT: vmov r6, r1, d7 -; CHECK-NEXT: strb r5, [r4] -; CHECK-NEXT: vmov.u8 r5, q0[5] -; CHECK-NEXT: strb.w r5, [r8] -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: strb r3, [r4] +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: strb r3, [r5] +; CHECK-NEXT: vmov.u8 r4, q0[6] +; CHECK-NEXT: vmov r3, r5, d2 +; CHECK-NEXT: strb r4, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: strb.w r0, [r9] +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: strb r0, [r7] ; CHECK-NEXT: vmov r0, r7, d3 -; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: strb r4, [r2] ; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: strb.w r2, [r10] +; CHECK-NEXT: strb.w r2, [r8] ; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: strb r2, [r6] ; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r5] +; CHECK-NEXT: strb r1, [r3] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r4] +; CHECK-NEXT: strb r1, [r5] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[15] ; CHECK-NEXT: strb r0, [r7] -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %offs = load <16 x i8>, ptr %offptr, align 1 %offs.sext = sext <16 x i8> %offs to <16 x i32> @@ -122,57 +122,57 @@ entry: define arm_aapcs_vfpcc void @unscaled_v16i8_i16(ptr %base, ptr %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vldrh.s32 q1, [r1] ; CHECK-NEXT: vldrh.s32 q3, [r1, #16] ; CHECK-NEXT: vmov.u8 r6, q0[0] -; CHECK-NEXT: vmov.u8 r5, q0[4] ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 ; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vmov.u8 r7, q0[6] ; CHECK-NEXT: vmov r12, lr, d3 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vadd.i32 q2, q1, r0 ; CHECK-NEXT: vldrh.s32 q1, [r1, #24] -; CHECK-NEXT: vmov r4, r8, d4 +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, r9, d5 +; CHECK-NEXT: vmov r0, r7, d5 ; CHECK-NEXT: 
strb r6, [r2] ; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: strb r2, [r3] ; CHECK-NEXT: vmov.u8 r6, q0[2] -; CHECK-NEXT: vmov r2, r10, d6 +; CHECK-NEXT: vmov r2, r8, d6 ; CHECK-NEXT: strb.w r6, [r12] ; CHECK-NEXT: vmov.u8 r6, q0[3] -; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov.u8 r3, q0[4] ; CHECK-NEXT: strb.w r6, [lr] ; CHECK-NEXT: vmov r6, r1, d7 -; CHECK-NEXT: strb r5, [r4] -; CHECK-NEXT: vmov.u8 r5, q0[5] -; CHECK-NEXT: strb.w r5, [r8] -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: strb r3, [r4] +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: strb r3, [r5] +; CHECK-NEXT: vmov.u8 r4, q0[6] +; CHECK-NEXT: vmov r3, r5, d2 +; CHECK-NEXT: strb r4, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: strb.w r0, [r9] +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: strb r0, [r7] ; CHECK-NEXT: vmov r0, r7, d3 -; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: strb r4, [r2] ; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: strb.w r2, [r10] +; CHECK-NEXT: strb.w r2, [r8] ; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: strb r2, [r6] ; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r5] +; CHECK-NEXT: strb r1, [r3] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r4] +; CHECK-NEXT: strb r1, [r5] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[15] ; CHECK-NEXT: strb r0, [r7] -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %offs = load <16 x i16>, ptr %offptr, align 2 %offs.sext = sext <16 x i16> %offs to <16 x i32> @@ -253,57 +253,57 @@ entry: define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(ptr %base, ptr %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_i8_next: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vldrw.u32 q3, [r1, #32] ; CHECK-NEXT: vmov.u8 r6, q0[0] -; CHECK-NEXT: vmov.u8 r5, q0[4] ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 ; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vmov.u8 r7, q0[6] ; CHECK-NEXT: vmov r12, lr, d3 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] ; CHECK-NEXT: vadd.i32 q2, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r1, #48] -; CHECK-NEXT: vmov r4, r8, d4 +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, r9, d5 +; CHECK-NEXT: vmov r0, r7, d5 ; CHECK-NEXT: strb r6, [r2] ; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: strb r2, [r3] ; CHECK-NEXT: vmov.u8 r6, q0[2] -; CHECK-NEXT: vmov r2, r10, d6 +; CHECK-NEXT: vmov r2, r8, d6 ; CHECK-NEXT: strb.w r6, [r12] ; CHECK-NEXT: vmov.u8 r6, q0[3] -; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov.u8 r3, q0[4] ; CHECK-NEXT: strb.w r6, [lr] ; CHECK-NEXT: vmov r6, r1, d7 -; CHECK-NEXT: strb r5, [r4] -; CHECK-NEXT: vmov.u8 r5, q0[5] -; CHECK-NEXT: strb.w r5, [r8] -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: strb r3, [r4] +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: strb r3, [r5] +; CHECK-NEXT: vmov.u8 r4, q0[6] +; CHECK-NEXT: vmov r3, r5, d2 +; CHECK-NEXT: strb r4, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: strb.w r0, [r9] +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: strb r0, [r7] ; CHECK-NEXT: vmov r0, r7, d3 -; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: strb r4, [r2] ; CHECK-NEXT: vmov.u8 r2, q0[9] -; 
CHECK-NEXT: strb.w r2, [r10] +; CHECK-NEXT: strb.w r2, [r8] ; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: strb r2, [r6] ; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r5] +; CHECK-NEXT: strb r1, [r3] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r4] +; CHECK-NEXT: strb r1, [r5] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[15] ; CHECK-NEXT: strb r0, [r7] -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %offs = load <16 x i32>, ptr %offptr, align 4 %ptrs = getelementptr inbounds i8, ptr %base, <16 x i32> %offs diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll index 8f969b8ad4c61a..9249924d4f884d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll @@ -45,17 +45,17 @@ define arm_aapcs_vfpcc void @ptr_v8i32(<8 x i32> %v, ptr %offptr) { ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vmov r0, r5, d1 ; CHECK-NEXT: str r3, [r1] -; CHECK-NEXT: vmov r1, r7, d4 +; CHECK-NEXT: vmov r1, r6, d4 ; CHECK-NEXT: str r4, [r2] -; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: vmov r2, r7, d5 ; CHECK-NEXT: str.w r0, [lr] ; CHECK-NEXT: vmov r0, r3, d2 ; CHECK-NEXT: str.w r5, [r12] -; CHECK-NEXT: vmov r5, r6, d3 +; CHECK-NEXT: vmov r5, r4, d3 ; CHECK-NEXT: str r0, [r1] -; CHECK-NEXT: str r3, [r7] +; CHECK-NEXT: str r3, [r6] ; CHECK-NEXT: str r5, [r2] -; CHECK-NEXT: str r6, [r4] +; CHECK-NEXT: str r4, [r7] ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x ptr>, ptr %offptr, align 4 @@ -154,17 +154,17 @@ define arm_aapcs_vfpcc void @ptr_v8f32(<8 x float> %v, ptr %offptr) { ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov r1, lr, d4 +; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: vmov r3, r12, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov r0, r2, d4 +; CHECK-NEXT: vmov r0, lr, d4 ; CHECK-NEXT: vmov r4, r5, d5 ; CHECK-NEXT: vstr s0, [r1] -; CHECK-NEXT: vstr s1, [lr] +; CHECK-NEXT: vstr s1, [r2] ; CHECK-NEXT: vstr s2, [r3] ; CHECK-NEXT: vstr s3, [r12] ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: vstr s5, [r2] +; CHECK-NEXT: vstr s5, [lr] ; CHECK-NEXT: vstr s6, [r4] ; CHECK-NEXT: vstr s7, [r5] ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -271,17 +271,17 @@ define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, ptr %offptr) { ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vmov r0, r5, d1 ; CHECK-NEXT: strh r3, [r1] -; CHECK-NEXT: vmov r1, r7, d4 +; CHECK-NEXT: vmov r1, r6, d4 ; CHECK-NEXT: strh r4, [r2] -; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: vmov r2, r7, d5 ; CHECK-NEXT: strh.w r0, [lr] ; CHECK-NEXT: vmov r0, r3, d2 ; CHECK-NEXT: strh.w r5, [r12] -; CHECK-NEXT: vmov r5, r6, d3 +; CHECK-NEXT: vmov r5, r4, d3 ; CHECK-NEXT: strh r0, [r1] -; CHECK-NEXT: strh r3, [r7] +; CHECK-NEXT: strh r3, [r6] ; CHECK-NEXT: strh r5, [r2] -; CHECK-NEXT: strh r6, [r4] +; CHECK-NEXT: strh r4, [r7] ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x ptr>, ptr %offptr, align 4 @@ -370,11 +370,11 @@ define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, ptr %offptr) { ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vmov.u8 r6, q0[0] ; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vmov.u8 r5, q0[4] +; CHECK-NEXT: vmov.u8 r5, q0[6] ; CHECK-NEXT: vmov r3, r12, d3 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vmov lr, r4, d4 -; 
CHECK-NEXT: vmov.u8 r7, q0[6] +; CHECK-NEXT: vmov.u8 r7, q0[8] ; CHECK-NEXT: vmov r0, r8, d5 ; CHECK-NEXT: strb r6, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[1] @@ -383,18 +383,18 @@ define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, ptr %offptr) { ; CHECK-NEXT: vmov r1, r9, d6 ; CHECK-NEXT: strb r6, [r3] ; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vmov.u8 r2, q0[4] ; CHECK-NEXT: strb.w r3, [r12] ; CHECK-NEXT: vmov r3, r6, d7 -; CHECK-NEXT: strb.w r5, [lr] -; CHECK-NEXT: vmov.u8 r5, q0[5] -; CHECK-NEXT: strb r5, [r4] -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: strb.w r2, [lr] +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: strb r2, [r4] +; CHECK-NEXT: vmov r2, r4, d2 +; CHECK-NEXT: strb r5, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[7] ; CHECK-NEXT: strb.w r0, [r8] -; CHECK-NEXT: vmov r0, r7, d3 -; CHECK-NEXT: strb r2, [r1] +; CHECK-NEXT: vmov r0, r5, d3 +; CHECK-NEXT: strb r7, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[9] ; CHECK-NEXT: strb.w r1, [r9] ; CHECK-NEXT: vmov.u8 r1, q0[10] @@ -402,13 +402,13 @@ define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, ptr %offptr) { ; CHECK-NEXT: vmov.u8 r1, q0[11] ; CHECK-NEXT: strb r1, [r6] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r5] +; CHECK-NEXT: strb r1, [r2] ; CHECK-NEXT: vmov.u8 r1, q0[13] ; CHECK-NEXT: strb r1, [r4] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: strb r0, [r7] +; CHECK-NEXT: strb r0, [r5] ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %offs = load <16 x ptr>, ptr %offptr, align 4 @@ -479,17 +479,17 @@ define arm_aapcs_vfpcc void @ptr_v8i8_trunc32(<8 x i32> %v, ptr %offptr) { ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vmov r0, r5, d1 ; CHECK-NEXT: strb r3, [r1] -; CHECK-NEXT: vmov r1, r7, d4 +; CHECK-NEXT: vmov r1, r6, d4 ; CHECK-NEXT: strb r4, [r2] -; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: vmov r2, r7, d5 ; CHECK-NEXT: strb.w r0, [lr] ; CHECK-NEXT: vmov r0, r3, d2 ; CHECK-NEXT: strb.w r5, [r12] -; CHECK-NEXT: vmov r5, r6, d3 +; CHECK-NEXT: vmov r5, r4, d3 ; CHECK-NEXT: strb r0, [r1] -; CHECK-NEXT: strb r3, [r7] +; CHECK-NEXT: strb r3, [r6] ; CHECK-NEXT: strb r5, [r2] -; CHECK-NEXT: strb r6, [r4] +; CHECK-NEXT: strb r4, [r7] ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x ptr>, ptr %offptr, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index f4643f8c6c4a1f..3d210e015c36fb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV,CHECKFP -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS,CHECKFP +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp 
-verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECKFP +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECKFP define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) { ; CHECK-LABEL: shuffle1_i32: @@ -225,15 +225,15 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { ; CHECK-LABEL: shuffle3_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vins.f16 s5, s4 -; CHECK-NEXT: vins.f16 s2, s0 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmovx.f16 s1, s7 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: vmovx.f16 s5, s3 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vins.f16 s5, s3 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -480,39 +480,39 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shuffle3_i8(<16 x i8> %src) { ; CHECK-LABEL: shuffle3_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.8 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.8 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.8 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.8 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.8 q1[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.8 q1[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.8 q1[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.8 q1[7], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.8 q1[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.8 q1[9], r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.8 q1[10], r0 +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.8 q1[11], r0 +; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.8 q1[12], r0 +; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.8 q1[13], r0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.8 q1[14], r0 +; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.8 q1[15], r0 +; 
CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> @@ -1147,15 +1147,15 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { ; CHECK-LABEL: shuffle3_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vins.f16 s5, s4 -; CHECK-NEXT: vins.f16 s2, s0 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmovx.f16 s1, s7 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: vmovx.f16 s5, s3 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vins.f16 s5, s3 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> @@ -1467,47 +1467,27 @@ entry: ret <2 x double> %out } define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) { -; CHECK-LV-LABEL: shuffle9_f64: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LV-NEXT: vmov q5, q2 -; CHECK-LV-NEXT: vmov.f32 s16, s0 -; CHECK-LV-NEXT: vmov.f32 s18, s20 -; CHECK-LV-NEXT: vmov.f32 s20, s2 -; CHECK-LV-NEXT: vmov.f32 s10, s12 -; CHECK-LV-NEXT: vmov.f32 s19, s21 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s17, s1 -; CHECK-LV-NEXT: vmov.f32 s21, s3 -; CHECK-LV-NEXT: vmov q0, q4 -; CHECK-LV-NEXT: vmov.f32 s12, s6 -; CHECK-LV-NEXT: vmov.f32 s11, s13 -; CHECK-LV-NEXT: vmov.f32 s9, s5 -; CHECK-LV-NEXT: vmov.f32 s13, s7 -; CHECK-LV-NEXT: vmov q1, q5 -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: shuffle9_f64: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vmov q5, q2 -; CHECK-LIS-NEXT: vmov q4, q0 -; CHECK-LIS-NEXT: vmov.f32 s2, s20 -; CHECK-LIS-NEXT: vmov.f32 s20, s18 -; CHECK-LIS-NEXT: vmov.f32 s10, s12 -; CHECK-LIS-NEXT: vmov.f32 s3, s21 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s21, s19 -; CHECK-LIS-NEXT: vmov.f32 s12, s6 -; CHECK-LIS-NEXT: vmov.f32 s11, s13 -; CHECK-LIS-NEXT: vmov.f32 s9, s5 -; CHECK-LIS-NEXT: vmov.f32 s13, s7 -; CHECK-LIS-NEXT: vmov q1, q5 -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: shuffle9_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov q5, q2 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vmov.f32 s20, s2 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s19, s21 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s21, s3 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s11, s13 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr entry: %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> ret <8 x double> %out @@ -1580,47 +1560,27 @@ entry: ret <2 x i64> %out } define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) { -; CHECK-LV-LABEL: shuffle9_i64: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LV-NEXT: vmov q5, 
q2 -; CHECK-LV-NEXT: vmov.f32 s16, s0 -; CHECK-LV-NEXT: vmov.f32 s18, s20 -; CHECK-LV-NEXT: vmov.f32 s20, s2 -; CHECK-LV-NEXT: vmov.f32 s10, s12 -; CHECK-LV-NEXT: vmov.f32 s19, s21 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s17, s1 -; CHECK-LV-NEXT: vmov.f32 s21, s3 -; CHECK-LV-NEXT: vmov q0, q4 -; CHECK-LV-NEXT: vmov.f32 s12, s6 -; CHECK-LV-NEXT: vmov.f32 s11, s13 -; CHECK-LV-NEXT: vmov.f32 s9, s5 -; CHECK-LV-NEXT: vmov.f32 s13, s7 -; CHECK-LV-NEXT: vmov q1, q5 -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: shuffle9_i64: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vmov q5, q2 -; CHECK-LIS-NEXT: vmov q4, q0 -; CHECK-LIS-NEXT: vmov.f32 s2, s20 -; CHECK-LIS-NEXT: vmov.f32 s20, s18 -; CHECK-LIS-NEXT: vmov.f32 s10, s12 -; CHECK-LIS-NEXT: vmov.f32 s3, s21 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s21, s19 -; CHECK-LIS-NEXT: vmov.f32 s12, s6 -; CHECK-LIS-NEXT: vmov.f32 s11, s13 -; CHECK-LIS-NEXT: vmov.f32 s9, s5 -; CHECK-LIS-NEXT: vmov.f32 s13, s7 -; CHECK-LIS-NEXT: vmov q1, q5 -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: shuffle9_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov q5, q2 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vmov.f32 s20, s2 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s19, s21 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s21, s3 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s11, s13 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr entry: %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> ret <8 x i64> %out @@ -1859,4 +1819,3 @@ entry: %res = extractelement <2 x double> %a, i32 1 ret double %res } - diff --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll index 6ce75500142964..ce140919056370 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll @@ -134,39 +134,39 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_efcdab8967452301(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_efcdab8967452301: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 
r0, q1[1] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.8 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.8 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.8 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.8 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.8 q1[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.8 q1[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.8 q1[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.8 q1[7], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.8 q1[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.8 q1[9], r0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.8 q1[10], r0 +; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.8 q1[11], r0 +; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.8 q1[12], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.8 q1[13], r0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.8 q1[14], r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.8 q1[15], r0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll index b3f7b7d961ad02..224d535f89f9f5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll @@ -37,16 +37,16 @@ define arm_aapcs_vfpcc <2 x i64> @add_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r0, r12, d3 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r1, r0, d2 +; CHECK-NEXT: vmov r1, lr, d2 ; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: adds.w r2, r2, lr -; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r2, r3, r12 ; CHECK-NEXT: adds r1, r1, r4 -; CHECK-NEXT: adcs r0, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r3 +; CHECK-NEXT: adc.w r3, r5, lr +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = add nsw <2 x i64> %src1, %src2 @@ -112,13 +112,13 @@ define arm_aapcs_vfpcc <2 x double> @add_float64_t(<2 x double> %src1, <2 x doub ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d11 -; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d11 ; CHECK-NEXT: bl __aeabi_dadd -; CHECK-NEXT: vmov lr, r12, d10 -; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: vmov lr, r12, d8 +; CHECK-NEXT: vmov r2, r3, d10 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: mov r0, lr ; CHECK-NEXT: mov r1, r12 @@ -168,16 +168,16 @@ define arm_aapcs_vfpcc <2 x i64> @sub_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r0, r12, d1 ; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: vmov r1, r0, d0 +; CHECK-NEXT: vmov r1, lr, d0 ; CHECK-NEXT: vmov r4, r5, d2 -; CHECK-NEXT: subs.w r2, r2, lr -; CHECK-NEXT: sbc.w r3, r3, r12 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbc.w r2, r3, r12 ; CHECK-NEXT: subs r1, r4, r1 -; CHECK-NEXT: sbc.w r0, r5, r0 -; CHECK-NEXT: 
vmov q0[2], q0[0], r1, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r3 +; CHECK-NEXT: sbc.w r3, r5, lr +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = sub nsw <2 x i64> %src2, %src1 @@ -243,13 +243,13 @@ define arm_aapcs_vfpcc <2 x double> @sub_float64_t(<2 x double> %src1, <2 x doub ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d11 -; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d11 ; CHECK-NEXT: bl __aeabi_dsub -; CHECK-NEXT: vmov lr, r12, d10 -; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: vmov lr, r12, d8 +; CHECK-NEXT: vmov r2, r3, d10 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: mov r0, lr ; CHECK-NEXT: mov r1, r12 @@ -376,13 +376,13 @@ define arm_aapcs_vfpcc <2 x double> @mul_float64_t(<2 x double> %src1, <2 x doub ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d11 -; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d11 ; CHECK-NEXT: bl __aeabi_dmul -; CHECK-NEXT: vmov lr, r12, d10 -; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: vmov lr, r12, d8 +; CHECK-NEXT: vmov r2, r3, d10 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: mov r0, lr ; CHECK-NEXT: mov r1, r12 diff --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll index 4b76906034057f..03fa7f6bb8c053 100644 --- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll @@ -241,10 +241,10 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: add r0, sp, #64 -; CHECK-BE-NEXT: vldrh.u16 q6, [r0] +; CHECK-BE-NEXT: vldrh.u16 q5, [r0] ; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q4, q0 -; CHECK-BE-NEXT: vmov.u16 r0, q6[0] +; CHECK-BE-NEXT: vmov.u16 r0, q5[0] ; CHECK-BE-NEXT: vmov.u16 r4, q4[0] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 @@ -253,8 +253,8 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: bl __aeabi_f2h -; CHECK-BE-NEXT: vmov.16 q5[0], r0 -; CHECK-BE-NEXT: vmov.u16 r0, q6[1] +; CHECK-BE-NEXT: vmov.16 q6[0], r0 +; CHECK-BE-NEXT: vmov.u16 r0, q5[1] ; CHECK-BE-NEXT: vmov.u16 r4, q4[1] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 @@ -263,8 +263,8 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: bl __aeabi_f2h -; CHECK-BE-NEXT: vmov.16 q5[1], r0 -; CHECK-BE-NEXT: vmov.u16 r0, q6[2] +; CHECK-BE-NEXT: vmov.16 q6[1], r0 +; CHECK-BE-NEXT: vmov.u16 r0, q5[2] ; CHECK-BE-NEXT: vmov.u16 r4, q4[2] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 @@ -273,8 +273,8 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: bl __aeabi_f2h -; CHECK-BE-NEXT: vmov.16 q5[2], r0 -; CHECK-BE-NEXT: vmov.u16 r0, q6[3] +; CHECK-BE-NEXT: vmov.16 q6[2], r0 +; 
CHECK-BE-NEXT: vmov.u16 r0, q5[3] ; CHECK-BE-NEXT: vmov.u16 r4, q4[3] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 @@ -283,8 +283,8 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: bl __aeabi_f2h -; CHECK-BE-NEXT: vmov.16 q5[3], r0 -; CHECK-BE-NEXT: vmov.u16 r0, q6[4] +; CHECK-BE-NEXT: vmov.16 q6[3], r0 +; CHECK-BE-NEXT: vmov.u16 r0, q5[4] ; CHECK-BE-NEXT: vmov.u16 r4, q4[4] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 @@ -293,8 +293,8 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: bl __aeabi_f2h -; CHECK-BE-NEXT: vmov.16 q5[4], r0 -; CHECK-BE-NEXT: vmov.u16 r0, q6[5] +; CHECK-BE-NEXT: vmov.16 q6[4], r0 +; CHECK-BE-NEXT: vmov.u16 r0, q5[5] ; CHECK-BE-NEXT: vmov.u16 r4, q4[5] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 @@ -303,8 +303,8 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: bl __aeabi_f2h -; CHECK-BE-NEXT: vmov.16 q5[5], r0 -; CHECK-BE-NEXT: vmov.u16 r0, q6[6] +; CHECK-BE-NEXT: vmov.16 q6[5], r0 +; CHECK-BE-NEXT: vmov.u16 r0, q5[6] ; CHECK-BE-NEXT: vmov.u16 r4, q4[6] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 @@ -313,8 +313,8 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: bl __aeabi_f2h -; CHECK-BE-NEXT: vmov.16 q5[6], r0 -; CHECK-BE-NEXT: vmov.u16 r0, q6[7] +; CHECK-BE-NEXT: vmov.16 q6[6], r0 +; CHECK-BE-NEXT: vmov.u16 r0, q5[7] ; CHECK-BE-NEXT: vmov.u16 r4, q4[7] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 @@ -323,8 +323,8 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: bl __aeabi_f2h -; CHECK-BE-NEXT: vmov.16 q5[7], r0 -; CHECK-BE-NEXT: vrev64.16 q0, q5 +; CHECK-BE-NEXT: vmov.16 q6[7], r0 +; CHECK-BE-NEXT: vrev64.16 q0, q6 ; CHECK-BE-NEXT: vmov r1, r0, d0 ; CHECK-BE-NEXT: vmov r3, r2, d1 ; CHECK-BE-NEXT: vpop {d8, d9, d10, d11, d12, d13} @@ -354,25 +354,25 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK-MVE-NEXT: sub sp, #4 ; CHECK-MVE-NEXT: .vsave {d8, d9} ; CHECK-MVE-NEXT: vpush {d8, d9} -; CHECK-MVE-NEXT: mov r4, r0 +; CHECK-MVE-NEXT: mov r6, r0 ; CHECK-MVE-NEXT: add r0, sp, #40 ; CHECK-MVE-NEXT: vldrw.u32 q4, [r0] -; CHECK-MVE-NEXT: mov r6, r1 +; CHECK-MVE-NEXT: mov r5, r1 ; CHECK-MVE-NEXT: mov r0, r3 -; CHECK-MVE-NEXT: mov r5, r2 +; CHECK-MVE-NEXT: mov r4, r2 ; CHECK-MVE-NEXT: vmov r7, r1, d9 ; CHECK-MVE-NEXT: bl __aeabi_fadd ; CHECK-MVE-NEXT: vmov s19, r0 -; CHECK-MVE-NEXT: mov r0, r5 +; CHECK-MVE-NEXT: mov r0, r4 ; CHECK-MVE-NEXT: mov r1, r7 ; CHECK-MVE-NEXT: bl __aeabi_fadd -; CHECK-MVE-NEXT: vmov r5, r1, d8 +; CHECK-MVE-NEXT: vmov r4, r1, d8 ; CHECK-MVE-NEXT: vmov s18, r0 -; CHECK-MVE-NEXT: mov r0, r6 +; CHECK-MVE-NEXT: mov r0, r5 ; CHECK-MVE-NEXT: bl __aeabi_fadd ; CHECK-MVE-NEXT: vmov s17, r0 -; CHECK-MVE-NEXT: mov r0, r4 -; CHECK-MVE-NEXT: mov r1, r5 +; CHECK-MVE-NEXT: mov r0, r6 +; CHECK-MVE-NEXT: mov r1, r4 ; CHECK-MVE-NEXT: bl __aeabi_fadd ; CHECK-MVE-NEXT: vmov s16, r0 ; CHECK-MVE-NEXT: vmov r2, r3, d9 diff --git a/llvm/test/CodeGen/Thumb2/mve-vabd.ll b/llvm/test/CodeGen/Thumb2/mve-vabd.ll index f209a76d82e804..43ca6acbe4bbb6 100644 --- 
a/llvm/test/CodeGen/Thumb2/mve-vabd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabd.ll @@ -14,26 +14,26 @@ define arm_aapcs_vfpcc void @vabd_v4f32(<4 x float> %x, <4 x float> %y, ptr %z) ; CHECK-MVE-NEXT: vmov q4, q1 ; CHECK-MVE-NEXT: vmov q5, q0 ; CHECK-MVE-NEXT: mov r8, r0 -; CHECK-MVE-NEXT: vmov r0, r6, d10 -; CHECK-MVE-NEXT: vmov r1, r7, d8 +; CHECK-MVE-NEXT: vmov r0, r5, d10 +; CHECK-MVE-NEXT: vmov r1, r6, d8 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: mov r9, r0 -; CHECK-MVE-NEXT: mov r0, r6 -; CHECK-MVE-NEXT: mov r1, r7 +; CHECK-MVE-NEXT: mov r0, r5 +; CHECK-MVE-NEXT: mov r1, r6 ; CHECK-MVE-NEXT: bl __aeabi_fsub -; CHECK-MVE-NEXT: mov r6, r0 -; CHECK-MVE-NEXT: vmov r0, r7, d11 +; CHECK-MVE-NEXT: mov r5, r0 +; CHECK-MVE-NEXT: vmov r0, r6, d11 ; CHECK-MVE-NEXT: vmov r1, r4, d9 ; CHECK-MVE-NEXT: bl __aeabi_fsub -; CHECK-MVE-NEXT: mov r5, r0 -; CHECK-MVE-NEXT: mov r0, r7 +; CHECK-MVE-NEXT: mov r7, r0 +; CHECK-MVE-NEXT: mov r0, r6 ; CHECK-MVE-NEXT: mov r1, r4 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 ; CHECK-MVE-NEXT: vmov s3, r0 -; CHECK-MVE-NEXT: bic r0, r5, #-2147483648 +; CHECK-MVE-NEXT: bic r0, r7, #-2147483648 ; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: bic r0, r6, #-2147483648 +; CHECK-MVE-NEXT: bic r0, r5, #-2147483648 ; CHECK-MVE-NEXT: vmov s1, r0 ; CHECK-MVE-NEXT: bic r0, r9, #-2147483648 ; CHECK-MVE-NEXT: vmov s0, r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll index 654f7a3cb248a6..105d3a22e60ffb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -391,28 +391,28 @@ define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: vmov.f32 s10, s9 ; CHECK-NEXT: vmov r6, s10 -; CHECK-NEXT: asrs r3, r4, #31 -; CHECK-NEXT: subs r4, r4, r6 -; CHECK-NEXT: sbc.w r9, r3, r6, asr #31 +; CHECK-NEXT: asrs r5, r4, #31 +; CHECK-NEXT: subs.w r9, r4, r6 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: sbc.w r5, r5, r6, asr #31 ; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: subs r5, r7, r6 +; CHECK-NEXT: asrs r5, r5, #31 +; CHECK-NEXT: subs r3, r7, r6 ; CHECK-NEXT: asr.w r7, r7, #31 -; CHECK-NEXT: vmov q2[2], q2[0], r5, r8 -; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r8 +; CHECK-NEXT: vmov r3, s14 ; CHECK-NEXT: sbc.w r6, r7, r6, asr #31 ; CHECK-NEXT: asrs r6, r6, #31 -; CHECK-NEXT: subs r7, r3, r5 -; CHECK-NEXT: asr.w r3, r3, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r7 +; CHECK-NEXT: subs r7, r4, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r9, r7 ; CHECK-NEXT: mov.w r7, #0 -; CHECK-NEXT: sbc.w r3, r3, r5, asr #31 ; CHECK-NEXT: bfi r7, r6, #0, #4 -; CHECK-NEXT: asr.w r4, r9, #31 ; CHECK-NEXT: asr.w r6, r12, #31 -; CHECK-NEXT: bfi r7, r4, #4, #4 -; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: bfi r7, r5, #4, #4 ; CHECK-NEXT: bfi r7, r6, #8, #4 +; CHECK-NEXT: asr.w r6, r4, #31 +; CHECK-NEXT: sbc.w r3, r6, r3, asr #31 +; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: bfi r7, r3, #12, #4 ; CHECK-NEXT: vmsr p0, r7 ; CHECK-NEXT: vpst @@ -532,8 +532,10 @@ for.cond.cleanup: ; preds = %vector.body define void @vabd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vabd_loop_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; 
CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: mov.w lr, #256 @@ -558,26 +560,26 @@ define void @vabd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ; CHECK-NEXT: vmov r6, r7, d11 ; CHECK-NEXT: subs.w r8, r4, r3 ; CHECK-NEXT: sbc.w r12, r5, r12 -; CHECK-NEXT: vmov r5, r3, d9 -; CHECK-NEXT: subs.w r10, r6, r5 -; CHECK-NEXT: sbc.w r9, r7, r3 +; CHECK-NEXT: vmov r4, r5, d9 +; CHECK-NEXT: subs r4, r6, r4 +; CHECK-NEXT: sbc.w r9, r7, r5 ; CHECK-NEXT: vmov r6, r7, d8 -; CHECK-NEXT: vmov r4, r3, d10 -; CHECK-NEXT: subs r4, r4, r6 -; CHECK-NEXT: sbcs r3, r7 -; CHECK-NEXT: vmov q4[2], q4[0], r4, r8 -; CHECK-NEXT: vmov r4, r6, d5 -; CHECK-NEXT: vmov r7, r5, d7 +; CHECK-NEXT: vmov r3, r5, d10 +; CHECK-NEXT: subs r3, r3, r6 +; CHECK-NEXT: vmov q4[2], q4[0], r3, r8 +; CHECK-NEXT: sbc.w r3, r5, r7 +; CHECK-NEXT: vmov r5, r8, d5 +; CHECK-NEXT: vmov r7, r6, d7 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: subs r4, r7, r4 -; CHECK-NEXT: vmov q4[3], q4[1], r10, r4 +; CHECK-NEXT: subs r5, r7, r5 +; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 ; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: bfi r4, r3, #0, #4 ; CHECK-NEXT: asr.w r3, r9, #31 ; CHECK-NEXT: bfi r4, r3, #4, #4 ; CHECK-NEXT: asr.w r3, r12, #31 ; CHECK-NEXT: bfi r4, r3, #8, #4 -; CHECK-NEXT: sbc.w r3, r5, r6 +; CHECK-NEXT: sbc.w r3, r6, r8 ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: bfi r4, r3, #12, #4 ; CHECK-NEXT: vmsr p0, r4 @@ -587,7 +589,8 @@ define void @vabd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ; CHECK-NEXT: le lr, .LBB20_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: br label %vector.body diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll index 3e27955ab1f1b7..1aef8cc959f5cc 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll @@ -90,12 +90,12 @@ define arm_aapcs_vfpcc i64 @vaddva_v2i64_i64(<2 x i64> %s1, i64 %x) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %t = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1) diff --git a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll index 7e68cea23e9497..e15fb67ecee81c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll @@ -307,55 +307,55 @@ define hidden <16 x i8> @create_i8(i8 zeroext %a1, i8 zeroext %b1, i8 zeroext %c ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: ldr r4, [sp, #40] +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: ldr r0, [sp, #40] +; CHECK-NEXT: ldr r4, [sp, #36] ; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: ldr r6, [sp, #36] ; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: lsll r4, r11, #16 -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: lsll r6, r7, #24 +; CHECK-NEXT: lsll r0, r11, #16 +; 
CHECK-NEXT: lsll r4, r7, #24 ; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: orr.w r1, r6, r4 -; CHECK-NEXT: ldr r6, [sp, #44] +; CHECK-NEXT: orrs r0, r4 +; CHECK-NEXT: ldr r4, [sp, #44] ; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: lsll r4, r3, #8 +; CHECK-NEXT: ldr r1, [sp, #48] +; CHECK-NEXT: orrs r0, r4 ; CHECK-NEXT: ldr r4, [sp, #72] -; CHECK-NEXT: lsll r6, r3, #8 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: orrs r1, r6 -; CHECK-NEXT: ldr r6, [sp, #48] -; CHECK-NEXT: lsll r4, r5, #16 -; CHECK-NEXT: mov.w r9, #0 -; CHECK-NEXT: orr.w r12, r1, r6 ; CHECK-NEXT: ldr r6, [sp, #68] +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: lsll r4, r5, #16 ; CHECK-NEXT: lsll r6, r1, #24 ; CHECK-NEXT: orrs r6, r4 ; CHECK-NEXT: ldr r4, [sp, #76] +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: lsll r4, r9, #8 ; CHECK-NEXT: orrs r6, r4 ; CHECK-NEXT: ldr r4, [sp, #80] ; CHECK-NEXT: orr.w lr, r6, r4 ; CHECK-NEXT: lsl.w r4, r10, #16 -; CHECK-NEXT: orr.w r0, r4, r0, lsl #22 -; CHECK-NEXT: orr.w r0, r0, r2, lsl #8 -; CHECK-NEXT: add r0, r8 -; CHECK-NEXT: orrs r0, r7 -; CHECK-NEXT: orr.w r0, r0, r11 -; CHECK-NEXT: orr.w r2, r0, r3 -; CHECK-NEXT: ldr r0, [sp, #56] -; CHECK-NEXT: ldr r3, [sp, #52] -; CHECK-NEXT: lsls r0, r0, #16 -; CHECK-NEXT: orr.w r0, r0, r3, lsl #22 -; CHECK-NEXT: ldr r3, [sp, #60] -; CHECK-NEXT: orr.w r0, r0, r3, lsl #8 -; CHECK-NEXT: ldr r3, [sp, #64] -; CHECK-NEXT: add r0, r3 -; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: orr.w r6, r4, r12, lsl #22 +; CHECK-NEXT: orr.w r2, r6, r2, lsl #8 +; CHECK-NEXT: add r2, r8 +; CHECK-NEXT: orrs r2, r7 +; CHECK-NEXT: ldr r7, [sp, #52] +; CHECK-NEXT: orr.w r2, r2, r11 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: ldr r3, [sp, #56] +; CHECK-NEXT: lsls r3, r3, #16 +; CHECK-NEXT: orr.w r3, r3, r7, lsl #22 +; CHECK-NEXT: ldr r7, [sp, #60] +; CHECK-NEXT: orr.w r3, r3, r7, lsl #8 +; CHECK-NEXT: ldr r7, [sp, #64] +; CHECK-NEXT: add r3, r7 +; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: orrs r1, r5 +; CHECK-NEXT: orr.w r3, r1, r9 ; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: orrs r0, r5 ; CHECK-NEXT: mov r2, lr -; CHECK-NEXT: orr.w r3, r0, r9 -; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %conv = zext i8 %a1 to i64 diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll index ad7a09fa50acba..161e32093ee50c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll @@ -89,35 +89,35 @@ entry: define arm_aapcs_vfpcc <8 x half> @foo_half_int16(<8 x i16> %src) { ; CHECK-MVE-LABEL: foo_half_int16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 ; CHECK-MVE-NEXT: vmov.s16 r0, q0[0] -; CHECK-MVE-NEXT: vmov s0, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[1] -; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vcvt.f16.s32 s0, s0 -; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[3] -; CHECK-MVE-NEXT: vins.f16 s0, s2 -; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[2] -; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2 +; CHECK-MVE-NEXT: vmov s4, r0 +; CHECK-MVE-NEXT: vmov.s16 r0, q0[1] +; CHECK-MVE-NEXT: vmov s6, r0 +; CHECK-MVE-NEXT: vcvt.f16.s32 s4, s4 +; CHECK-MVE-NEXT: vcvt.f16.s32 s6, s6 +; CHECK-MVE-NEXT: vmov.s16 r0, q0[3] +; CHECK-MVE-NEXT: vins.f16 s4, s6 +; CHECK-MVE-NEXT: vmov s6, r0 +; CHECK-MVE-NEXT: vmov.s16 r0, q0[2] +; CHECK-MVE-NEXT: vcvt.f16.s32 s6, s6 ; CHECK-MVE-NEXT: vmov s8, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[4] -; CHECK-MVE-NEXT: vcvt.f16.s32 s1, s8 -; CHECK-MVE-NEXT: 
vins.f16 s1, s2 -; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[5] -; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2 +; CHECK-MVE-NEXT: vmov.s16 r0, q0[4] +; CHECK-MVE-NEXT: vcvt.f16.s32 s5, s8 +; CHECK-MVE-NEXT: vins.f16 s5, s6 +; CHECK-MVE-NEXT: vmov s6, r0 +; CHECK-MVE-NEXT: vmov.s16 r0, q0[5] +; CHECK-MVE-NEXT: vcvt.f16.s32 s6, s6 ; CHECK-MVE-NEXT: vmov s8, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[7] +; CHECK-MVE-NEXT: vmov.s16 r0, q0[7] ; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 -; CHECK-MVE-NEXT: vins.f16 s2, s8 +; CHECK-MVE-NEXT: vins.f16 s6, s8 ; CHECK-MVE-NEXT: vmov s8, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[6] +; CHECK-MVE-NEXT: vmov.s16 r0, q0[6] ; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 -; CHECK-MVE-NEXT: vmov s4, r0 -; CHECK-MVE-NEXT: vcvt.f16.s32 s3, s4 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vcvt.f16.s32 s7, s0 +; CHECK-MVE-NEXT: vins.f16 s7, s8 +; CHECK-MVE-NEXT: vmov q0, q1 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: foo_half_int16: @@ -132,35 +132,35 @@ entry: define arm_aapcs_vfpcc <8 x half> @foo_half_uint16(<8 x i16> %src) { ; CHECK-MVE-LABEL: foo_half_uint16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 ; CHECK-MVE-NEXT: vmov.u16 r0, q0[0] -; CHECK-MVE-NEXT: vmov s0, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[1] -; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vcvt.f16.u32 s0, s0 -; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[3] -; CHECK-MVE-NEXT: vins.f16 s0, s2 -; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[2] -; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-MVE-NEXT: vmov s4, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[1] +; CHECK-MVE-NEXT: vmov s6, r0 +; CHECK-MVE-NEXT: vcvt.f16.u32 s4, s4 +; CHECK-MVE-NEXT: vcvt.f16.u32 s6, s6 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[3] +; CHECK-MVE-NEXT: vins.f16 s4, s6 +; CHECK-MVE-NEXT: vmov s6, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[2] +; CHECK-MVE-NEXT: vcvt.f16.u32 s6, s6 ; CHECK-MVE-NEXT: vmov s8, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[4] -; CHECK-MVE-NEXT: vcvt.f16.u32 s1, s8 -; CHECK-MVE-NEXT: vins.f16 s1, s2 -; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[5] -; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[4] +; CHECK-MVE-NEXT: vcvt.f16.u32 s5, s8 +; CHECK-MVE-NEXT: vins.f16 s5, s6 +; CHECK-MVE-NEXT: vmov s6, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[5] +; CHECK-MVE-NEXT: vcvt.f16.u32 s6, s6 ; CHECK-MVE-NEXT: vmov s8, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[7] +; CHECK-MVE-NEXT: vmov.u16 r0, q0[7] ; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 -; CHECK-MVE-NEXT: vins.f16 s2, s8 +; CHECK-MVE-NEXT: vins.f16 s6, s8 ; CHECK-MVE-NEXT: vmov s8, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[6] +; CHECK-MVE-NEXT: vmov.u16 r0, q0[6] ; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 -; CHECK-MVE-NEXT: vmov s4, r0 -; CHECK-MVE-NEXT: vcvt.f16.u32 s3, s4 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vcvt.f16.u32 s7, s0 +; CHECK-MVE-NEXT: vins.f16 s7, s8 +; CHECK-MVE-NEXT: vmov q0, q1 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: foo_half_uint16: @@ -377,15 +377,15 @@ entry: define arm_aapcs_vfpcc <8 x half> @vmovn32_trunc2(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: vmovn32_trunc2: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9 -; 
CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s4 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s5, s5 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s6, s6 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s7, s7 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s4, s0 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s5, s1 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s6, s2 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s7, s3 +; CHECK-MVE-NEXT: vmov q0, q1 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vmovn32_trunc2: diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll index 83d7275358ce32..7b73fc82f224ca 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -730,12 +730,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) { ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q1, #0xffffffff ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <2 x i32> %x to <2 x i64> @@ -827,8 +827,8 @@ entry: define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v8i16_v8i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.u16 r2, q0[1] ; CHECK-NEXT: vmov.u16 r3, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 @@ -850,23 +850,23 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) { ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add lr, r2 -; CHECK-NEXT: vmov r3, r2, d5 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: add r2, lr +; CHECK-NEXT: vmov r3, lr, d5 +; CHECK-NEXT: adds r4, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: adc.w r12, r12, lr ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d1 ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <8 x i16> %x to <8 x i64> %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) @@ -1123,8 +1123,8 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: vmov.u8 r3, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 @@ -1146,63 +1146,63 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, 
q1 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add lr, r2 -; CHECK-NEXT: vmov r3, r2, d5 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: add r2, lr +; CHECK-NEXT: vmov r3, lr, d5 +; CHECK-NEXT: adds r4, r2, r3 ; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: adc.w r12, r12, lr ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d5 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[9] +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d5 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d5 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[13] +; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d5 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d1 ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) @@ -1275,8 +1275,8 @@ entry: define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, 
i64 %a) { ; CHECK-LABEL: add_v8i8_v8i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vmov.u16 r2, q0[1] @@ -1299,23 +1299,23 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) { ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add lr, r2 -; CHECK-NEXT: vmov r3, r2, d5 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: add r2, lr +; CHECK-NEXT: vmov r3, lr, d5 +; CHECK-NEXT: adds r4, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: adc.w r12, r12, lr ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d1 ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <8 x i8> %x to <8 x i64> %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) @@ -1440,12 +1440,12 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll index f9948db66b3b35..667dbc23af0aca 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -658,12 +658,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vcmp.i8 eq, q1, zr -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q1, #0xff -; CHECK-NEXT: vpsel q5, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vpsel q5, q1, q2 ; CHECK-NEXT: vmov.u8 r0, q5[0] ; CHECK-NEXT: vmov.16 q3[0], r0 ; CHECK-NEXT: vmov.u8 r0, q5[1] @@ -681,7 +680,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vmov.u8 r0, q5[7] ; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: vcmp.i16 ne, q3, zr -; CHECK-NEXT: vpsel q6, q1, q0 +; CHECK-NEXT: vpsel q6, q1, q2 ; CHECK-NEXT: vmov.u16 r0, q6[2] ; CHECK-NEXT: vmov.u16 r1, q6[0] ; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 @@ -689,161 +688,161 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vmov.u16 r1, q6[1] ; CHECK-NEXT: 
vmov q3[3], q3[1], r1, r0 ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vpsel q7, q1, q0 +; CHECK-NEXT: vpsel q7, q1, q2 ; CHECK-NEXT: vmov r0, r1, d14 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r1 ; CHECK-NEXT: vmov q3[3], q3[1], r0, r1 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.u8 r1, q2[0] +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.u8 r1, q0[0] ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 ; CHECK-NEXT: vmov.i64 q3, #0xff -; CHECK-NEXT: vand q0, q4, q3 +; CHECK-NEXT: vand q1, q4, q3 ; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vpsel q1, q1, q4 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: vmov r2, r3, d15 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[3] -; CHECK-NEXT: vmov.u8 r3, q2[2] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q0[3] +; CHECK-NEXT: vmov.u8 r3, q0[2] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vpsel q1, q1, q4 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d3 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q6[6] ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q6[4] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q6[7] ; CHECK-NEXT: vmov.u16 r3, q6[5] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q1, q7, q2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vand q6, q6, q3 +; CHECK-NEXT: vpsel q6, q6, q4 ; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[5] -; CHECK-NEXT: vmov.u8 r3, q2[4] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d13 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[7] -; CHECK-NEXT: vmov.u8 r3, q2[6] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov 
q1[2], q1[0], r3, r2 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vpsel q1, q1, q4 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d3 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q5[8] -; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmov.16 q1[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[9] -; CHECK-NEXT: vmov.16 q6[1], r2 +; CHECK-NEXT: vmov.16 q1[1], r2 ; CHECK-NEXT: vmov.u8 r2, q5[10] -; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.16 q1[2], r2 ; CHECK-NEXT: vmov.u8 r2, q5[11] -; CHECK-NEXT: vmov.16 q6[3], r2 +; CHECK-NEXT: vmov.16 q1[3], r2 ; CHECK-NEXT: vmov.u8 r2, q5[12] -; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.16 q1[4], r2 ; CHECK-NEXT: vmov.u8 r2, q5[13] -; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov.16 q1[5], r2 ; CHECK-NEXT: vmov.u8 r2, q5[14] -; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov.u8 r2, q5[15] -; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vpsel q5, q1, q7 +; CHECK-NEXT: vcmp.i16 ne, q1, zr +; CHECK-NEXT: vpsel q5, q7, q2 ; CHECK-NEXT: vmov.u16 r2, q5[2] ; CHECK-NEXT: vmov.u16 r3, q5[0] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[3] ; CHECK-NEXT: vmov.u16 r3, q5[1] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q1, q7, q2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vand q6, q6, q3 +; CHECK-NEXT: vpsel q6, q6, q4 ; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[9] -; CHECK-NEXT: vmov.u8 r3, q2[8] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d13 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[11] -; CHECK-NEXT: vmov.u8 r3, q2[10] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vpsel q1, q1, q4 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d3 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q5[6] ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[7] ; 
CHECK-NEXT: vmov.u16 r3, q5[5] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q1, q1, q7 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q1, q7, q2 ; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[13] -; CHECK-NEXT: vmov.u8 r3, q2[12] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: vpsel q2, q2, q4 +; CHECK-NEXT: vmov r2, r3, d4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, r3, d5 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[15] -; CHECK-NEXT: vmov.u8 r3, q2[14] -; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: vcmp.i32 ne, q1, zr ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vpsel q0, q0, q4 @@ -1488,12 +1487,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %b, ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -1527,12 +1526,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %b, ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -1625,8 +1624,8 @@ entry: define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b, i64 %a) { ; CHECK-LABEL: add_v8i16_v8i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vmov.i8 q3, #0x0 @@ -1652,67 +1651,67 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b, ; CHECK-NEXT: vand q7, q2, q1 ; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vpsel q7, q7, q2 -; 
CHECK-NEXT: vmov r12, lr, d15 -; CHECK-NEXT: vmov r2, r3, d14 -; CHECK-NEXT: orr.w lr, lr, r3 -; CHECK-NEXT: add r12, r2 -; CHECK-NEXT: vmov r3, r2, d13 -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 -; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vmov r12, r3, d15 +; CHECK-NEXT: vmov lr, r2, d14 +; CHECK-NEXT: orr.w r5, r2, r3 +; CHECK-NEXT: vmov r2, r4, d13 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r4 +; CHECK-NEXT: add.w r3, lr, r12 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r4 ; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: vmov.u16 r4, q0[2] ; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov q6[2], q6[0], r4, r2 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vpsel q6, q6, q2 -; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: vmov.u16 r2, q5[6] -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q5[7] -; CHECK-NEXT: vmov.u16 r3, q5[5] -; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vmov r2, r4, d12 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r5, r4 +; CHECK-NEXT: vmov r5, r4, d13 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov.u16 r5, q5[6] +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov.u16 r4, q5[4] +; CHECK-NEXT: vmov q6[2], q6[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q5[7] +; CHECK-NEXT: vmov.u16 r4, q5[5] +; CHECK-NEXT: vmov q6[3], q6[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q6, zr ; CHECK-NEXT: vpsel q3, q4, q3 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov r5, r4, d6 +; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 +; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 +; CHECK-NEXT: vmov.u16 r5, q0[5] +; CHECK-NEXT: vmov.u16 r4, q0[4] ; CHECK-NEXT: vcmp.i32 ne, q4, zr -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vpsel q4, q4, q2 -; CHECK-NEXT: vmov r2, r3, d8 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d9 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: vmov r5, r4, d8 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r5, r4 +; CHECK-NEXT: vmov q3[3], q3[1], r5, r4 +; CHECK-NEXT: vmov.u16 r5, q0[7] +; CHECK-NEXT: vmov.u16 r4, q0[6] ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: vmov r5, r4, d0 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9, d10, 
d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i64> @@ -1725,8 +1724,8 @@ entry: define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b, i64 %a) { ; CHECK-LABEL: add_v8i16_v8i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmov.i8 q2, #0x0 @@ -1753,12 +1752,12 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b, ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vpsel q6, q6, q1 -; CHECK-NEXT: vmov lr, r12, d13 -; CHECK-NEXT: vmov r3, r2, d12 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, r12, d13 +; CHECK-NEXT: vmov r3, lr, d12 +; CHECK-NEXT: adds r4, r3, r2 ; CHECK-NEXT: vmov r2, r3, d11 ; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 +; CHECK-NEXT: adc.w r12, r12, lr ; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 ; CHECK-NEXT: vmov.s16 r2, q0[3] ; CHECK-NEXT: vmov.s16 r3, q0[2] @@ -1769,14 +1768,14 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b, ; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 ; CHECK-NEXT: vpsel q5, q5, q1 ; CHECK-NEXT: vmov r2, r3, d10 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d11 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u16 r2, q4[6] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u16 r3, q4[4] -; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d11 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u16 r3, q4[6] +; CHECK-NEXT: vmov.u16 r2, q4[4] +; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 ; CHECK-NEXT: vmov.u16 r2, q4[7] ; CHECK-NEXT: vmov.u16 r3, q4[5] ; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 @@ -1794,14 +1793,14 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b, ; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 ; CHECK-NEXT: vpsel q3, q3, q1 ; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d7 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov r3, r2, d5 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 ; CHECK-NEXT: vmov.s16 r2, q0[7] ; CHECK-NEXT: vmov.s16 r3, q0[6] ; CHECK-NEXT: vcmp.i32 ne, q2, zr @@ -1811,15 +1810,15 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b, ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d1 ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, 
r0, r4 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i64> @@ -1849,12 +1848,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %b, ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: orr.w r3, r3, lr -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r12, r3, d1 +; CHECK-NEXT: vmov lr, r2, d0 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: add.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -1892,12 +1891,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %b, ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -2061,18 +2060,17 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vcmp.i8 eq, q1, zr -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q1, #0xff -; CHECK-NEXT: vpsel q5, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vpsel q5, q1, q2 ; CHECK-NEXT: vmov.u8 r2, q5[0] ; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[1] @@ -2090,7 +2088,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov.u8 r2, q5[7] ; CHECK-NEXT: vmov.16 q3[7], r2 ; CHECK-NEXT: vcmp.i16 ne, q3, zr -; CHECK-NEXT: vpsel q6, q1, q0 +; CHECK-NEXT: vpsel q6, q1, q2 ; CHECK-NEXT: vmov.u16 r2, q6[2] ; CHECK-NEXT: vmov.u16 r3, q6[0] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 @@ -2098,175 +2096,175 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov.u16 r3, q6[1] ; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vpsel q7, q1, q0 +; CHECK-NEXT: vpsel q7, q1, q2 ; CHECK-NEXT: vmov r2, r3, d14 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[1] -; CHECK-NEXT: vmov.u8 r3, q2[0] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: vmov.u8 r3, q0[0] ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 ; CHECK-NEXT: vmov.i64 q3, #0xff -; CHECK-NEXT: vand q0, q4, q3 +; CHECK-NEXT: vand q1, q4, q3 ; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vpsel q0, q0, q4 -; 
CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: orr.w lr, lr, r3 -; CHECK-NEXT: add r12, r2 -; CHECK-NEXT: vmov r3, r2, d15 +; CHECK-NEXT: vpsel q1, q1, q4 +; CHECK-NEXT: vmov r12, r3, d3 +; CHECK-NEXT: vmov lr, r2, d2 +; CHECK-NEXT: orr.w r5, r2, r3 +; CHECK-NEXT: vmov r2, r4, d15 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 +; CHECK-NEXT: add.w r3, lr, r12 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 +; CHECK-NEXT: vmov.u8 r2, q0[3] +; CHECK-NEXT: vmov.u8 r4, q0[2] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q1[2], q1[0], r4, r2 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vmov.u8 r2, q2[3] -; CHECK-NEXT: vmov.u8 r3, q2[2] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: vmov.u16 r2, q6[6] -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q6[4] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q6[7] -; CHECK-NEXT: vmov.u16 r3, q6[5] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q1, q7 -; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[5] -; CHECK-NEXT: vmov.u8 r3, q2[4] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[7] -; CHECK-NEXT: vmov.u8 r3, q2[6] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: vmov.u8 r2, q5[8] -; CHECK-NEXT: vmov.16 q6[0], r2 -; CHECK-NEXT: vmov.u8 r2, q5[9] -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov.u8 r2, q5[10] -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.u8 r2, q5[11] -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov.u8 r2, q5[12] -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov.u8 r2, q5[13] -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov.u8 r2, q5[14] -; CHECK-NEXT: vmov.16 q6[6], r2 -; CHECK-NEXT: vmov.u8 r2, q5[15] -; CHECK-NEXT: vmov.16 q6[7], r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vpsel q5, q1, q7 -; CHECK-NEXT: vmov.u16 r2, q5[2] -; CHECK-NEXT: vmov.u16 r3, q5[0] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q5[3] -; CHECK-NEXT: vmov.u16 r3, q5[1] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q1, q7 -; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[9] -; CHECK-NEXT: vmov.u8 r3, q2[8] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, 
r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[11] -; CHECK-NEXT: vmov.u8 r3, q2[10] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: vmov.u16 r2, q5[6] -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q5[7] -; CHECK-NEXT: vmov.u16 r3, q5[5] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q1, q1, q7 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[13] -; CHECK-NEXT: vmov.u8 r3, q2[12] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q2[15] -; CHECK-NEXT: vmov.u8 r3, q2[14] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vpsel q1, q1, q4 +; CHECK-NEXT: vmov r2, r4, d2 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r5, r4 +; CHECK-NEXT: vmov r5, r4, d3 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov.u16 r5, q6[6] +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov.u16 r4, q6[4] +; CHECK-NEXT: vmov q1[2], q1[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q6[7] +; CHECK-NEXT: vmov.u16 r4, q6[5] +; CHECK-NEXT: vmov q1[3], q1[1], r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q1, q7, q2 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vmov q6[2], q6[0], r5, r4 +; CHECK-NEXT: vmov q6[3], q6[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q0[5] +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r4, r5 +; CHECK-NEXT: vand q6, q6, q3 +; CHECK-NEXT: vpsel q6, q6, q4 +; CHECK-NEXT: vmov r5, r4, d12 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d13 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r5, r4 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q0[7] +; CHECK-NEXT: vmov.u8 r4, q0[6] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q1[2], q1[0], r4, r5 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vpsel q1, q1, q4 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d3 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q5[8] +; CHECK-NEXT: vmov.16 q1[0], r5 +; CHECK-NEXT: vmov.u8 r5, q5[9] +; CHECK-NEXT: vmov.16 q1[1], r5 +; CHECK-NEXT: vmov.u8 r5, q5[10] +; CHECK-NEXT: vmov.16 q1[2], r5 +; CHECK-NEXT: 
vmov.u8 r5, q5[11] +; CHECK-NEXT: vmov.16 q1[3], r5 +; CHECK-NEXT: vmov.u8 r5, q5[12] +; CHECK-NEXT: vmov.16 q1[4], r5 +; CHECK-NEXT: vmov.u8 r5, q5[13] +; CHECK-NEXT: vmov.16 q1[5], r5 +; CHECK-NEXT: vmov.u8 r5, q5[14] +; CHECK-NEXT: vmov.16 q1[6], r5 +; CHECK-NEXT: vmov.u8 r5, q5[15] +; CHECK-NEXT: vmov.16 q1[7], r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vcmp.i16 ne, q1, zr +; CHECK-NEXT: vpsel q5, q7, q2 +; CHECK-NEXT: vmov.u16 r5, q5[2] +; CHECK-NEXT: vmov.u16 r4, q5[0] +; CHECK-NEXT: vmov q1[2], q1[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q5[3] +; CHECK-NEXT: vmov.u16 r4, q5[1] +; CHECK-NEXT: vmov q1[3], q1[1], r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q1, q7, q2 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vmov q6[2], q6[0], r5, r4 +; CHECK-NEXT: vmov q6[3], q6[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q0[9] +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: vmov q6[2], q6[0], r4, r5 +; CHECK-NEXT: vand q6, q6, q3 +; CHECK-NEXT: vpsel q6, q6, q4 +; CHECK-NEXT: vmov r5, r4, d12 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d13 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r5, r4 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q0[11] +; CHECK-NEXT: vmov.u8 r4, q0[10] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q1[2], q1[0], r4, r5 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vpsel q1, q1, q4 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d3 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov.u16 r5, q5[6] +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov.u16 r4, q5[4] +; CHECK-NEXT: vmov q1[2], q1[0], r4, r5 +; CHECK-NEXT: vmov.u16 r5, q5[7] +; CHECK-NEXT: vmov.u16 r4, q5[5] +; CHECK-NEXT: vmov q1[3], q1[1], r4, r5 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q1, q7, q2 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vmov q2[2], q2[0], r5, r4 +; CHECK-NEXT: vmov q2[3], q2[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q0[13] +; CHECK-NEXT: vmov.u8 r4, q0[12] +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov q2[2], q2[0], r4, r5 +; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: vpsel q2, q2, q4 +; CHECK-NEXT: vmov r5, r4, d4 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r5, r4 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r4 +; CHECK-NEXT: vmov.u8 r5, q0[15] +; CHECK-NEXT: vmov.u8 r4, q0[14] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 ; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: vmov r5, r4, d0 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -2279,8 +2277,8 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> 
%x, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vcmp.i8 eq, q1, zr @@ -2325,12 +2323,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: vmov q7[3], q7[1], r3, r2 ; CHECK-NEXT: vpsel q7, q7, q3 -; CHECK-NEXT: vmov lr, r12, d15 -; CHECK-NEXT: vmov r3, r2, d14 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, r12, d15 +; CHECK-NEXT: vmov r3, lr, d14 +; CHECK-NEXT: adds r4, r3, r2 ; CHECK-NEXT: vmov r2, r3, d13 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: adc.w r12, r12, lr ; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q0[3] ; CHECK-NEXT: vmov.s8 r3, q0[2] @@ -2341,14 +2339,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vpsel q6, q6, q3 ; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u16 r2, q5[6] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d13 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u16 r3, q5[6] +; CHECK-NEXT: vmov.u16 r2, q5[4] +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 ; CHECK-NEXT: vmov.u16 r2, q5[7] ; CHECK-NEXT: vmov.u16 r3, q5[5] ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 @@ -2366,14 +2364,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vpsel q6, q6, q3 ; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d11 -; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d13 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov r3, r2, d11 +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 ; CHECK-NEXT: vmov.s8 r2, q0[7] ; CHECK-NEXT: vmov.s8 r3, q0[6] ; CHECK-NEXT: vcmp.i32 ne, q5, zr @@ -2383,10 +2381,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 ; CHECK-NEXT: vpsel q5, q5, q3 ; CHECK-NEXT: vmov r2, r3, d10 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d11 -; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d11 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -2403,7 +2402,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[15] ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: 
adc.w r12, r12, r3 ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q4, q2, q1 ; CHECK-NEXT: vmov.u16 r2, q4[2] @@ -2426,14 +2424,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vpsel q6, q6, q3 ; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d13 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d11 -; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d13 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov r3, r2, d11 +; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 ; CHECK-NEXT: vmov.s8 r2, q0[11] ; CHECK-NEXT: vmov.s8 r3, q0[10] ; CHECK-NEXT: vcmp.i32 ne, q5, zr @@ -2443,14 +2441,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 ; CHECK-NEXT: vpsel q5, q5, q3 ; CHECK-NEXT: vmov r2, r3, d10 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d11 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u16 r2, q4[6] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u16 r3, q4[4] -; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d11 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u16 r3, q4[6] +; CHECK-NEXT: vmov.u16 r2, q4[4] +; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 ; CHECK-NEXT: vmov.u16 r2, q4[7] ; CHECK-NEXT: vmov.u16 r3, q4[5] ; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 @@ -2468,14 +2466,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 ; CHECK-NEXT: vpsel q2, q2, q3 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d5 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov r3, r2, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vmov.s8 r2, q0[15] ; CHECK-NEXT: vmov.s8 r3, q0[14] ; CHECK-NEXT: vcmp.i32 ne, q1, zr @@ -2485,15 +2483,15 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d1 ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> @@ -2523,12 +2521,12 @@ define 
arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %b, i6 ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: orr.w r3, r3, lr -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r12, r3, d1 +; CHECK-NEXT: vmov lr, r2, d0 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: add.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -2566,12 +2564,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %b, i6 ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -2599,12 +2597,12 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %b, i64 ; CHECK-NEXT: bfi r12, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r12 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i64> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll index 6ab1a9344bb238..121c234591631c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -9,41 +9,41 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: cmp r1, #4 ; CHECK-NEXT: bhs .LBB0_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: b .LBB0_7 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB0_4: @ %vector.ph -; CHECK-NEXT: bic r3, r1, #3 -; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: bic r12, r1, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: add.w lr, r3, r2, lsr #2 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r2], #16 -; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r3], #16 +; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: cmp r12, r1 +; CHECK-NEXT: beq .LBB0_9 ; CHECK-NEXT: 
.LBB0_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: sub.w lr, r1, r12 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: .LBB0_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [r2], #4 -; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: add r2, r1 ; CHECK-NEXT: le lr, .LBB0_8 -; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup +; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -119,11 +119,11 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: vmov lr, r3, d1 +; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: vmov r2, r4, d0 -; CHECK-NEXT: mul r3, lr, r3 -; CHECK-NEXT: mul r2, r4, r2 +; CHECK-NEXT: vmov lr, r4, d0 +; CHECK-NEXT: mul r2, r3, r2 +; CHECK-NEXT: mul r3, lr, r4 ; CHECK-NEXT: mul r2, r3, r2 ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1 @@ -216,12 +216,12 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: le lr, .LBB2_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r2, r12, d1 ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: vmov r2, r4, d0 -; CHECK-NEXT: and.w r12, r12, lr -; CHECK-NEXT: and.w r2, r2, r4 +; CHECK-NEXT: vmov lr, r4, d0 ; CHECK-NEXT: and.w r2, r2, r12 +; CHECK-NEXT: and.w r4, r4, lr +; CHECK-NEXT: and.w r2, r2, r4 ; CHECK-NEXT: beq .LBB2_9 ; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r3 @@ -313,12 +313,12 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: le lr, .LBB3_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r2, r12, d1 ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: vmov r2, r4, d0 -; CHECK-NEXT: orr.w r12, r12, lr -; CHECK-NEXT: orr.w r2, r2, r4 +; CHECK-NEXT: vmov lr, r4, d0 ; CHECK-NEXT: orr.w r2, r2, r12 +; CHECK-NEXT: orr.w r4, r4, lr +; CHECK-NEXT: orr.w r2, r2, r4 ; CHECK-NEXT: beq .LBB3_9 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r3 @@ -410,12 +410,12 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: veor q0, q1, q0 ; CHECK-NEXT: le lr, .LBB4_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r2, r12, d1 ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: vmov r2, r4, d0 -; CHECK-NEXT: eor.w r12, r12, lr -; CHECK-NEXT: eor.w r2, r2, r4 +; CHECK-NEXT: vmov lr, r4, d0 ; CHECK-NEXT: eor.w r2, r2, r12 +; CHECK-NEXT: eor.w r4, r4, lr +; CHECK-NEXT: eor.w r2, r2, r4 ; CHECK-NEXT: beq .LBB4_9 ; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r3 @@ -775,42 +775,42 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB8_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: cmp r1, #4 ; CHECK-NEXT: bhs .LBB8_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: mvn r2, #-2147483648 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB8_7 ; CHECK-NEXT: .LBB8_3: -; CHECK-NEXT: mvn r0, #-2147483648 +; CHECK-NEXT: mvn r2, #-2147483648 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB8_4: @ %vector.ph -; CHECK-NEXT: 
bic r3, r1, #3 -; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: bic r12, r1, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: add.w lr, r3, r2, lsr #2 +; CHECK-NEXT: mvn r2, #-2147483648 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: .LBB8_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r2], #16 -; CHECK-NEXT: vminv.s32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r3], #16 +; CHECK-NEXT: vminv.s32 r2, q0 ; CHECK-NEXT: le lr, .LBB8_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: cmp r12, r1 +; CHECK-NEXT: beq .LBB8_9 ; CHECK-NEXT: .LBB8_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: sub.w lr, r1, r12 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: .LBB8_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [r2], #4 -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel r0, r0, r1, lt +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: cmp r2, r1 +; CHECK-NEXT: csel r2, r2, r1, lt ; CHECK-NEXT: le lr, .LBB8_8 -; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup +; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -968,42 +968,42 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB10_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: cmp r1, #4 ; CHECK-NEXT: bhs .LBB10_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: mov.w r0, #-2147483648 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: mov.w r2, #-2147483648 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB10_7 ; CHECK-NEXT: .LBB10_3: -; CHECK-NEXT: mov.w r0, #-2147483648 +; CHECK-NEXT: mov.w r2, #-2147483648 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB10_4: @ %vector.ph -; CHECK-NEXT: bic r3, r1, #3 -; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: mov.w r0, #-2147483648 -; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: bic r12, r1, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: add.w lr, r3, r2, lsr #2 +; CHECK-NEXT: mov.w r2, #-2147483648 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: .LBB10_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r2], #16 -; CHECK-NEXT: vmaxv.s32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r3], #16 +; CHECK-NEXT: vmaxv.s32 r2, q0 ; CHECK-NEXT: le lr, .LBB10_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: cmp r12, r1 +; CHECK-NEXT: beq .LBB10_9 ; CHECK-NEXT: .LBB10_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: sub.w lr, r1, r12 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: .LBB10_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [r2], #4 -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel r0, r0, r1, gt +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: cmp r2, r1 +; CHECK-NEXT: csel r2, r2, r1, gt ; CHECK-NEXT: le lr, .LBB10_8 -; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup +; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, 
pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -1161,42 +1161,42 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB12_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: cmp r1, #4 ; CHECK-NEXT: bhs .LBB12_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: mov.w r0, #-1 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: mov.w r2, #-1 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB12_7 ; CHECK-NEXT: .LBB12_3: -; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: mov.w r2, #-1 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB12_4: @ %vector.ph -; CHECK-NEXT: bic r3, r1, #3 -; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: mov.w r0, #-1 -; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: bic r12, r1, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: add.w lr, r3, r2, lsr #2 +; CHECK-NEXT: mov.w r2, #-1 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: .LBB12_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r2], #16 -; CHECK-NEXT: vminv.u32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r3], #16 +; CHECK-NEXT: vminv.u32 r2, q0 ; CHECK-NEXT: le lr, .LBB12_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: cmp r12, r1 +; CHECK-NEXT: beq .LBB12_9 ; CHECK-NEXT: .LBB12_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: sub.w lr, r1, r12 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: .LBB12_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [r2], #4 -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel r0, r0, r1, hi +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: cmp r2, r1 +; CHECK-NEXT: csel r2, r2, r1, hi ; CHECK-NEXT: le lr, .LBB12_8 -; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup +; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -1354,42 +1354,42 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB14_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: cmp r1, #4 ; CHECK-NEXT: bhs .LBB14_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: b .LBB14_7 ; CHECK-NEXT: .LBB14_3: -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB14_4: @ %vector.ph -; CHECK-NEXT: bic r3, r1, #3 -; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: bic r12, r1, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: add.w lr, r3, r2, lsr #2 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: .LBB14_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r2], #16 -; CHECK-NEXT: vmaxv.u32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r3], #16 +; CHECK-NEXT: vmaxv.u32 r2, q0 ; CHECK-NEXT: le lr, .LBB14_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: cmp r12, r1 +; CHECK-NEXT: beq .LBB14_9 ; CHECK-NEXT: .LBB14_7: @ %for.body.preheader1 -; 
CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: sub.w lr, r1, r12 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: .LBB14_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [r2], #4 -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: csel r0, r0, r1, hi +; CHECK-NEXT: ldr r1, [r0], #4 +; CHECK-NEXT: cmp r2, r1 +; CHECK-NEXT: csel r2, r2, r1, hi ; CHECK-NEXT: le lr, .LBB14_8 -; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup +; CHECK-NEXT: .LBB14_9: @ %for.cond.cleanup +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll index 2dec589448d2d9..d50d0fb55cb8c2 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -919,12 +919,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmullb.u32 q2, q0, q1 -; CHECK-NEXT: vmov lr, r12, d5 -; CHECK-NEXT: vmov r3, r2, d4 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: vmov r3, lr, d4 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <2 x i32> %x to <2 x i64> @@ -941,12 +941,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmullb.s32 q2, q0, q1 -; CHECK-NEXT: vmov lr, r12, d5 -; CHECK-NEXT: vmov r3, r2, d4 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: vmov r3, lr, d4 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <2 x i32> %x to <2 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll index 63b1431ac0fa41..ba7f088f7c2049 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -819,49 +819,48 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0xff -; CHECK-NEXT: vpsel q6, q2, q0 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vpsel q6, q3, q2 +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill ; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q2[0], r0 ; CHECK-NEXT: vmov.u8 r0, q6[1] -; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.16 q2[1], r0 ; CHECK-NEXT: vmov.u8 r0, q6[2] -; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q2[2], r0 ; CHECK-NEXT: vmov.u8 r0, q6[3] -; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov.u8 r0, q6[4] -; CHECK-NEXT: vmov.16 q0[4], r0 
+; CHECK-NEXT: vmov.16 q2[4], r0 ; CHECK-NEXT: vmov.u8 r0, q6[5] -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov.u8 r0, q6[6] -; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov.u8 r0, q6[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q0, zr -; CHECK-NEXT: vmov.u8 r2, q3[0] -; CHECK-NEXT: vpsel q7, q2, q4 +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: vcmp.i16 ne, q2, zr +; CHECK-NEXT: vpsel q7, q3, q4 ; CHECK-NEXT: vmov.u16 r0, q7[2] ; CHECK-NEXT: vmov.u16 r1, q7[0] -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov.u16 r0, q7[3] ; CHECK-NEXT: vmov.u16 r1, q7[1] -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q0, q2, q4 -; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vpsel q3, q3, q4 +; CHECK-NEXT: vmov r0, r1, d6 ; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 ; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 ; CHECK-NEXT: vmov.u8 r0, q1[1] ; CHECK-NEXT: vmov.u8 r1, q1[0] ; CHECK-NEXT: vcmp.i32 ne, q2, zr ; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 -; CHECK-NEXT: vmov.u8 r1, q3[1] +; CHECK-NEXT: vmov.u8 r1, q0[1] ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 ; CHECK-NEXT: vand q5, q5, q2 @@ -879,53 +878,53 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: adds.w r12, r2, r0 -; CHECK-NEXT: vmov.u8 r0, q3[2] +; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: adc.w lr, r3, r1 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.u8 r3, q1[2] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q3[3] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[3] ; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vpsel q0, q0, q5 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vpsel q3, q3, q5 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r2, r3, d7 ; CHECK-NEXT: adds.w r0, r0, r12 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u16 r2, q7[6] ; CHECK-NEXT: vmov.u16 r3, q7[4] -; CHECK-NEXT: vmov.u8 r0, q3[4] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q7[7] ; CHECK-NEXT: vmov.u16 r3, q7[5] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 
16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q3, q3, q4 +; CHECK-NEXT: vmov r2, r3, d6 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[5] ; CHECK-NEXT: vmov.u8 r3, q1[4] ; CHECK-NEXT: vcmp.i32 ne, q4, zr ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q3[5] +; CHECK-NEXT: vmov.u8 r3, q0[5] ; CHECK-NEXT: vmov q7[2], q7[0], r0, r3 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vand q7, q7, q2 @@ -945,69 +944,69 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov.u8 r0, q3[6] -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[7] ; CHECK-NEXT: vmov.u8 r3, q1[6] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q3[7] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[7] ; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vpsel q0, q0, q5 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vpsel q3, q3, q5 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r2, r3, d7 ; CHECK-NEXT: adds.w r0, r0, r12 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q6[8] ; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q6[9] -; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov.16 q3[1], r2 ; CHECK-NEXT: vmov.u8 r2, q6[10] -; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q3[2], r2 ; CHECK-NEXT: vmov.u8 r2, q6[11] -; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.16 q3[3], r2 ; CHECK-NEXT: vmov.u8 r2, q6[12] -; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q3[4], r2 ; CHECK-NEXT: vmov.u8 r2, q6[13] -; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.16 q3[5], r2 ; CHECK-NEXT: vmov.u8 r2, q6[14] -; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q3[6], r2 ; CHECK-NEXT: vmov.u8 r2, q6[15] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vmov.16 q3[7], r2 +; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vcmp.i16 ne, q3, zr ; CHECK-NEXT: vpsel q6, q7, q4 ; CHECK-NEXT: vmov.u16 r2, q6[2] ; CHECK-NEXT: vmov.u16 r3, q6[0] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q6[3] ; CHECK-NEXT: vmov.u16 r3, q6[1] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: 
vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q0, q7, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vpsel q3, q7, q4 +; CHECK-NEXT: vmov r2, r3, d6 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[9] ; CHECK-NEXT: vmov.u8 r3, q1[8] ; CHECK-NEXT: vcmp.i32 ne, q4, zr ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q3[9] +; CHECK-NEXT: vmov.u8 r3, q0[9] ; CHECK-NEXT: vmov q7[2], q7[0], r0, r3 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vand q7, q7, q2 @@ -1026,52 +1025,52 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov.u8 r0, q3[10] -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[11] ; CHECK-NEXT: vmov.u8 r3, q1[10] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q3[11] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[11] ; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vpsel q0, q0, q5 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vpsel q3, q3, q5 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r2, r3, d7 ; CHECK-NEXT: adds.w r0, r0, r12 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u16 r2, q6[6] ; CHECK-NEXT: vmov.u16 r3, q6[4] -; CHECK-NEXT: vmov.u8 r0, q3[12] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q6[7] ; CHECK-NEXT: vmov.u16 r3, q6[5] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q3, q3, q4 +; CHECK-NEXT: vmov r2, r3, d6 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[13] ; CHECK-NEXT: vmov.u8 r3, q1[12] ; CHECK-NEXT: vcmp.i32 ne, q4, zr ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q3[13] +; CHECK-NEXT: vmov.u8 r3, q0[13] ; CHECK-NEXT: vmov q6[2], q6[0], r0, r3 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vand q6, q6, q2 @@ -1090,22 +1089,22 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: adc.w lr, r1, r3 -; 
CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov.u8 r0, q3[14] -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[15] ; CHECK-NEXT: vmov.u8 r3, q1[14] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q3[15] -; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[15] +; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 @@ -1256,23 +1255,23 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q5[8] ; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[9] -; CHECK-NEXT: vmov.16 q6[1], r2 +; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov.u8 r2, q5[10] -; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.16 q0[2], r2 ; CHECK-NEXT: vmov.u8 r2, q5[11] -; CHECK-NEXT: vmov.16 q6[3], r2 +; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov.u8 r2, q5[12] -; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vmov.u8 r2, q5[13] -; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov.16 q0[5], r2 ; CHECK-NEXT: vmov.u8 r2, q5[14] -; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.16 q0[6], r2 ; CHECK-NEXT: vmov.u8 r2, q5[15] -; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vmov.s8 r0, q1[8] -; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vmov.s8 r1, q3[8] ; CHECK-NEXT: vpsel q5, q2, q7 ; CHECK-NEXT: smull r0, r1, r1, r0 @@ -1635,16 +1634,16 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: vmov r0, r12, d3 ; CHECK-NEXT: vmov r2, lr, d1 -; CHECK-NEXT: vmov r4, r9, d2 +; CHECK-NEXT: vmov r4, r8, d2 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: umull r1, r8, r2, r0 -; CHECK-NEXT: umull r3, r5, r6, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 -; CHECK-NEXT: mla r1, r2, r12, r8 +; CHECK-NEXT: vmov r6, r9, d0 +; CHECK-NEXT: umull r1, r3, r2, r0 +; CHECK-NEXT: umull r5, r7, r6, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r1 +; CHECK-NEXT: mla r1, r2, r12, r3 ; CHECK-NEXT: mla r0, lr, r0, r1 -; CHECK-NEXT: mla r1, r6, r9, r5 -; CHECK-NEXT: mla r1, r7, r4, r1 +; CHECK-NEXT: mla r1, r6, r8, r7 +; CHECK-NEXT: mla r1, r9, r4, r1 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: orrs r0, r1 @@ -1737,12 +1736,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q3, q0 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; 
CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -1773,12 +1772,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q3, q0 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -1997,12 +1996,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -2046,12 +2045,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -2292,56 +2291,55 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0xff -; CHECK-NEXT: vpsel q6, q2, q0 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vpsel q6, q3, q2 +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill ; CHECK-NEXT: vmov.u8 r2, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q6[1] -; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov.16 q2[1], r2 ; CHECK-NEXT: vmov.u8 r2, q6[2] -; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q2[2], r2 ; CHECK-NEXT: vmov.u8 r2, q6[3] -; CHECK-NEXT: vmov.16 q0[3], 
r2 +; CHECK-NEXT: vmov.16 q2[3], r2 ; CHECK-NEXT: vmov.u8 r2, q6[4] -; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q2[4], r2 ; CHECK-NEXT: vmov.u8 r2, q6[5] -; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.16 q2[5], r2 ; CHECK-NEXT: vmov.u8 r2, q6[6] -; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q2[6], r2 ; CHECK-NEXT: vmov.u8 r2, q6[7] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q0, zr -; CHECK-NEXT: vmov.u8 r4, q3[2] -; CHECK-NEXT: vpsel q7, q2, q4 +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov.u8 r4, q0[2] +; CHECK-NEXT: vcmp.i16 ne, q2, zr +; CHECK-NEXT: vpsel q7, q3, q4 ; CHECK-NEXT: vmov.u16 r2, q7[2] ; CHECK-NEXT: vmov.u16 r3, q7[0] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q7[3] ; CHECK-NEXT: vmov.u16 r3, q7[1] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q0, q2, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vpsel q3, q3, q4 +; CHECK-NEXT: vmov r2, r3, d6 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[1] ; CHECK-NEXT: vmov.u8 r3, q1[0] ; CHECK-NEXT: vcmp.i32 ne, q2, zr ; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q3[1] -; CHECK-NEXT: vmov.u8 r2, q3[0] +; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: vmov.u8 r2, q0[0] ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vand q5, q5, q2 @@ -2356,251 +2354,251 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov q4[2], q4[0], r2, lr ; CHECK-NEXT: vmov q4[3], q4[1], r3, r12 ; CHECK-NEXT: vpsel q4, q4, q5 -; CHECK-NEXT: vmov lr, r12, d9 -; CHECK-NEXT: vmov r3, r2, d8 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 +; CHECK-NEXT: vmov r2, r12, d9 +; CHECK-NEXT: vmov r3, lr, d8 +; CHECK-NEXT: adds r6, r3, r2 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: adc.w r12, r12, lr +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.u8 r3, q1[2] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q3[3] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[3] ; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vmov r5, s16 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r4, s12 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r4, r5, r5, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 -; CHECK-NEXT: vpsel q0, q0, q5 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: umull r5, r4, r5, r4 +; CHECK-NEXT: vmov q3[2], q3[0], r5, r2 +; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: vpsel q3, q3, q5 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: vmov r6, r5, d7 ; CHECK-NEXT: adc.w r3, r3, r12 -; 
CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.u16 r5, q7[6] -; CHECK-NEXT: vmov.u16 r4, q7[4] -; CHECK-NEXT: vmov.u8 r2, q3[4] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q7[7] -; CHECK-NEXT: vmov.u16 r4, q7[5] -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r5, r4, d0 -; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 -; CHECK-NEXT: vmov.u8 r5, q1[5] -; CHECK-NEXT: vmov.u8 r4, q1[4] +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.u16 r6, q7[6] +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov.u16 r5, q7[4] +; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 +; CHECK-NEXT: vmov.u16 r6, q7[7] +; CHECK-NEXT: vmov.u16 r5, q7[5] +; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q3, q3, q4 +; CHECK-NEXT: vmov r6, r5, d6 +; CHECK-NEXT: vmov q4[2], q4[0], r6, r5 +; CHECK-NEXT: vmov q4[3], q4[1], r6, r5 +; CHECK-NEXT: vmov.u8 r6, q1[5] +; CHECK-NEXT: vmov.u8 r5, q1[4] ; CHECK-NEXT: vcmp.i32 ne, q4, zr -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q3[5] -; CHECK-NEXT: vmov q7[2], q7[0], r2, r4 +; CHECK-NEXT: vmov q4[2], q4[0], r5, r6 +; CHECK-NEXT: vmov.u8 r5, q0[5] +; CHECK-NEXT: vmov q7[2], q7[0], r4, r5 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vand q7, q7, q2 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r2, s30 -; CHECK-NEXT: vmov r3, s28 +; CHECK-NEXT: vmov r6, s18 +; CHECK-NEXT: vmov r5, s30 +; CHECK-NEXT: vmov r2, s28 ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: umull r2, r4, r2, r4 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r6 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[6] ; CHECK-NEXT: vpsel q4, q4, q5 -; CHECK-NEXT: vmov r2, r3, d8 -; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: vmov r2, r6, d8 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: vmov.u8 r2, q3[6] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: vmov.u8 r5, q1[7] -; CHECK-NEXT: vmov.u8 r4, q1[6] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q3[7] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d9 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r6, r5, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r6, r5 +; CHECK-NEXT: vmov q3[3], q3[1], r6, r5 +; CHECK-NEXT: vmov.u8 r6, q1[7] +; CHECK-NEXT: vmov.u8 r5, q1[6] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 +; CHECK-NEXT: vmov.u8 r5, q0[7] +; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r6, s14 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; 
CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vpsel q0, q0, q5 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: umull r2, r4, r2, r4 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 +; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: vpsel q3, q3, q5 +; CHECK-NEXT: vmov r2, r6, d6 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q6[8] -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.16 q0[0], r5 -; CHECK-NEXT: vmov.u8 r5, q6[9] -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.u8 r5, q6[10] -; CHECK-NEXT: vmov.16 q0[2], r5 -; CHECK-NEXT: vmov.u8 r5, q6[11] -; CHECK-NEXT: vmov.16 q0[3], r5 -; CHECK-NEXT: vmov.u8 r5, q6[12] -; CHECK-NEXT: vmov.16 q0[4], r5 -; CHECK-NEXT: vmov.u8 r5, q6[13] -; CHECK-NEXT: vmov.16 q0[5], r5 -; CHECK-NEXT: vmov.u8 r5, q6[14] -; CHECK-NEXT: vmov.16 q0[6], r5 -; CHECK-NEXT: vmov.u8 r5, q6[15] -; CHECK-NEXT: vmov.16 q0[7], r5 -; CHECK-NEXT: vmov.u8 r2, q3[8] -; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d7 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.u8 r6, q6[8] +; CHECK-NEXT: vmov.16 q3[0], r6 +; CHECK-NEXT: vmov.u8 r6, q6[9] +; CHECK-NEXT: vmov.16 q3[1], r6 +; CHECK-NEXT: vmov.u8 r6, q6[10] +; CHECK-NEXT: vmov.16 q3[2], r6 +; CHECK-NEXT: vmov.u8 r6, q6[11] +; CHECK-NEXT: vmov.16 q3[3], r6 +; CHECK-NEXT: vmov.u8 r6, q6[12] +; CHECK-NEXT: vmov.16 q3[4], r6 +; CHECK-NEXT: vmov.u8 r6, q6[13] +; CHECK-NEXT: vmov.16 q3[5], r6 +; CHECK-NEXT: vmov.u8 r6, q6[14] +; CHECK-NEXT: vmov.16 q3[6], r6 +; CHECK-NEXT: vmov.u8 r6, q6[15] +; CHECK-NEXT: vmov.16 q3[7], r6 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vcmp.i16 ne, q3, zr ; CHECK-NEXT: vpsel q6, q7, q4 -; CHECK-NEXT: vmov.u16 r5, q6[2] -; CHECK-NEXT: vmov.u16 r4, q6[0] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q6[3] -; CHECK-NEXT: vmov.u16 r4, q6[1] -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q0, q7, q4 -; CHECK-NEXT: vmov r5, r4, d0 -; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 -; CHECK-NEXT: vmov.u8 r5, q1[9] -; CHECK-NEXT: vmov.u8 r4, q1[8] +; CHECK-NEXT: vmov.u16 r6, q6[2] +; CHECK-NEXT: vmov.u16 r5, q6[0] +; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 +; CHECK-NEXT: vmov.u16 r6, q6[3] +; CHECK-NEXT: vmov.u16 r5, q6[1] +; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vpsel q3, q7, q4 +; CHECK-NEXT: vmov r6, r5, d6 +; CHECK-NEXT: vmov q4[2], q4[0], r6, r5 +; CHECK-NEXT: vmov q4[3], q4[1], r6, r5 +; CHECK-NEXT: vmov.u8 r6, q1[9] +; CHECK-NEXT: vmov.u8 r5, q1[8] ; CHECK-NEXT: vcmp.i32 ne, q4, zr -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q3[9] -; CHECK-NEXT: vmov q7[2], q7[0], r2, r4 +; CHECK-NEXT: vmov q4[2], q4[0], r5, r6 +; CHECK-NEXT: vmov.u8 r5, q0[9] +; CHECK-NEXT: vmov q7[2], q7[0], r4, r5 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vand q7, q7, q2 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r2, s30 +; CHECK-NEXT: vmov r6, s18 +; CHECK-NEXT: vmov r5, s30 ; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r3, s28 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov r2, s28 +; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: umull r2, r4, r2, r4 +; CHECK-NEXT: vmov 
q4[2], q4[0], r2, r6 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[10] ; CHECK-NEXT: vpsel q4, q4, q5 -; CHECK-NEXT: vmov r2, r3, d8 -; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: vmov r2, r6, d8 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: vmov.u8 r2, q3[10] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: vmov.u8 r5, q1[11] -; CHECK-NEXT: vmov.u8 r4, q1[10] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q3[11] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d9 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r6, r5, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r6, r5 +; CHECK-NEXT: vmov q3[3], q3[1], r6, r5 +; CHECK-NEXT: vmov.u8 r6, q1[11] +; CHECK-NEXT: vmov.u8 r5, q1[10] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 +; CHECK-NEXT: vmov.u8 r5, q0[11] +; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r6, s14 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vpsel q0, q0, q5 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: umull r2, r4, r2, r4 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 +; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[12] +; CHECK-NEXT: vpsel q3, q3, q5 +; CHECK-NEXT: vmov r2, r6, d6 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.u16 r5, q6[6] -; CHECK-NEXT: vmov.u16 r4, q6[4] -; CHECK-NEXT: vmov.u8 r2, q3[12] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q6[7] -; CHECK-NEXT: vmov.u16 r4, q6[5] -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r5, r4, d0 -; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 -; CHECK-NEXT: vmov.u8 r5, q1[13] -; CHECK-NEXT: vmov.u8 r4, q1[12] +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d7 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.u16 r6, q6[6] +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov.u16 r5, q6[4] +; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 +; CHECK-NEXT: vmov.u16 r6, q6[7] +; CHECK-NEXT: vmov.u16 r5, q6[5] +; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q3, q3, q4 +; CHECK-NEXT: vmov r6, r5, d6 +; CHECK-NEXT: vmov q4[2], q4[0], r6, r5 +; CHECK-NEXT: vmov q4[3], q4[1], r6, r5 +; CHECK-NEXT: vmov.u8 r6, q1[13] +; CHECK-NEXT: vmov.u8 r5, q1[12] ; CHECK-NEXT: vcmp.i32 ne, q4, zr -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q3[13] -; CHECK-NEXT: vmov q6[2], q6[0], r2, r4 +; CHECK-NEXT: vmov q4[2], q4[0], r5, r6 +; 
CHECK-NEXT: vmov.u8 r5, q0[13] +; CHECK-NEXT: vmov q6[2], q6[0], r4, r5 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vand q6, q6, q2 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov r6, s18 +; CHECK-NEXT: vmov r5, s26 ; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: umull r2, r4, r2, r4 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r6 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[14] ; CHECK-NEXT: vpsel q4, q4, q5 -; CHECK-NEXT: vmov r2, r3, d8 -; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: vmov r2, r6, d8 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: vmov.u8 r2, q3[14] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: vmov.u8 r5, q1[15] -; CHECK-NEXT: vmov.u8 r4, q1[14] -; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d9 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r6, r5, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r6, r5 +; CHECK-NEXT: vmov q3[3], q3[1], r6, r5 +; CHECK-NEXT: vmov.u8 r6, q1[15] +; CHECK-NEXT: vmov.u8 r5, q1[14] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov q1[2], q1[0], r5, r6 +; CHECK-NEXT: vmov.u8 r5, q0[15] ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q3[15] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r6, s6 ; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: umull r2, r4, r2, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r6 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vpsel q0, q0, q5 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r2, r6, d0 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: adcs r3, r5 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -2615,8 +2613,8 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 @@ -2646,7 +2644,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q4[7], r2 ; CHECK-NEXT: vmov.s8 r5, q3[2] ; CHECK-NEXT: vcmp.i16 
ne, q4, zr -; CHECK-NEXT: smull r4, r5, r5, r4 +; CHECK-NEXT: smull r5, r4, r5, r4 ; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov.u16 r2, q6[2] ; CHECK-NEXT: vmov.u16 r3, q6[0] @@ -2670,188 +2668,188 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r6, r3, r2 ; CHECK-NEXT: vmov r2, r3, d15 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: adc.w r12, r12, lr ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q1[3] ; CHECK-NEXT: vmov.s8 r3, q3[3] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 +; CHECK-NEXT: vmov.s8 r4, q1[4] ; CHECK-NEXT: vpsel q0, q0, q4 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: vmov r6, r5, d1 ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.u16 r5, q6[6] -; CHECK-NEXT: vmov.u16 r4, q6[4] -; CHECK-NEXT: vmov.s8 r2, q1[4] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q6[7] -; CHECK-NEXT: vmov.u16 r4, q6[5] -; CHECK-NEXT: vmov.s8 r3, q3[4] -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.u16 r6, q6[6] +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov.u16 r5, q6[4] +; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 +; CHECK-NEXT: vmov.u16 r6, q6[7] +; CHECK-NEXT: vmov.u16 r5, q6[5] +; CHECK-NEXT: vmov.s8 r2, q3[4] +; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 +; CHECK-NEXT: smull r2, r4, r2, r4 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpsel q6, q2, q7 -; CHECK-NEXT: vmov r5, r4, d12 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: vmov.s8 r5, q1[5] -; CHECK-NEXT: vmov.s8 r4, q3[5] +; CHECK-NEXT: vmov r6, r5, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 +; CHECK-NEXT: vmov.s8 r6, q1[5] +; CHECK-NEXT: vmov.s8 r5, q3[5] ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: smull r6, r5, r5, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vmov.s8 r4, q1[6] ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r2, r6, d0 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov r5, r4, d13 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: vmov.s8 r2, q1[6] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: vmov.s8 r3, q3[6] -; CHECK-NEXT: vmov.s8 r5, q1[7] -; CHECK-NEXT: vmov.s8 r4, q3[7] -; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.s8 r2, q3[6] +; 
CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r6, r5, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r5 +; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 +; CHECK-NEXT: vmov.s8 r6, q1[7] +; CHECK-NEXT: vmov.s8 r5, q3[7] ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: smull r6, r5, r5, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vmov.s8 r4, q1[8] ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r2, r6, d0 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q5[8] -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.16 q6[0], r5 -; CHECK-NEXT: vmov.u8 r5, q5[9] -; CHECK-NEXT: vmov.16 q6[1], r5 -; CHECK-NEXT: vmov.u8 r5, q5[10] -; CHECK-NEXT: vmov.16 q6[2], r5 -; CHECK-NEXT: vmov.u8 r5, q5[11] -; CHECK-NEXT: vmov.16 q6[3], r5 -; CHECK-NEXT: vmov.u8 r5, q5[12] -; CHECK-NEXT: vmov.16 q6[4], r5 -; CHECK-NEXT: vmov.u8 r5, q5[13] -; CHECK-NEXT: vmov.16 q6[5], r5 -; CHECK-NEXT: vmov.u8 r5, q5[14] -; CHECK-NEXT: vmov.16 q6[6], r5 -; CHECK-NEXT: vmov.u8 r5, q5[15] -; CHECK-NEXT: vmov.16 q6[7], r5 -; CHECK-NEXT: vmov.s8 r2, q1[8] -; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vmov.s8 r3, q3[8] +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.u8 r6, q5[8] +; CHECK-NEXT: vmov.16 q0[0], r6 +; CHECK-NEXT: vmov.u8 r6, q5[9] +; CHECK-NEXT: vmov.16 q0[1], r6 +; CHECK-NEXT: vmov.u8 r6, q5[10] +; CHECK-NEXT: vmov.16 q0[2], r6 +; CHECK-NEXT: vmov.u8 r6, q5[11] +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.u8 r6, q5[12] +; CHECK-NEXT: vmov.16 q0[4], r6 +; CHECK-NEXT: vmov.u8 r6, q5[13] +; CHECK-NEXT: vmov.16 q0[5], r6 +; CHECK-NEXT: vmov.u8 r6, q5[14] +; CHECK-NEXT: vmov.16 q0[6], r6 +; CHECK-NEXT: vmov.u8 r6, q5[15] +; CHECK-NEXT: vmov.16 q0[7], r6 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vmov.s8 r2, q3[8] ; CHECK-NEXT: vpsel q5, q2, q7 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.u16 r5, q5[2] -; CHECK-NEXT: vmov.u16 r4, q5[0] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q5[3] -; CHECK-NEXT: vmov.u16 r4, q5[1] -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: vmov.u16 r6, q5[2] +; CHECK-NEXT: vmov.u16 r5, q5[0] +; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 +; CHECK-NEXT: vmov.u16 r6, q5[3] +; CHECK-NEXT: vmov.u16 r5, q5[1] +; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpsel q6, q2, q7 -; CHECK-NEXT: vmov r5, r4, d12 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: vmov.s8 r5, q1[9] -; CHECK-NEXT: vmov.s8 r4, q3[9] +; CHECK-NEXT: vmov r6, r5, d12 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 +; CHECK-NEXT: vmov.s8 r6, q1[9] +; CHECK-NEXT: vmov.s8 r5, q3[9] ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: smull r6, r5, r5, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vmov.s8 r4, q1[10] ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r2, r6, d0 ; 
CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov r5, r4, d13 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: vmov.s8 r2, q1[10] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: vmov.s8 r3, q3[10] -; CHECK-NEXT: vmov.s8 r5, q1[11] -; CHECK-NEXT: vmov.s8 r4, q3[11] -; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.s8 r2, q3[10] +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r6, r5, d13 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r5 +; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 +; CHECK-NEXT: vmov.s8 r6, q1[11] +; CHECK-NEXT: vmov.s8 r5, q3[11] ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: smull r6, r5, r5, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vmov.s8 r4, q1[12] ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r2, r6, d0 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.u16 r5, q5[6] -; CHECK-NEXT: vmov.u16 r4, q5[4] -; CHECK-NEXT: vmov.s8 r2, q1[12] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q5[7] -; CHECK-NEXT: vmov.u16 r4, q5[5] -; CHECK-NEXT: vmov.s8 r3, q3[12] -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.u16 r6, q5[6] +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov.u16 r5, q5[4] +; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 +; CHECK-NEXT: vmov.u16 r6, q5[7] +; CHECK-NEXT: vmov.u16 r5, q5[5] +; CHECK-NEXT: vmov.s8 r2, q3[12] +; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 +; CHECK-NEXT: smull r2, r4, r2, r4 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpsel q2, q2, q7 -; CHECK-NEXT: vmov r5, r4, d4 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: vmov.s8 r5, q1[13] -; CHECK-NEXT: vmov.s8 r4, q3[13] +; CHECK-NEXT: vmov r6, r5, d4 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 +; CHECK-NEXT: vmov.s8 r6, q1[13] +; CHECK-NEXT: vmov.s8 r5, q3[13] ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: smull r6, r5, r5, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vmov.s8 r4, q1[14] ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r2, r6, d0 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: vmov.s8 r2, q1[14] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: vmov.s8 r3, q3[14] -; CHECK-NEXT: vmov.s8 r5, q1[15] -; CHECK-NEXT: vmov.s8 r4, q3[15] -; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.s8 r2, q3[14] +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r6, r5, d5 +; CHECK-NEXT: vmov 
q0[2], q0[0], r6, r5 +; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 +; CHECK-NEXT: vmov.s8 r6, q1[15] +; CHECK-NEXT: vmov.s8 r5, q3[15] ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: smull r6, r5, r5, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r2, r6, d0 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: adcs r3, r5 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> @@ -2892,12 +2890,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -2941,12 +2939,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -2966,16 +2964,16 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: vmov r2, r12, d3 ; CHECK-NEXT: vmov r3, lr, d1 -; CHECK-NEXT: vmov r6, r9, d2 +; CHECK-NEXT: vmov r6, r8, d2 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: vmov r5, r11, d0 -; CHECK-NEXT: umull r10, r8, r3, r2 -; CHECK-NEXT: umull r4, r7, r5, r6 -; CHECK-NEXT: mla r3, r3, r12, r8 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r10 +; CHECK-NEXT: vmov r7, r9, d0 +; CHECK-NEXT: umull r4, r10, r3, r2 +; CHECK-NEXT: umull r5, r11, r7, r6 +; CHECK-NEXT: mla r3, r3, r12, r10 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 ; CHECK-NEXT: mla r2, lr, r2, r3 -; CHECK-NEXT: mla r3, r5, r9, r7 -; CHECK-NEXT: mla r3, r11, r6, r3 +; CHECK-NEXT: mla r3, r7, r8, r11 +; CHECK-NEXT: mla r3, r9, r6, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vmov r2, r3, d4 ; CHECK-NEXT: orrs r2, r3 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll index 97931d88d1a14c..6a7966f67b9c4e 100644 --- 
a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll @@ -166,11 +166,11 @@ define arm_aapcs_vfpcc i64 @mul_v2i64(<2 x i64> %x) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r1, r12, d1 -; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: vmov r1, lr, d1 +; CHECK-NEXT: vmov r3, r12, d0 ; CHECK-NEXT: umull r0, r2, r3, r1 -; CHECK-NEXT: mla r2, r3, r12, r2 -; CHECK-NEXT: mla r1, lr, r1, r2 +; CHECK-NEXT: mla r2, r3, lr, r2 +; CHECK-NEXT: mla r1, r12, r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x) @@ -182,14 +182,14 @@ define arm_aapcs_vfpcc i64 @mul_v4i64(<4 x i64> %x) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: vmov r1, r12, d1 -; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: vmov r1, lr, d1 +; CHECK-NEXT: vmov r3, r12, d0 ; CHECK-NEXT: vmov r5, r9, d2 ; CHECK-NEXT: vmov r6, r11, d3 ; CHECK-NEXT: umull r2, r8, r3, r1 -; CHECK-NEXT: mla r3, r3, r12, r8 +; CHECK-NEXT: mla r3, r3, lr, r8 ; CHECK-NEXT: umull r7, r10, r2, r5 -; CHECK-NEXT: mla r1, lr, r1, r3 +; CHECK-NEXT: mla r1, r12, r1, r3 ; CHECK-NEXT: mla r2, r2, r9, r10 ; CHECK-NEXT: umull r0, r4, r7, r6 ; CHECK-NEXT: mla r1, r1, r5, r2 @@ -412,33 +412,31 @@ define arm_aapcs_vfpcc i64 @mul_v4i64_acc(<4 x i64> %x, i64 %y) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #12 -; CHECK-NEXT: sub sp, #12 -; CHECK-NEXT: mov lr, r0 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: vmov r2, r0, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: vmov r6, r9, d2 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: vmov r7, r11, d3 -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: vmov r3, r0, d0 -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: umull r4, r8, r3, r2 -; CHECK-NEXT: mla r3, r3, r1, r8 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: umull r5, r10, r4, r6 -; CHECK-NEXT: mla r2, r1, r2, r3 ; CHECK-NEXT: mla r4, r4, r9, r10 ; CHECK-NEXT: umull r0, r12, r5, r7 +; CHECK-NEXT: mla r12, r5, r11, r12 +; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload +; CHECK-NEXT: mla r3, r3, r5, r8 +; CHECK-NEXT: mla r2, lr, r2, r3 ; CHECK-NEXT: mla r2, r2, r6, r4 -; CHECK-NEXT: mla r5, r5, r11, r12 -; CHECK-NEXT: mla r3, r2, r7, r5 -; CHECK-NEXT: umull r2, r7, lr, r0 -; CHECK-NEXT: mla r1, lr, r3, r7 -; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mla r3, r2, r7, r12 +; CHECK-NEXT: umull r2, r7, r1, r0 +; CHECK-NEXT: mla r1, r1, r3, r7 +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mla r1, r3, r0, r1 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: add sp, #12 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll index d8c3e4ae3ffaf0..8ee022d7cf1dd1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll @@ -95,21 +95,21 @@ define i32 @addv32i32i32(ptr %x) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, 
#16] -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: vaddv.u32 r0, q1 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #32] -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #48] -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #64] -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #80] -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #96] -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #112] -; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: vaddv.u32 r2, q1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #96] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #112] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <32 x i32>, ptr %x, align 4 @@ -855,21 +855,21 @@ define i32 @addv128i32i8(ptr %x) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: vaddv.u8 r0, q1 -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #32] -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #48] -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #64] -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #80] -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #96] -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #112] -; CHECK-NEXT: vaddva.u8 r0, q0 +; CHECK-NEXT: vaddv.u8 r2, q1 +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: vldrb.u8 q0, [r0, #32] +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: vldrb.u8 q0, [r0, #48] +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: vldrb.u8 q0, [r0, #64] +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: vldrb.u8 q0, [r0, #80] +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: vldrb.u8 q0, [r0, #96] +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: vldrb.u8 q0, [r0, #112] +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, ptr %x, align 1 @@ -1390,23 +1390,23 @@ define i32 @mlav24i32i32(ptr %x, ptr %y) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmlav.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #16] +; CHECK-NEXT: vmlav.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #32] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #48] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q1, [r1, #48] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #64] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vldrw.u32 q1, [r1, #64] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #80] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vldrw.u32 q1, [r1, #80] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: 
vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i32>, ptr %x, align 4 @@ -1465,53 +1465,53 @@ define i32 @mlav64i32i32(ptr %x, ptr %y) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmlav.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #16] +; CHECK-NEXT: vmlav.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #32] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #48] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q1, [r1, #48] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #64] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vldrw.u32 q1, [r1, #64] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #80] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vldrw.u32 q1, [r1, #80] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #96] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #96] ; CHECK-NEXT: vldrw.u32 q1, [r1, #96] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #112] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #112] ; CHECK-NEXT: vldrw.u32 q1, [r1, #112] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #128] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #128] ; CHECK-NEXT: vldrw.u32 q1, [r1, #128] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #144] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #144] ; CHECK-NEXT: vldrw.u32 q1, [r1, #144] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #160] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #160] ; CHECK-NEXT: vldrw.u32 q1, [r1, #160] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #176] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #176] ; CHECK-NEXT: vldrw.u32 q1, [r1, #176] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #192] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #192] ; CHECK-NEXT: vldrw.u32 q1, [r1, #192] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #208] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #208] ; CHECK-NEXT: vldrw.u32 q1, [r1, #208] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #224] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #224] ; CHECK-NEXT: vldrw.u32 q1, [r1, #224] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #240] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #240] ; CHECK-NEXT: vldrw.u32 q1, [r1, #240] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <4 x i32>, ptr %x, align 4 @@ -1631,101 +1631,101 @@ define i32 @mlav128i32i32(ptr %x, ptr %y) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: 
vmlav.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #16] +; CHECK-NEXT: vmlav.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #32] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #48] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q1, [r1, #48] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #64] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vldrw.u32 q1, [r1, #64] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #80] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vldrw.u32 q1, [r1, #80] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #96] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #96] ; CHECK-NEXT: vldrw.u32 q1, [r1, #96] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #112] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #112] ; CHECK-NEXT: vldrw.u32 q1, [r1, #112] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #128] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #128] ; CHECK-NEXT: vldrw.u32 q1, [r1, #128] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #144] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #144] ; CHECK-NEXT: vldrw.u32 q1, [r1, #144] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #160] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #160] ; CHECK-NEXT: vldrw.u32 q1, [r1, #160] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #176] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #176] ; CHECK-NEXT: vldrw.u32 q1, [r1, #176] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #192] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #192] ; CHECK-NEXT: vldrw.u32 q1, [r1, #192] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #208] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #208] ; CHECK-NEXT: vldrw.u32 q1, [r1, #208] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #224] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #224] ; CHECK-NEXT: vldrw.u32 q1, [r1, #224] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #240] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #240] ; CHECK-NEXT: vldrw.u32 q1, [r1, #240] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #256] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #256] ; CHECK-NEXT: vldrw.u32 q1, [r1, #256] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #272] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #272] ; CHECK-NEXT: vldrw.u32 q1, [r1, #272] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #288] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #288] ; CHECK-NEXT: vldrw.u32 q1, [r1, #288] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #304] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 
+; CHECK-NEXT: vldrw.u32 q0, [r0, #304] ; CHECK-NEXT: vldrw.u32 q1, [r1, #304] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #320] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #320] ; CHECK-NEXT: vldrw.u32 q1, [r1, #320] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #336] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #336] ; CHECK-NEXT: vldrw.u32 q1, [r1, #336] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #352] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #352] ; CHECK-NEXT: vldrw.u32 q1, [r1, #352] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #368] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #368] ; CHECK-NEXT: vldrw.u32 q1, [r1, #368] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #384] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #384] ; CHECK-NEXT: vldrw.u32 q1, [r1, #384] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #400] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #400] ; CHECK-NEXT: vldrw.u32 q1, [r1, #400] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #416] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #416] ; CHECK-NEXT: vldrw.u32 q1, [r1, #416] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #432] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #432] ; CHECK-NEXT: vldrw.u32 q1, [r1, #432] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #448] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #448] ; CHECK-NEXT: vldrw.u32 q1, [r1, #448] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #464] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #464] ; CHECK-NEXT: vldrw.u32 q1, [r1, #464] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #480] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #480] ; CHECK-NEXT: vldrw.u32 q1, [r1, #480] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r2, #496] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #496] ; CHECK-NEXT: vldrw.u32 q1, [r1, #496] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <4 x i32>, ptr %x, align 4 @@ -2045,20 +2045,20 @@ define i32 @mlav24i32i16(ptr %x, ptr %y) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmlav.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #16] +; CHECK-NEXT: vmlav.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #16] ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #24] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #24] ; CHECK-NEXT: vldrh.s32 q1, [r1, #24] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #32] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #32] ; CHECK-NEXT: vldrh.s32 q1, [r1, #32] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #40] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #40] ; CHECK-NEXT: vldrh.s32 q1, [r1, #40] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; 
CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i16>, ptr %x, align 2 @@ -2084,29 +2084,29 @@ define i32 @mlav32i32i16(ptr %x, ptr %y) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmlav.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #8] +; CHECK-NEXT: vmlav.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #8] ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #16] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #16] ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #24] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #24] ; CHECK-NEXT: vldrh.s32 q1, [r1, #24] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #32] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #32] ; CHECK-NEXT: vldrh.s32 q1, [r1, #32] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #40] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #40] ; CHECK-NEXT: vldrh.s32 q1, [r1, #40] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #48] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #48] ; CHECK-NEXT: vldrh.s32 q1, [r1, #48] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r2, #56] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #56] ; CHECK-NEXT: vldrh.s32 q1, [r1, #56] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <32 x i16>, ptr %x, align 2 @@ -2225,53 +2225,53 @@ define i32 @mlav128i32i16(ptr %x, ptr %y) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmlav.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #16] +; CHECK-NEXT: vmlav.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #16] ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #32] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #32] ; CHECK-NEXT: vldrh.u16 q1, [r1, #32] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #48] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #48] ; CHECK-NEXT: vldrh.u16 q1, [r1, #48] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #64] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #64] ; CHECK-NEXT: vldrh.u16 q1, [r1, #64] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #80] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #80] ; CHECK-NEXT: vldrh.u16 q1, [r1, #80] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #96] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #96] ; CHECK-NEXT: vldrh.u16 q1, [r1, #96] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #112] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #112] ; CHECK-NEXT: vldrh.u16 q1, [r1, #112] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #128] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #128] ; CHECK-NEXT: vldrh.u16 q1, [r1, #128] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; 
CHECK-NEXT: vldrh.u16 q0, [r2, #144] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #144] ; CHECK-NEXT: vldrh.u16 q1, [r1, #144] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #160] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #160] ; CHECK-NEXT: vldrh.u16 q1, [r1, #160] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #176] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #176] ; CHECK-NEXT: vldrh.u16 q1, [r1, #176] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #192] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #192] ; CHECK-NEXT: vldrh.u16 q1, [r1, #192] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #208] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #208] ; CHECK-NEXT: vldrh.u16 q1, [r1, #208] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #224] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #224] ; CHECK-NEXT: vldrh.u16 q1, [r1, #224] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r2, #240] +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #240] ; CHECK-NEXT: vldrh.u16 q1, [r1, #240] -; CHECK-NEXT: vmlava.s16 r0, q1, q0 +; CHECK-NEXT: vmlava.s16 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <8 x i16>, ptr %x, align 2 @@ -2531,29 +2531,29 @@ define i32 @mlav32i32i8(ptr %x, ptr %y) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmlav.u32 r0, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r2, #4] +; CHECK-NEXT: vmlav.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] ; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r2, #8] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #8] ; CHECK-NEXT: vldrb.u32 q1, [r1, #8] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r2, #12] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #12] ; CHECK-NEXT: vldrb.u32 q1, [r1, #12] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r2, #16] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #16] ; CHECK-NEXT: vldrb.u32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r2, #20] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #20] ; CHECK-NEXT: vldrb.u32 q1, [r1, #20] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r2, #24] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #24] ; CHECK-NEXT: vldrb.u32 q1, [r1, #24] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r2, #28] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #28] ; CHECK-NEXT: vldrb.u32 q1, [r1, #28] -; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <32 x i8>, ptr %x, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll index 43ef7994fcec93..939a2d29462c63 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -72,14 +72,14 @@ define ptr @vld2_v2i64(ptr %src, ptr %dst) { ; CHECK-NEXT: vmov r2, r12, d1 ; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0], 
#32 -; CHECK-NEXT: vmov r4, r7, d1 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: adds r7, r3, r2 ; CHECK-NEXT: vmov r3, r6, d0 -; CHECK-NEXT: adc.w r5, lr, r12 +; CHECK-NEXT: adc.w r2, lr, r12 ; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r7, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r5 +; CHECK-NEXT: adcs r6, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r7 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll index 633aef46bbffcc..c66d8e6c5c4a1d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -315,17 +315,17 @@ define void @vld2_v2i64(ptr %src, ptr %dst) { ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov r0, r4, d1 ; CHECK-NEXT: vmov r5, r6, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r12 ; CHECK-NEXT: adds r0, r0, r5 ; CHECK-NEXT: adcs r6, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: @@ -346,36 +346,36 @@ define void @vld2_v4i64(ptr %src, ptr %dst) { ; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vmov.f32 s17, s15 ; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vmov r5, r6, d6 +; CHECK-NEXT: vmov r3, r2, d1 ; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov r0, r4, d8 ; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f32 s16, s10 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vmov r0, r7, d8 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, r4, d7 -; CHECK-NEXT: adds r0, r0, r5 -; CHECK-NEXT: adc.w r8, r6, r7 -; CHECK-NEXT: vmov r6, r5, d1 -; CHECK-NEXT: vmov r2, r7, d0 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adc.w r6, r5, r4 -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r8, r6 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w r8, r5, r0 +; CHECK-NEXT: vmov r7, r0, d0 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: vmov r5, r6, d1 ; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: adc.w r0, r7, r4 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r5, r6, d2 +; CHECK-NEXT: vmov q1[2], q1[0], r8, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r4, r3 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: adds r5, r5, r7 +; CHECK-NEXT: vmov q0[2], q0[0], r5, lr +; CHECK-NEXT: adcs r0, r6 ; 
CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8} @@ -536,16 +536,16 @@ define void @vld2_v4f16(ptr %src, ptr %dst) { ; CHECK-LABEL: vld2_v4f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vmovx.f16 s6, s3 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s9, s2 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vins.f16 s2, s3 ; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vins.f16 s9, s4 ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index ccdc996d75970e..6d40017779e499 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,CHECK-LV -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,CHECK-LIS +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s ; i32 @@ -83,28 +83,28 @@ define void @vld3_v8i32(ptr %src, ptr %dst) { ; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vadd.i32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s13, s4 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s12, s9 +; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s20, s8 ; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vadd.i32 q3, q5, q3 ; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vadd.i32 q1, q4, q1 +; CHECK-NEXT: vmov.f32 s6, s16 +; CHECK-NEXT: vmov.f32 s7, s19 +; CHECK-NEXT: vadd.i32 q1, q3, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -136,29 +136,29 @@ define void @vld3_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, 
[r0, #32] ; CHECK-NEXT: vadd.i32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s12, s9 +; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s20, s8 ; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vadd.i32 q3, q5, q3 ; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s7, s19 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vmov.f32 s6, s16 +; CHECK-NEXT: vadd.i32 q1, q3, q1 ; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.i32 q1, q4, q1 ; CHECK-NEXT: vmov.f32 s18, s10 ; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s22, s11 @@ -167,30 +167,30 @@ define void @vld3_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vmov.f32 s23, s26 ; CHECK-NEXT: vmov.f32 s19, s25 -; CHECK-NEXT: vadd.i32 q4, q4, q5 ; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vadd.i32 q4, q4, q5 ; CHECK-NEXT: vmov.f32 s10, s24 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vmov.f32 s11, s27 -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-NEXT: vldrw.u32 q6, [r0, #128] ; CHECK-NEXT: vadd.i32 q2, q4, q2 ; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vmov.f32 s21, s12 ; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vmov.f32 s22, s15 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s30, s14 ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s20, s17 +; CHECK-NEXT: vmov.f32 s23, s26 ; CHECK-NEXT: vmov.f32 s28, s16 ; CHECK-NEXT: vmov.f32 s29, s19 -; CHECK-NEXT: vmov.f32 s31, s21 -; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s31, s25 +; CHECK-NEXT: vadd.i32 q5, q7, q5 ; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vadd.i32 q3, q6, q3 +; CHECK-NEXT: vmov.f32 s14, s24 +; CHECK-NEXT: vmov.f32 s15, s27 +; CHECK-NEXT: vadd.i32 q3, q5, q3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -256,21 +256,21 @@ define void @vld3_v4i16(ptr %src, ptr %dst) { ; CHECK-NEXT: vldrh.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.u16 r5, q0[6] ; CHECK-NEXT: vmov.u16 r6, q0[0] -; CHECK-NEXT: vmov r0, r3, d2 -; CHECK-NEXT: vmov.u16 lr, q0[2] -; CHECK-NEXT: vmov r2, r4, d3 +; CHECK-NEXT: vmov r0, r2, d2 +; CHECK-NEXT: vmov.u16 r12, q0[2] +; CHECK-NEXT: vmov r3, r4, d3 ; CHECK-NEXT: vmov q1[2], q1[0], r6, r5 ; CHECK-NEXT: vmov.u16 r5, q0[7] ; CHECK-NEXT: vmov.u16 r6, q0[1] ; CHECK-NEXT: vmov q2[2], q2[0], r6, r5 ; CHECK-NEXT: vmov.u16 r5, q0[3] ; CHECK-NEXT: vmov.u16 r6, q0[4] -; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r6, r2 -; CHECK-NEXT: vmov.u16 r12, q0[5] +; CHECK-NEXT: vmov q1[3], q1[1], r5, r2 +; CHECK-NEXT: vmov q2[3], q2[1], r6, r3 +; CHECK-NEXT: vmov.u16 lr, q0[5] ; CHECK-NEXT: vadd.i32 q0, q1, q2 -; CHECK-NEXT: vmov q1[2], q1[0], lr, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 +; CHECK-NEXT: vmov q1[2], q1[0], r12, r0 +; CHECK-NEXT: vmov q1[3], q1[1], lr, r4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ 
-341,167 +341,86 @@ entry: } define void @vld3_v16i16(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v16i16: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-LV-NEXT: vmovx.f16 s6, s2 -; CHECK-LV-NEXT: vmov.f32 s4, s1 -; CHECK-LV-NEXT: vins.f16 s4, s6 -; CHECK-LV-NEXT: vmovx.f16 s6, s9 -; CHECK-LV-NEXT: vmov.f32 s5, s8 -; CHECK-LV-NEXT: vmovx.f16 s7, s12 -; CHECK-LV-NEXT: vins.f16 s5, s6 -; CHECK-LV-NEXT: vmov.f32 s6, s11 -; CHECK-LV-NEXT: vins.f16 s6, s7 -; CHECK-LV-NEXT: vmovx.f16 s16, s15 -; CHECK-LV-NEXT: vmov.f32 s7, s14 -; CHECK-LV-NEXT: vmovx.f16 s17, s3 -; CHECK-LV-NEXT: vins.f16 s7, s16 -; CHECK-LV-NEXT: vmovx.f16 s16, s0 -; CHECK-LV-NEXT: vins.f16 s16, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s1 -; CHECK-LV-NEXT: vins.f16 s0, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s8 -; CHECK-LV-NEXT: vins.f16 s3, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s11 -; CHECK-LV-NEXT: vmovx.f16 s8, s14 -; CHECK-LV-NEXT: vmovx.f16 s18, s10 -; CHECK-LV-NEXT: vmovx.f16 s19, s13 -; CHECK-LV-NEXT: vins.f16 s10, s2 -; CHECK-LV-NEXT: vins.f16 s13, s8 -; CHECK-LV-NEXT: vmov.f32 s1, s3 -; CHECK-LV-NEXT: vins.f16 s18, s12 -; CHECK-LV-NEXT: vins.f16 s19, s15 -; CHECK-LV-NEXT: vmov.f32 s3, s13 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-LV-NEXT: vins.f16 s17, s9 -; CHECK-LV-NEXT: vmov.f32 s2, s10 -; CHECK-LV-NEXT: vadd.i16 q0, q0, q4 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-LV-NEXT: vadd.i16 q0, q0, q1 -; CHECK-LV-NEXT: vmovx.f16 s6, s14 -; CHECK-LV-NEXT: vldrw.u32 q4, [r0] -; CHECK-LV-NEXT: vins.f16 s6, s8 -; CHECK-LV-NEXT: vmov.f32 s22, s15 -; CHECK-LV-NEXT: vmovx.f16 s8, s8 -; CHECK-LV-NEXT: vins.f16 s22, s8 -; CHECK-LV-NEXT: vmovx.f16 s8, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s10 -; CHECK-LV-NEXT: vmovx.f16 s4, s16 -; CHECK-LV-NEXT: vins.f16 s23, s8 -; CHECK-LV-NEXT: vmovx.f16 s8, s17 -; CHECK-LV-NEXT: vins.f16 s16, s8 -; CHECK-LV-NEXT: vmovx.f16 s8, s12 -; CHECK-LV-NEXT: vmovx.f16 s5, s19 -; CHECK-LV-NEXT: vins.f16 s19, s8 -; CHECK-LV-NEXT: vmovx.f16 s8, s15 -; CHECK-LV-NEXT: vmovx.f16 s7, s9 -; CHECK-LV-NEXT: vins.f16 s14, s8 -; CHECK-LV-NEXT: vmovx.f16 s8, s10 -; CHECK-LV-NEXT: vins.f16 s4, s18 -; CHECK-LV-NEXT: vmov.f32 s20, s17 -; CHECK-LV-NEXT: vmovx.f16 s18, s18 -; CHECK-LV-NEXT: vins.f16 s9, s8 -; CHECK-LV-NEXT: vins.f16 s5, s13 -; CHECK-LV-NEXT: vins.f16 s20, s18 -; CHECK-LV-NEXT: vmov.f32 s17, s19 -; CHECK-LV-NEXT: vins.f16 s7, s11 -; CHECK-LV-NEXT: vmovx.f16 s13, s13 -; CHECK-LV-NEXT: vmov.f32 s21, s12 -; CHECK-LV-NEXT: vmov.f32 s18, s14 -; CHECK-LV-NEXT: vins.f16 s21, s13 -; CHECK-LV-NEXT: vmov.f32 s19, s9 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vadd.i16 q1, q4, q1 -; CHECK-LV-NEXT: vadd.i16 q1, q1, q5 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v16i16: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-LIS-NEXT: vmovx.f16 s6, s2 -; CHECK-LIS-NEXT: vmov.f32 s4, s1 -; CHECK-LIS-NEXT: vins.f16 s4, s6 -; CHECK-LIS-NEXT: vmovx.f16 s6, s9 -; CHECK-LIS-NEXT: vmov.f32 s5, s8 -; CHECK-LIS-NEXT: vmovx.f16 s7, s12 -; CHECK-LIS-NEXT: vins.f16 s5, s6 -; CHECK-LIS-NEXT: vmov.f32 s6, s11 -; CHECK-LIS-NEXT: 
vins.f16 s6, s7 -; CHECK-LIS-NEXT: vmovx.f16 s16, s15 -; CHECK-LIS-NEXT: vmov.f32 s7, s14 -; CHECK-LIS-NEXT: vmovx.f16 s17, s3 -; CHECK-LIS-NEXT: vins.f16 s7, s16 -; CHECK-LIS-NEXT: vmovx.f16 s16, s0 -; CHECK-LIS-NEXT: vins.f16 s16, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s1 -; CHECK-LIS-NEXT: vins.f16 s0, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s8 -; CHECK-LIS-NEXT: vins.f16 s3, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s11 -; CHECK-LIS-NEXT: vmovx.f16 s8, s14 -; CHECK-LIS-NEXT: vmovx.f16 s18, s10 -; CHECK-LIS-NEXT: vmovx.f16 s19, s13 -; CHECK-LIS-NEXT: vins.f16 s10, s2 -; CHECK-LIS-NEXT: vins.f16 s13, s8 -; CHECK-LIS-NEXT: vmov.f32 s1, s3 -; CHECK-LIS-NEXT: vins.f16 s18, s12 -; CHECK-LIS-NEXT: vins.f16 s19, s15 -; CHECK-LIS-NEXT: vmov.f32 s3, s13 -; CHECK-LIS-NEXT: vins.f16 s17, s9 -; CHECK-LIS-NEXT: vmov.f32 s2, s10 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-LIS-NEXT: vadd.i16 q0, q0, q4 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-LIS-NEXT: vadd.i16 q0, q0, q1 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0] -; CHECK-LIS-NEXT: vmovx.f16 s6, s18 -; CHECK-LIS-NEXT: vmov.f32 s22, s19 -; CHECK-LIS-NEXT: vins.f16 s6, s8 -; CHECK-LIS-NEXT: vmovx.f16 s8, s8 -; CHECK-LIS-NEXT: vins.f16 s22, s8 -; CHECK-LIS-NEXT: vmovx.f16 s8, s11 -; CHECK-LIS-NEXT: vmov.f32 s23, s10 -; CHECK-LIS-NEXT: vmovx.f16 s4, s12 -; CHECK-LIS-NEXT: vins.f16 s23, s8 -; CHECK-LIS-NEXT: vmovx.f16 s8, s13 -; CHECK-LIS-NEXT: vins.f16 s12, s8 -; CHECK-LIS-NEXT: vmovx.f16 s8, s16 -; CHECK-LIS-NEXT: vmovx.f16 s5, s15 -; CHECK-LIS-NEXT: vins.f16 s15, s8 -; CHECK-LIS-NEXT: vmovx.f16 s8, s19 -; CHECK-LIS-NEXT: vins.f16 s4, s14 -; CHECK-LIS-NEXT: vmov.f32 s20, s13 -; CHECK-LIS-NEXT: vmovx.f16 s14, s14 -; CHECK-LIS-NEXT: vins.f16 s18, s8 -; CHECK-LIS-NEXT: vmovx.f16 s8, s10 -; CHECK-LIS-NEXT: vmovx.f16 s7, s9 -; CHECK-LIS-NEXT: vins.f16 s20, s14 -; CHECK-LIS-NEXT: vmovx.f16 s14, s17 -; CHECK-LIS-NEXT: vmov.f32 s21, s16 -; CHECK-LIS-NEXT: vins.f16 s9, s8 -; CHECK-LIS-NEXT: vins.f16 s21, s14 -; CHECK-LIS-NEXT: vmov.f32 s13, s15 -; CHECK-LIS-NEXT: vins.f16 s7, s11 -; CHECK-LIS-NEXT: vins.f16 s5, s17 -; CHECK-LIS-NEXT: vmov.f32 s14, s18 -; CHECK-LIS-NEXT: vmov.f32 s15, s9 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vadd.i16 q1, q3, q1 -; CHECK-LIS-NEXT: vadd.i16 q1, q1, q5 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v16i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmovx.f16 s7, s12 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vmovx.f16 s16, s15 +; CHECK-NEXT: vmov.f32 s7, s14 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vins.f16 s7, s16 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vins.f16 s16, s2 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vins.f16 s3, s2 +; CHECK-NEXT: vmovx.f16 s2, s11 +; CHECK-NEXT: vmovx.f16 s8, s14 +; CHECK-NEXT: vmovx.f16 s18, s10 +; CHECK-NEXT: vmovx.f16 s19, s13 +; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vins.f16 s13, s8 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vins.f16 s18, s12 +; CHECK-NEXT: vins.f16 s19, s15 +; CHECK-NEXT: 
vmov.f32 s3, s13 +; CHECK-NEXT: vins.f16 s17, s9 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vadd.i16 q0, q0, q4 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmovx.f16 s10, s14 +; CHECK-NEXT: vmov.f32 s22, s15 +; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s22, s4 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vmov.f32 s23, s6 +; CHECK-NEXT: vmovx.f16 s8, s16 +; CHECK-NEXT: vins.f16 s23, s4 +; CHECK-NEXT: vmovx.f16 s4, s17 +; CHECK-NEXT: vins.f16 s16, s4 +; CHECK-NEXT: vmovx.f16 s4, s12 +; CHECK-NEXT: vmovx.f16 s9, s19 +; CHECK-NEXT: vins.f16 s19, s4 +; CHECK-NEXT: vmovx.f16 s4, s15 +; CHECK-NEXT: vmovx.f16 s11, s5 +; CHECK-NEXT: vins.f16 s14, s4 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vins.f16 s8, s18 +; CHECK-NEXT: vmov.f32 s20, s17 +; CHECK-NEXT: vmovx.f16 s18, s18 +; CHECK-NEXT: vins.f16 s5, s4 +; CHECK-NEXT: vins.f16 s9, s13 +; CHECK-NEXT: vins.f16 s20, s18 +; CHECK-NEXT: vmov.f32 s17, s19 +; CHECK-NEXT: vins.f16 s11, s7 +; CHECK-NEXT: vmovx.f16 s13, s13 +; CHECK-NEXT: vmov.f32 s21, s12 +; CHECK-NEXT: vmov.f32 s18, s14 +; CHECK-NEXT: vins.f16 s21, s13 +; CHECK-NEXT: vmov.f32 s19, s5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vadd.i16 q1, q4, q2 +; CHECK-NEXT: vadd.i16 q1, q1, q5 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr entry: %l1 = load <48 x i16>, ptr %src, align 4 %s1 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> @@ -796,23 +715,23 @@ define void @vld3_v2i64(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r0, r3, d5 -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: vmov r5, r8, d6 -; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov r0, r3, d7 +; CHECK-NEXT: vmov r2, r4, d1 +; CHECK-NEXT: vmov r6, r7, d2 +; CHECK-NEXT: vmov r5, r8, d4 +; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: adds.w r0, r0, lr ; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r2, r3, r4 -; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: vmov r3, r4, d6 ; CHECK-NEXT: adds r6, r6, r5 ; CHECK-NEXT: adc.w r7, r7, r8 ; CHECK-NEXT: adds r3, r3, r6 @@ -833,119 +752,65 @@ entry: } define void @vld3_v4i64(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v4i64: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0] -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-LV-NEXT: vmov.f32 s4, s2 -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-LV-NEXT: vmov.f32 s5, s3 -; CHECK-LV-NEXT: vmov.f32 s2, s12 -; CHECK-LV-NEXT: vmov.f32 s3, s13 -; CHECK-LV-NEXT: vmov r5, r4, d5 -; 
CHECK-LV-NEXT: vmov r3, r8, d7 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-LV-NEXT: vmov.f32 s24, s22 -; CHECK-LV-NEXT: vmov.f32 s25, s23 -; CHECK-LV-NEXT: vmov lr, r12, d1 -; CHECK-LV-NEXT: vmov.f32 s2, s12 -; CHECK-LV-NEXT: vmov.f32 s3, s13 -; CHECK-LV-NEXT: vmov r6, r7, d12 -; CHECK-LV-NEXT: adds.w r0, r5, lr -; CHECK-LV-NEXT: adc.w r5, r4, r12 -; CHECK-LV-NEXT: adds.w lr, r0, r3 -; CHECK-LV-NEXT: vmov r4, r2, d10 -; CHECK-LV-NEXT: adc.w r12, r5, r8 -; CHECK-LV-NEXT: vmov r5, r0, d8 -; CHECK-LV-NEXT: adds r6, r6, r4 -; CHECK-LV-NEXT: adcs r2, r7 -; CHECK-LV-NEXT: adds r6, r6, r5 -; CHECK-LV-NEXT: adc.w r8, r2, r0 -; CHECK-LV-NEXT: vmov r7, r4, d1 -; CHECK-LV-NEXT: vmov r2, r5, d9 -; CHECK-LV-NEXT: vmov r3, r0, d0 -; CHECK-LV-NEXT: adds r2, r2, r7 -; CHECK-LV-NEXT: adc.w r7, r5, r4 -; CHECK-LV-NEXT: vmov r5, r4, d7 -; CHECK-LV-NEXT: adds r2, r2, r5 -; CHECK-LV-NEXT: adcs r7, r4 -; CHECK-LV-NEXT: vmov r5, r4, d2 -; CHECK-LV-NEXT: vmov q1[2], q1[0], r6, r2 -; CHECK-LV-NEXT: vmov q1[3], q1[1], r8, r7 -; CHECK-LV-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-LV-NEXT: adds r3, r3, r5 -; CHECK-LV-NEXT: adcs r0, r4 -; CHECK-LV-NEXT: vmov r4, r5, d4 -; CHECK-LV-NEXT: adds r3, r3, r4 -; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, lr -; CHECK-LV-NEXT: adcs r0, r5 -; CHECK-LV-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-LV-NEXT: vstrw.32 q0, [r1] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12} -; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc} -; -; CHECK-LIS-LABEL: vld3_v4i64: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-LIS-NEXT: vmov.f32 s4, s2 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-LIS-NEXT: vmov.f32 s5, s3 -; CHECK-LIS-NEXT: vmov.f32 s2, s12 -; CHECK-LIS-NEXT: vmov.f32 s3, s13 -; CHECK-LIS-NEXT: vmov r5, r4, d5 -; CHECK-LIS-NEXT: vmov r3, r8, d7 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-LIS-NEXT: vmov.f32 s24, s22 -; CHECK-LIS-NEXT: vmov.f32 s25, s23 -; CHECK-LIS-NEXT: vmov lr, r12, d1 -; CHECK-LIS-NEXT: vmov.f32 s2, s12 -; CHECK-LIS-NEXT: vmov.f32 s3, s13 -; CHECK-LIS-NEXT: vmov r7, r6, d12 -; CHECK-LIS-NEXT: adds.w r0, r5, lr -; CHECK-LIS-NEXT: adc.w r5, r4, r12 -; CHECK-LIS-NEXT: adds.w lr, r0, r3 -; CHECK-LIS-NEXT: vmov r4, r2, d10 -; CHECK-LIS-NEXT: adc.w r12, r5, r8 -; CHECK-LIS-NEXT: vmov r5, r0, d8 -; CHECK-LIS-NEXT: adds r7, r7, r4 -; CHECK-LIS-NEXT: adcs r2, r6 -; CHECK-LIS-NEXT: adds r7, r7, r5 -; CHECK-LIS-NEXT: adc.w r8, r2, r0 -; CHECK-LIS-NEXT: vmov r6, r4, d1 -; CHECK-LIS-NEXT: vmov r2, r5, d9 -; CHECK-LIS-NEXT: vmov r3, r0, d0 -; CHECK-LIS-NEXT: adds r2, r2, r6 -; CHECK-LIS-NEXT: adc.w r6, r5, r4 -; CHECK-LIS-NEXT: vmov r5, r4, d7 -; CHECK-LIS-NEXT: adds r2, r2, r5 -; CHECK-LIS-NEXT: adcs r6, r4 -; CHECK-LIS-NEXT: vmov r5, r4, d2 -; CHECK-LIS-NEXT: vmov q1[2], q1[0], r7, r2 -; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r6 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-LIS-NEXT: adds r3, r3, r5 -; CHECK-LIS-NEXT: adcs r0, r4 -; CHECK-LIS-NEXT: vmov r4, r5, d4 -; CHECK-LIS-NEXT: adds r3, r3, r4 -; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, lr -; CHECK-LIS-NEXT: adcs r0, r5 -; CHECK-LIS-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12} -; 
CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-LABEL: vld3_v4i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .vsave {d12, d13} +; CHECK-NEXT: vpush {d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov r3, r2, d5 +; CHECK-NEXT: vmov r4, r8, d7 +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmov.f32 s20, s18 +; CHECK-NEXT: vmov.f32 s21, s19 +; CHECK-NEXT: vmov r5, r7, d8 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov r0, r6, d10 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds.w lr, r3, r4 +; CHECK-NEXT: adc.w r12, r2, r8 +; CHECK-NEXT: vmov r4, r2, d12 +; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: adcs r7, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds.w r8, r0, r4 +; CHECK-NEXT: vmov r3, r0, d0 +; CHECK-NEXT: adcs r2, r7 +; CHECK-NEXT: vmov r7, r4, d13 +; CHECK-NEXT: adds r7, r7, r6 +; CHECK-NEXT: adc.w r6, r4, r5 +; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: adds r7, r7, r5 +; CHECK-NEXT: adcs r6, r4 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vmov q1[2], q1[0], r8, r7 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r6 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r0, r4 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-NEXT: adcs r0, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vpop {d8, d9, d10} +; CHECK-NEXT: vpop {d12, d13} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <12 x i64>, ptr %src, align 4 %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> @@ -1035,28 +900,28 @@ define void @vld3_v8f32(ptr %src, ptr %dst) { ; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s13, s4 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s12, s9 +; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s20, s8 ; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vadd.f32 q3, q5, q3 ; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vadd.f32 q1, q4, q1 +; CHECK-NEXT: vmov.f32 s6, s16 +; CHECK-NEXT: vmov.f32 s7, s19 +; CHECK-NEXT: vadd.f32 q1, q3, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -1088,29 +953,29 @@ define void @vld3_v16f32(ptr %src, ptr 
%dst) { ; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s12, s9 +; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s20, s8 ; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vadd.f32 q3, q5, q3 ; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s7, s19 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vmov.f32 s6, s16 +; CHECK-NEXT: vadd.f32 q1, q3, q1 ; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.f32 q1, q4, q1 ; CHECK-NEXT: vmov.f32 s18, s10 ; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s22, s11 @@ -1119,30 +984,30 @@ define void @vld3_v16f32(ptr %src, ptr %dst) { ; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vmov.f32 s23, s26 ; CHECK-NEXT: vmov.f32 s19, s25 -; CHECK-NEXT: vadd.f32 q4, q4, q5 ; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vadd.f32 q4, q4, q5 ; CHECK-NEXT: vmov.f32 s10, s24 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vmov.f32 s11, s27 -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-NEXT: vldrw.u32 q6, [r0, #128] ; CHECK-NEXT: vadd.f32 q2, q4, q2 ; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vmov.f32 s21, s12 ; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vmov.f32 s22, s15 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s30, s14 ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s20, s17 +; CHECK-NEXT: vmov.f32 s23, s26 ; CHECK-NEXT: vmov.f32 s28, s16 ; CHECK-NEXT: vmov.f32 s29, s19 -; CHECK-NEXT: vmov.f32 s31, s21 -; CHECK-NEXT: vadd.f32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s31, s25 +; CHECK-NEXT: vadd.f32 q5, q7, q5 ; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vadd.f32 q3, q6, q3 +; CHECK-NEXT: vmov.f32 s14, s24 +; CHECK-NEXT: vmov.f32 s15, s27 +; CHECK-NEXT: vadd.f32 q3, q5, q3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -1193,26 +1058,26 @@ define void @vld3_v4f16(ptr %src, ptr %dst) { ; CHECK-LABEL: vld3_v4f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrd r2, r3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmovx.f16 s13, s7 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vins.f16 s1, s0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s12, s6 -; CHECK-NEXT: vins.f16 s13, s9 -; CHECK-NEXT: vmov.f32 s5, s7 -; 
CHECK-NEXT: vadd.f16 q1, q1, q3 -; CHECK-NEXT: vadd.f16 q0, q1, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmovx.f16 s13, s11 +; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s5, s2 +; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s2, s10 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s9, s2 +; CHECK-NEXT: vins.f16 s11, s0 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vins.f16 s12, s10 +; CHECK-NEXT: vins.f16 s13, s1 +; CHECK-NEXT: vmov.f32 s9, s11 +; CHECK-NEXT: vadd.f16 q0, q2, q3 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] ; CHECK-NEXT: bx lr @@ -1228,93 +1093,49 @@ entry: } define void @vld3_v8f16(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v8f16: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9} -; CHECK-LV-NEXT: vpush {d8, d9} -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-LV-NEXT: vldrw.u32 q0, [r0] -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vmov.f32 s5, s8 -; CHECK-LV-NEXT: vmovx.f16 s8, s8 -; CHECK-LV-NEXT: vmovx.f16 s17, s3 -; CHECK-LV-NEXT: vins.f16 s3, s8 -; CHECK-LV-NEXT: vmovx.f16 s8, s11 -; CHECK-LV-NEXT: vmovx.f16 s18, s10 -; CHECK-LV-NEXT: vmovx.f16 s16, s0 -; CHECK-LV-NEXT: vins.f16 s10, s8 -; CHECK-LV-NEXT: vmovx.f16 s6, s2 -; CHECK-LV-NEXT: vmov.f32 s4, s1 -; CHECK-LV-NEXT: vmovx.f16 s8, s14 -; CHECK-LV-NEXT: vmovx.f16 s19, s13 -; CHECK-LV-NEXT: vins.f16 s4, s6 -; CHECK-LV-NEXT: vmovx.f16 s6, s9 -; CHECK-LV-NEXT: vins.f16 s16, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s15 -; CHECK-LV-NEXT: vmovx.f16 s7, s12 -; CHECK-LV-NEXT: vins.f16 s18, s12 -; CHECK-LV-NEXT: vmovx.f16 s12, s1 -; CHECK-LV-NEXT: vins.f16 s13, s8 -; CHECK-LV-NEXT: vins.f16 s5, s6 -; CHECK-LV-NEXT: vmov.f32 s6, s11 -; CHECK-LV-NEXT: vins.f16 s14, s2 -; CHECK-LV-NEXT: vmov.f32 s1, s3 -; CHECK-LV-NEXT: vins.f16 s19, s15 -; CHECK-LV-NEXT: vins.f16 s17, s9 -; CHECK-LV-NEXT: vins.f16 s0, s12 -; CHECK-LV-NEXT: vmov.f32 s2, s10 -; CHECK-LV-NEXT: vmov.f32 s3, s13 -; CHECK-LV-NEXT: vins.f16 s6, s7 -; CHECK-LV-NEXT: vmov.f32 s7, s14 -; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 -; CHECK-LV-NEXT: vadd.f16 q0, q0, q1 -; CHECK-LV-NEXT: vstrw.32 q0, [r1] -; CHECK-LV-NEXT: vpop {d8, d9} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v8f16: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9} -; CHECK-LIS-NEXT: vpush {d8, d9} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-LIS-NEXT: vmov.f32 s4, s1 -; CHECK-LIS-NEXT: vmovx.f16 s6, s2 -; CHECK-LIS-NEXT: vins.f16 s4, s6 -; CHECK-LIS-NEXT: vmov.f32 s5, s8 -; CHECK-LIS-NEXT: vmovx.f16 s6, s9 -; CHECK-LIS-NEXT: vmovx.f16 s8, s8 -; CHECK-LIS-NEXT: vmovx.f16 s13, s3 -; CHECK-LIS-NEXT: vins.f16 s5, s6 -; CHECK-LIS-NEXT: vins.f16 s3, s8 -; CHECK-LIS-NEXT: vmov.f32 s6, s11 -; CHECK-LIS-NEXT: vmovx.f16 s12, s16 -; CHECK-LIS-NEXT: vmovx.f16 s8, s11 -; CHECK-LIS-NEXT: vmovx.f16 s14, s10 -; CHECK-LIS-NEXT: vins.f16 s6, s12 -; CHECK-LIS-NEXT: vmovx.f16 s12, s0 -; CHECK-LIS-NEXT: vins.f16 s10, s8 -; CHECK-LIS-NEXT: vmovx.f16 s8, s18 -; CHECK-LIS-NEXT: vmovx.f16 s15, s17 -; CHECK-LIS-NEXT: vins.f16 s12, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s19 -; CHECK-LIS-NEXT: vmovx.f16 s1, s1 -; CHECK-LIS-NEXT: vins.f16 s17, s8 -; CHECK-LIS-NEXT: vins.f16 s18, s2 -; CHECK-LIS-NEXT: vins.f16 s0, s1 -; CHECK-LIS-NEXT: vmov.f32 s1, s3 -; 
CHECK-LIS-NEXT: vins.f16 s14, s16 -; CHECK-LIS-NEXT: vins.f16 s15, s19 -; CHECK-LIS-NEXT: vins.f16 s13, s9 -; CHECK-LIS-NEXT: vmov.f32 s2, s10 -; CHECK-LIS-NEXT: vmov.f32 s3, s17 -; CHECK-LIS-NEXT: vmov.f32 s7, s18 -; CHECK-LIS-NEXT: vadd.f16 q0, q0, q3 -; CHECK-LIS-NEXT: vadd.f16 q0, q0, q1 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmovx.f16 s8, s8 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmovx.f16 s18, s10 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vins.f16 s10, s8 +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vmovx.f16 s8, s14 +; CHECK-NEXT: vmovx.f16 s19, s13 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vins.f16 s16, s2 +; CHECK-NEXT: vmovx.f16 s2, s15 +; CHECK-NEXT: vmovx.f16 s7, s12 +; CHECK-NEXT: vins.f16 s18, s12 +; CHECK-NEXT: vmovx.f16 s12, s1 +; CHECK-NEXT: vins.f16 s13, s8 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vins.f16 s14, s2 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vins.f16 s19, s15 +; CHECK-NEXT: vins.f16 s17, s9 +; CHECK-NEXT: vins.f16 s0, s12 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vmov.f32 s7, s14 +; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr entry: %l1 = load <24 x half>, ptr %src, align 4 %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> @@ -1327,167 +1148,86 @@ entry: } define void @vld3_v16f16(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v16f16: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9} -; CHECK-LV-NEXT: vpush {d8, d9} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-LV-NEXT: vmovx.f16 s6, s2 -; CHECK-LV-NEXT: vmov.f32 s4, s1 -; CHECK-LV-NEXT: vins.f16 s4, s6 -; CHECK-LV-NEXT: vmovx.f16 s6, s9 -; CHECK-LV-NEXT: vmov.f32 s5, s8 -; CHECK-LV-NEXT: vmovx.f16 s7, s12 -; CHECK-LV-NEXT: vins.f16 s5, s6 -; CHECK-LV-NEXT: vmov.f32 s6, s11 -; CHECK-LV-NEXT: vins.f16 s6, s7 -; CHECK-LV-NEXT: vmovx.f16 s16, s15 -; CHECK-LV-NEXT: vmov.f32 s7, s14 -; CHECK-LV-NEXT: vmovx.f16 s17, s3 -; CHECK-LV-NEXT: vins.f16 s7, s16 -; CHECK-LV-NEXT: vmovx.f16 s16, s0 -; CHECK-LV-NEXT: vins.f16 s16, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s1 -; CHECK-LV-NEXT: vins.f16 s0, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s8 -; CHECK-LV-NEXT: vins.f16 s3, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s11 -; CHECK-LV-NEXT: vmovx.f16 s18, s10 -; CHECK-LV-NEXT: vins.f16 s10, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s14 -; CHECK-LV-NEXT: vmovx.f16 s19, s13 -; CHECK-LV-NEXT: vins.f16 s13, s2 -; CHECK-LV-NEXT: vmov.f32 s1, s3 -; CHECK-LV-NEXT: vins.f16 s18, s12 -; CHECK-LV-NEXT: vins.f16 s19, s15 -; CHECK-LV-NEXT: vmov.f32 s3, s13 -; CHECK-LV-NEXT: vins.f16 s17, s9 -; CHECK-LV-NEXT: vmov.f32 s2, s10 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 -; CHECK-LV-NEXT: vadd.f16 q2, q0, q1 -; CHECK-LV-NEXT: vldrw.u32 q0, [r0] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-LV-NEXT: vstrw.32 q2, [r1, #16] -; 
CHECK-LV-NEXT: vmovx.f16 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s8, s1 -; CHECK-LV-NEXT: vins.f16 s8, s10 -; CHECK-LV-NEXT: vmovx.f16 s10, s13 -; CHECK-LV-NEXT: vmov.f32 s9, s12 -; CHECK-LV-NEXT: vmovx.f16 s11, s4 -; CHECK-LV-NEXT: vins.f16 s9, s10 -; CHECK-LV-NEXT: vmov.f32 s10, s15 -; CHECK-LV-NEXT: vins.f16 s10, s11 -; CHECK-LV-NEXT: vmovx.f16 s16, s7 -; CHECK-LV-NEXT: vmov.f32 s11, s6 -; CHECK-LV-NEXT: vmovx.f16 s17, s3 -; CHECK-LV-NEXT: vins.f16 s11, s16 -; CHECK-LV-NEXT: vmovx.f16 s16, s0 -; CHECK-LV-NEXT: vins.f16 s16, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s1 -; CHECK-LV-NEXT: vins.f16 s0, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s12 -; CHECK-LV-NEXT: vins.f16 s3, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s15 -; CHECK-LV-NEXT: vmovx.f16 s18, s14 -; CHECK-LV-NEXT: vins.f16 s14, s2 -; CHECK-LV-NEXT: vmovx.f16 s2, s6 -; CHECK-LV-NEXT: vmovx.f16 s19, s5 -; CHECK-LV-NEXT: vins.f16 s5, s2 -; CHECK-LV-NEXT: vmov.f32 s1, s3 -; CHECK-LV-NEXT: vins.f16 s18, s4 -; CHECK-LV-NEXT: vins.f16 s19, s7 -; CHECK-LV-NEXT: vins.f16 s17, s13 -; CHECK-LV-NEXT: vmov.f32 s2, s14 -; CHECK-LV-NEXT: vmov.f32 s3, s5 -; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 -; CHECK-LV-NEXT: vadd.f16 q0, q0, q2 -; CHECK-LV-NEXT: vstrw.32 q0, [r1] -; CHECK-LV-NEXT: vpop {d8, d9} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v16f16: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9} -; CHECK-LIS-NEXT: vpush {d8, d9} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-LIS-NEXT: vmovx.f16 s6, s2 -; CHECK-LIS-NEXT: vmov.f32 s4, s1 -; CHECK-LIS-NEXT: vins.f16 s4, s6 -; CHECK-LIS-NEXT: vmovx.f16 s6, s9 -; CHECK-LIS-NEXT: vmov.f32 s5, s8 -; CHECK-LIS-NEXT: vmovx.f16 s7, s12 -; CHECK-LIS-NEXT: vins.f16 s5, s6 -; CHECK-LIS-NEXT: vmov.f32 s6, s11 -; CHECK-LIS-NEXT: vins.f16 s6, s7 -; CHECK-LIS-NEXT: vmovx.f16 s16, s15 -; CHECK-LIS-NEXT: vmov.f32 s7, s14 -; CHECK-LIS-NEXT: vmovx.f16 s17, s3 -; CHECK-LIS-NEXT: vins.f16 s7, s16 -; CHECK-LIS-NEXT: vmovx.f16 s16, s0 -; CHECK-LIS-NEXT: vins.f16 s16, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s1 -; CHECK-LIS-NEXT: vins.f16 s0, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s8 -; CHECK-LIS-NEXT: vins.f16 s3, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s11 -; CHECK-LIS-NEXT: vmovx.f16 s18, s10 -; CHECK-LIS-NEXT: vins.f16 s10, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s14 -; CHECK-LIS-NEXT: vmovx.f16 s19, s13 -; CHECK-LIS-NEXT: vins.f16 s13, s2 -; CHECK-LIS-NEXT: vmov.f32 s1, s3 -; CHECK-LIS-NEXT: vins.f16 s18, s12 -; CHECK-LIS-NEXT: vins.f16 s19, s15 -; CHECK-LIS-NEXT: vmov.f32 s3, s13 -; CHECK-LIS-NEXT: vins.f16 s17, s9 -; CHECK-LIS-NEXT: vmov.f32 s2, s10 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-LIS-NEXT: vadd.f16 q1, q0, q1 -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] -; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s5, s12 -; CHECK-LIS-NEXT: vmovx.f16 s6, s2 -; CHECK-LIS-NEXT: vmov.f32 s4, s1 -; CHECK-LIS-NEXT: vins.f16 s4, s6 -; CHECK-LIS-NEXT: vmovx.f16 s6, s13 -; CHECK-LIS-NEXT: vins.f16 s5, s6 -; CHECK-LIS-NEXT: vmov.f32 s6, s15 -; CHECK-LIS-NEXT: vmovx.f16 s7, s8 -; CHECK-LIS-NEXT: vmovx.f16 s16, s11 -; CHECK-LIS-NEXT: vins.f16 s6, s7 -; CHECK-LIS-NEXT: vmov.f32 s7, s10 -; CHECK-LIS-NEXT: vins.f16 s7, s16 -; CHECK-LIS-NEXT: vmovx.f16 s16, s0 -; CHECK-LIS-NEXT: vins.f16 s16, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s1 -; CHECK-LIS-NEXT: vins.f16 s0, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s12 -; CHECK-LIS-NEXT: vmovx.f16 s17, s3 -; 
CHECK-LIS-NEXT: vins.f16 s3, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s15 -; CHECK-LIS-NEXT: vmovx.f16 s18, s14 -; CHECK-LIS-NEXT: vins.f16 s14, s2 -; CHECK-LIS-NEXT: vmovx.f16 s2, s10 -; CHECK-LIS-NEXT: vmovx.f16 s19, s9 -; CHECK-LIS-NEXT: vins.f16 s9, s2 -; CHECK-LIS-NEXT: vmov.f32 s1, s3 -; CHECK-LIS-NEXT: vins.f16 s18, s8 -; CHECK-LIS-NEXT: vins.f16 s19, s11 -; CHECK-LIS-NEXT: vins.f16 s17, s13 -; CHECK-LIS-NEXT: vmov.f32 s2, s14 -; CHECK-LIS-NEXT: vmov.f32 s3, s9 -; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4 -; CHECK-LIS-NEXT: vadd.f16 q0, q0, q1 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmovx.f16 s7, s12 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vmovx.f16 s16, s15 +; CHECK-NEXT: vmov.f32 s7, s14 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vins.f16 s7, s16 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vins.f16 s16, s2 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vins.f16 s3, s2 +; CHECK-NEXT: vmovx.f16 s2, s11 +; CHECK-NEXT: vmovx.f16 s18, s10 +; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vmovx.f16 s2, s14 +; CHECK-NEXT: vmovx.f16 s19, s13 +; CHECK-NEXT: vins.f16 s13, s2 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vins.f16 s18, s12 +; CHECK-NEXT: vins.f16 s19, s15 +; CHECK-NEXT: vins.f16 s17, s9 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vadd.f16 q3, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmovx.f16 s14, s6 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmovx.f16 s14, s9 +; CHECK-NEXT: vmov.f32 s13, s8 +; CHECK-NEXT: vmovx.f16 s18, s10 +; CHECK-NEXT: vins.f16 s13, s14 +; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmovx.f16 s15, s0 +; CHECK-NEXT: vins.f16 s18, s0 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vmovx.f16 s16, s3 +; CHECK-NEXT: vmov.f32 s15, s2 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vins.f16 s15, s16 +; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s17, s7 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmovx.f16 s19, s1 +; CHECK-NEXT: vins.f16 s10, s0 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vins.f16 s16, s6 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vins.f16 s19, s3 +; CHECK-NEXT: vins.f16 s17, s9 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vadd.f16 q0, q1, q4 +; CHECK-NEXT: vadd.f16 q0, q0, q3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr entry: %l1 = load <48 x half>, ptr %src, align 4 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> @@ -1529,18 +1269,18 @@ define void @vld3_v4f64(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0, 
#80] -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vadd.f64 d1, d1, d2 +; CHECK-NEXT: vadd.f64 d0, d3, d0 ; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vadd.f64 d2, d4, d5 +; CHECK-NEXT: vadd.f64 d3, d4, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vadd.f64 d4, d7, d4 ; CHECK-NEXT: vadd.f64 d7, d8, d9 -; CHECK-NEXT: vadd.f64 d1, d1, d3 -; CHECK-NEXT: vadd.f64 d0, d2, d0 +; CHECK-NEXT: vadd.f64 d1, d0, d1 +; CHECK-NEXT: vadd.f64 d0, d3, d2 ; CHECK-NEXT: vadd.f64 d3, d4, d5 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vadd.f64 d2, d7, d6 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll index 1adc1269feab55..114553351ee10c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -106,16 +106,16 @@ define ptr @vld4_v2i64(ptr %src, ptr %dst) { ; CHECK-NEXT: vmov r4, r8, d9 ; CHECK-NEXT: vmov.f32 s12, s10 ; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov r2, r7, d1 +; CHECK-NEXT: vmov r3, r7, d1 ; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vmov r3, r6, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: vmov r2, r6, d1 +; CHECK-NEXT: adds.w r3, r3, lr ; CHECK-NEXT: adc.w r7, r7, r12 -; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adds r2, r2, r4 ; CHECK-NEXT: vmov r4, r5, d2 ; CHECK-NEXT: adc.w r6, r6, r8 -; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adds.w r12, r2, r3 ; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: adc.w lr, r6, r7 ; CHECK-NEXT: adds r3, r3, r4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index b49f19e55c895a..56a56c2ae32efb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -6,22 +6,22 @@ define void @vld4_v2i32(ptr %src, ptr %dst) { ; CHECK-LABEL: vld4_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.f32 s8, s3 -; CHECK-NEXT: vmov.f32 s12, s1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.f32 s8, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: add.w r12, r2, r0 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: add r0, r3 @@ -69,25 +69,25 @@ entry: define void @vld4_v8i32(ptr %src, ptr %dst) { ; CHECK-LABEL: vld4_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! 
-; CHECK-NEXT: vadd.i32 q4, q2, q3 -; CHECK-NEXT: vadd.i32 q5, q0, q1 -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r0] ; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r0] ; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r0] +; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vadd.i32 q1, q5, q6 +; CHECK-NEXT: vadd.i32 q2, q3, q4 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i32>, ptr %src, align 4 @@ -110,31 +110,31 @@ define void @vld4_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: add.w r3, r0, #192 +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: add.w r2, r0, #192 ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]! +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r3]! ; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r2] ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r2] ; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r3] -; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r2] +; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r2] ; CHECK-NEXT: vadd.i32 q1, q5, q6 ; CHECK-NEXT: vadd.i32 q2, q3, q4 ; CHECK-NEXT: vadd.i32 q0, q2, q1 -; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r3] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r3] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r3] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vadd.i32 q2, q2, q3 @@ -321,25 +321,25 @@ entry: define void @vld4_v16i16(ptr %src, ptr %dst) { ; CHECK-LABEL: vld4_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! 
-; CHECK-NEXT: vadd.i16 q4, q2, q3 -; CHECK-NEXT: vadd.i16 q5, q0, q1 -; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vadd.i16 q4, q5, q4 -; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: vadd.i16 q2, q2, q3 +; CHECK-NEXT: vld40.16 {q3, q4, q5, q6}, [r0] ; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vld41.16 {q3, q4, q5, q6}, [r0] ; CHECK-NEXT: vadd.i16 q0, q0, q2 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vld42.16 {q3, q4, q5, q6}, [r0] +; CHECK-NEXT: vld43.16 {q3, q4, q5, q6}, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vadd.i16 q1, q5, q6 +; CHECK-NEXT: vadd.i16 q2, q3, q4 +; CHECK-NEXT: vadd.i16 q1, q2, q1 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i16>, ptr %src, align 2 @@ -359,51 +359,51 @@ define void @vld4_v8i16_align1(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u8 q1, [r0, #32] +; CHECK-NEXT: vldrb.u8 q0, [r0, #32] ; CHECK-NEXT: vldrb.u8 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s18, s5 -; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s18, s0 +; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vins.f16 s18, s4 ; CHECK-NEXT: vmovx.f16 s19, s9 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vins.f16 s19, s0 -; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: vmovx.f16 s4, s11 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vins.f16 s19, s4 +; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vins.f16 s9, s11 -; CHECK-NEXT: vmov.f32 s22, s5 -; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vmovx.f16 s12, s3 +; CHECK-NEXT: vmov.f32 s22, s1 +; CHECK-NEXT: vmovx.f16 s16, s5 +; CHECK-NEXT: vmovx.f16 s12, s7 ; CHECK-NEXT: vins.f16 s16, s12 ; CHECK-NEXT: vldrb.u8 q3, [r0, #16] -; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vmov.f32 s23, s9 ; CHECK-NEXT: vmovx.f16 s17, s13 ; CHECK-NEXT: vmovx.f16 s20, s15 ; CHECK-NEXT: vins.f16 s13, s15 ; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vmov.f32 s20, s5 +; CHECK-NEXT: vmovx.f16 s1, s2 ; CHECK-NEXT: vmov.f32 s21, s13 ; CHECK-NEXT: vadd.i16 q4, q5, q4 -; CHECK-NEXT: vmovx.f16 s22, s4 +; CHECK-NEXT: vmovx.f16 s22, s0 ; CHECK-NEXT: vins.f16 s22, s1 ; CHECK-NEXT: vmovx.f16 s23, s8 ; CHECK-NEXT: vmovx.f16 s1, s10 -; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vmovx.f16 s20, s4 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s1, s6 ; CHECK-NEXT: vins.f16 s20, s1 ; CHECK-NEXT: vmovx.f16 s21, s12 ; CHECK-NEXT: vmovx.f16 s1, s14 ; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s21, s1 ; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmov.f32 s3, s8 -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vadd.i16 q0, q0, q5 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vins.f16 s21, s1 +; CHECK-NEXT: vmov.f32 s7, s8 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vadd.i16 q0, q1, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -585,47 +585,50 @@ entry: define void @vld4_v2i64(ptr %src, ptr %dst) { ; 
CHECK-LABEL: vld4_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov r0, r8, d9 -; CHECK-NEXT: vmov.f32 s12, s10 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vmov r0, r7, d7 +; CHECK-NEXT: vmov.f32 s16, s10 +; CHECK-NEXT: vmov.f32 s17, s11 +; CHECK-NEXT: vmov r3, r2, d1 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s13 ; CHECK-NEXT: vmov r5, r6, d1 -; CHECK-NEXT: adds.w r2, r2, lr -; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 ; CHECK-NEXT: vmov r4, r12, d2 ; CHECK-NEXT: adds r0, r0, r5 -; CHECK-NEXT: vmov r5, r7, d0 -; CHECK-NEXT: adc.w r6, r6, r8 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w lr, r6, r3 -; CHECK-NEXT: vmov r3, r6, d6 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, r2, d4 -; CHECK-NEXT: adc.w r7, r7, r12 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r2, r6 -; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r7, r6 +; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: adds.w lr, r0, r3 ; CHECK-NEXT: adcs r2, r7 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r2, lr +; CHECK-NEXT: vmov r3, r7, d8 +; CHECK-NEXT: adds r6, r6, r4 +; CHECK-NEXT: vmov r4, r0, d4 +; CHECK-NEXT: adc.w r5, r5, r12 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r0, r7 +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: adcs r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %l1 = load <8 x i64>, ptr %src, align 8 %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> @@ -651,63 +654,63 @@ define void @vld4_v4i64(ptr %src, ptr %dst) { ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0, #80] +; CHECK-NEXT: vldrw.u32 q7, [r0, #80] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vmov.f32 s2, s20 -; CHECK-NEXT: vldrw.u32 q7, [r0, #112] +; CHECK-NEXT: vldrw.u32 q6, [r0, #112] ; CHECK-NEXT: vmov.f32 s3, s21 ; CHECK-NEXT: vmov r3, r2, d11 ; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vmov.f32 s0, s26 -; CHECK-NEXT: vmov.f32 s1, s27 +; CHECK-NEXT: vmov.f32 s0, s30 +; CHECK-NEXT: vmov.f32 s1, s31 ; CHECK-NEXT: vmov lr, r12, d9 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s6, s24 +; CHECK-NEXT: vmov.f32 s7, s25 ; CHECK-NEXT: vmov r4, r5, d1 ; CHECK-NEXT: 
vmov.f32 s2, s16 ; CHECK-NEXT: vmov.f32 s3, s17 ; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov.f32 s6, s28 -; CHECK-NEXT: vmov.f32 s7, s29 -; CHECK-NEXT: vmov.f32 s10, s20 -; CHECK-NEXT: vmov.f32 s11, s21 +; CHECK-NEXT: vmov.f32 s12, s10 +; CHECK-NEXT: vmov.f32 s13, s11 ; CHECK-NEXT: vmov r0, r6, d1 -; CHECK-NEXT: adds r7, r4, r3 -; CHECK-NEXT: vmov r4, r8, d0 -; CHECK-NEXT: adcs r5, r2 -; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vmov.f32 s3, s21 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc.w r4, r5, r2 +; CHECK-NEXT: vmov r5, r8, d0 +; CHECK-NEXT: vmov r2, r7, d14 ; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vmov.f32 s1, s19 ; CHECK-NEXT: adds.w r0, r0, lr ; CHECK-NEXT: adc.w r6, r6, r12 -; CHECK-NEXT: adds.w lr, r0, r7 -; CHECK-NEXT: adc.w r12, r6, r5 -; CHECK-NEXT: vmov r6, r5, d0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, r0, d8 -; CHECK-NEXT: adc.w r3, r3, r8 -; CHECK-NEXT: adds r6, r6, r4 -; CHECK-NEXT: adcs r0, r5 +; CHECK-NEXT: adds.w lr, r0, r3 +; CHECK-NEXT: adc.w r12, r6, r4 +; CHECK-NEXT: vmov r6, r4, d0 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov r5, r0, d8 +; CHECK-NEXT: adc.w r7, r7, r8 +; CHECK-NEXT: adds r6, r6, r5 +; CHECK-NEXT: adcs r0, r4 ; CHECK-NEXT: adds.w r9, r6, r2 -; CHECK-NEXT: adc.w r8, r0, r3 -; CHECK-NEXT: vmov r5, r4, d15 -; CHECK-NEXT: vmov r3, r6, d3 -; CHECK-NEXT: vmov r7, r0, d5 -; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adc.w r8, r0, r7 +; CHECK-NEXT: vmov r5, r4, d13 +; CHECK-NEXT: vmov r7, r6, d3 +; CHECK-NEXT: vmov r3, r0, d1 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: adds r7, r7, r5 ; CHECK-NEXT: adcs r6, r4 ; CHECK-NEXT: vmov r5, r4, d11 -; CHECK-NEXT: adds r5, r5, r7 -; CHECK-NEXT: adcs r0, r4 ; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r0, r4 +; CHECK-NEXT: adds r3, r3, r7 ; CHECK-NEXT: adc.w r10, r0, r6 -; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmov r4, r5, d2 ; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: vmov r2, r0, d2 ; CHECK-NEXT: vmov q1[2], q1[0], r9, r3 +; CHECK-NEXT: vmov r2, r0, d4 ; CHECK-NEXT: vmov q1[3], q1[1], r8, r10 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: adds r4, r4, r6 @@ -796,25 +799,25 @@ entry: define void @vld4_v8f32(ptr %src, ptr %dst) { ; CHECK-LABEL: vld4_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! 
-; CHECK-NEXT: vadd.f32 q4, q2, q3 -; CHECK-NEXT: vadd.f32 q5, q0, q1 -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r0] ; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r0] ; CHECK-NEXT: vadd.f32 q0, q0, q2 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r0] +; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vadd.f32 q1, q5, q6 +; CHECK-NEXT: vadd.f32 q2, q3, q4 +; CHECK-NEXT: vadd.f32 q1, q2, q1 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x float>, ptr %src, align 4 @@ -837,31 +840,31 @@ define void @vld4_v16f32(ptr %src, ptr %dst) { ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: add.w r3, r0, #192 +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: add.w r2, r0, #192 ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]! +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r3]! ; CHECK-NEXT: vadd.f32 q2, q2, q3 -; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r2] ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r2] ; CHECK-NEXT: vadd.f32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r3] -; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r2] +; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r2] ; CHECK-NEXT: vadd.f32 q1, q5, q6 ; CHECK-NEXT: vadd.f32 q2, q3, q4 ; CHECK-NEXT: vadd.f32 q0, q2, q1 -; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r3] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r3] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r3] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vadd.f32 q2, q2, q3 @@ -1081,51 +1084,51 @@ define void @vld4_v8f16_align1(ptr %src, ptr %dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] -; CHECK-NEXT: vldrb.u8 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: vldrb.u8 q3, [r0] +; CHECK-NEXT: vldrb.u8 q4, [r0, #16] +; CHECK-NEXT: vmovx.f16 s10, s1 ; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vins.f16 s18, s4 -; CHECK-NEXT: vmovx.f16 s19, s9 -; CHECK-NEXT: vmovx.f16 s4, s11 +; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vldrb.u8 q1, [r0, #48] ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s19, s4 -; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vmovx.f16 s22, s0 +; CHECK-NEXT: vmovx.f16 s11, s5 +; 
CHECK-NEXT: vmovx.f16 s8, s7 +; CHECK-NEXT: vins.f16 s11, s8 +; CHECK-NEXT: vmovx.f16 s8, s13 +; CHECK-NEXT: vmovx.f16 s9, s15 ; CHECK-NEXT: vmovx.f16 s3, s2 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vins.f16 s16, s12 -; CHECK-NEXT: vldrb.u8 q3, [r0, #16] +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vmovx.f16 s9, s17 +; CHECK-NEXT: vmovx.f16 s20, s19 ; CHECK-NEXT: vins.f16 s22, s3 -; CHECK-NEXT: vmovx.f16 s23, s8 -; CHECK-NEXT: vmovx.f16 s17, s13 -; CHECK-NEXT: vmovx.f16 s20, s15 -; CHECK-NEXT: vmovx.f16 s3, s10 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vins.f16 s23, s3 -; CHECK-NEXT: vmovx.f16 s20, s4 +; CHECK-NEXT: vmovx.f16 s23, s4 ; CHECK-NEXT: vmovx.f16 s3, s6 -; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vins.f16 s9, s20 +; CHECK-NEXT: vins.f16 s23, s3 +; CHECK-NEXT: vmovx.f16 s20, s12 +; CHECK-NEXT: vmovx.f16 s3, s14 ; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vins.f16 s17, s19 ; CHECK-NEXT: vins.f16 s20, s3 -; CHECK-NEXT: vmovx.f16 s21, s12 -; CHECK-NEXT: vmovx.f16 s3, s14 -; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vmovx.f16 s21, s16 +; CHECK-NEXT: vmovx.f16 s3, s18 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s16, s18 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmov.f32 s24, s5 +; CHECK-NEXT: vmov.f32 s24, s13 ; CHECK-NEXT: vins.f16 s21, s3 ; CHECK-NEXT: vmov.f32 s26, s1 -; CHECK-NEXT: vmov.f32 s27, s9 -; CHECK-NEXT: vmov.f32 s25, s13 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vadd.f16 q4, q6, q4 -; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vadd.f16 q0, q1, q5 -; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vmov.f32 s27, s5 +; CHECK-NEXT: vmov.f32 s25, s17 +; CHECK-NEXT: vmov.f32 s14, s0 +; CHECK-NEXT: vadd.f16 q2, q6, q2 +; CHECK-NEXT: vmov.f32 s15, s4 +; CHECK-NEXT: vmov.f32 s13, s16 +; CHECK-NEXT: vadd.f16 q0, q3, q5 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll index 219541cffb940f..881f3e184775a1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -6,14 +6,14 @@ define void @vldst4(ptr nocapture readonly %pIn, ptr nocapture %pOut, i32 %numRo ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: mul r12, r3, r2 ; CHECK-NEXT: lsrs.w r2, r12, #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: ldr r2, [sp, #56] +; CHECK-NEXT: ldr r2, [sp, #72] ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -21,104 +21,104 @@ define void @vldst4(ptr nocapture readonly %pIn, ptr nocapture %pOut, i32 %numRo ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0, #32] -; CHECK-NEXT: vldrh.u16 q4, [r0, #48] -; CHECK-NEXT: vldrh.u16 q3, [r0], #64 +; CHECK-NEXT: vldrh.u16 q3, [r0, #48] +; CHECK-NEXT: vldrh.u16 q4, [r0], #64 ; CHECK-NEXT: vmovx.f16 s26, s4 ; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s6, s6 -; CHECK-NEXT: vldrh.u16 q5, [r0, 
#-48] -; CHECK-NEXT: vmovx.f16 s27, s16 -; CHECK-NEXT: vins.f16 s26, s6 -; CHECK-NEXT: vmovx.f16 s6, s18 ; CHECK-NEXT: vmovx.f16 s8, s7 ; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vmovx.f16 s24, s12 +; CHECK-NEXT: vldrh.u16 q5, [r0, #-48] ; CHECK-NEXT: vins.f16 s10, s8 -; CHECK-NEXT: vins.f16 s27, s6 +; CHECK-NEXT: vmovx.f16 s27, s12 +; CHECK-NEXT: vins.f16 s26, s6 ; CHECK-NEXT: vmovx.f16 s6, s14 -; CHECK-NEXT: vmovx.f16 s8, s19 -; CHECK-NEXT: vmovx.f16 s11, s17 -; CHECK-NEXT: vmov.f32 s0, s13 +; CHECK-NEXT: vmovx.f16 s8, s15 +; CHECK-NEXT: vmovx.f16 s11, s13 ; CHECK-NEXT: vins.f16 s11, s8 -; CHECK-NEXT: vmovx.f16 s25, s20 -; CHECK-NEXT: vins.f16 s24, s6 -; CHECK-NEXT: vmovx.f16 s6, s22 -; CHECK-NEXT: vmovx.f16 s1, s15 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vins.f16 s20, s22 -; CHECK-NEXT: vins.f16 s16, s18 +; CHECK-NEXT: vmovx.f16 s24, s16 +; CHECK-NEXT: vins.f16 s27, s6 +; CHECK-NEXT: vmovx.f16 s6, s18 +; CHECK-NEXT: vmovx.f16 s1, s19 +; CHECK-NEXT: vmovx.f16 s8, s17 ; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vins.f16 s25, s6 -; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vins.f16 s0, s15 ; CHECK-NEXT: vmovx.f16 s9, s21 +; CHECK-NEXT: vmov.f32 s3, s13 ; CHECK-NEXT: vins.f16 s8, s1 +; CHECK-NEXT: vmov.f32 s0, s17 ; CHECK-NEXT: vmovx.f16 s1, s23 -; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmovx.f16 s25, s20 +; CHECK-NEXT: vins.f16 s20, s22 +; CHECK-NEXT: vins.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 s6, s22 ; CHECK-NEXT: vins.f16 s21, s23 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmov.f32 s15, s16 +; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s9, s1 -; CHECK-NEXT: vmov.f32 s13, s20 -; CHECK-NEXT: vmul.f16 q6, q6, r2 -; CHECK-NEXT: vmul.f16 q3, q3, r2 +; CHECK-NEXT: vins.f16 s25, s6 +; CHECK-NEXT: vins.f16 s0, s19 +; CHECK-NEXT: vins.f16 s16, s18 ; CHECK-NEXT: vins.f16 s2, s7 -; CHECK-NEXT: vins.f16 s3, s19 +; CHECK-NEXT: vins.f16 s3, s15 ; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmul.f16 q6, q6, r2 +; CHECK-NEXT: vmov.f32 s18, s4 ; CHECK-NEXT: vmul.f16 q0, q0, r2 -; CHECK-NEXT: vmovx.f16 s4, s12 -; CHECK-NEXT: vmovx.f16 s6, s24 +; CHECK-NEXT: vmov.f32 s19, s12 ; CHECK-NEXT: vmul.f16 q2, q2, r2 +; CHECK-NEXT: vmov.f32 s17, s20 ; CHECK-NEXT: vmovx.f16 s7, s0 +; CHECK-NEXT: vmul.f16 q3, q4, r2 ; CHECK-NEXT: vins.f16 s0, s8 ; CHECK-NEXT: vmovx.f16 s8, s8 +; CHECK-NEXT: vmovx.f16 s4, s12 +; CHECK-NEXT: vmovx.f16 s6, s24 +; CHECK-NEXT: vmovx.f16 s19, s2 +; CHECK-NEXT: vins.f16 s2, s10 +; CHECK-NEXT: vmovx.f16 s10, s10 +; CHECK-NEXT: vins.f16 s7, s8 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s5, s1 +; CHECK-NEXT: vmovx.f16 s8, s1 ; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vins.f16 s7, s8 -; CHECK-NEXT: vins.f16 s5, s6 -; CHECK-NEXT: vmovx.f16 s6, s13 -; CHECK-NEXT: vmovx.f16 s8, s25 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmovx.f16 s19, s2 -; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vins.f16 s19, s10 ; CHECK-NEXT: vmovx.f16 s18, s14 -; CHECK-NEXT: vins.f16 s19, s8 -; CHECK-NEXT: vmovx.f16 s8, s26 -; CHECK-NEXT: vins.f16 s18, s8 -; CHECK-NEXT: vmovx.f16 s23, s3 -; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmovx.f16 s10, s26 +; CHECK-NEXT: vins.f16 s8, s6 +; CHECK-NEXT: vmovx.f16 s6, s13 ; CHECK-NEXT: vins.f16 s14, s26 -; CHECK-NEXT: vins.f16 s23, s8 +; CHECK-NEXT: vins.f16 s18, s10 +; CHECK-NEXT: vmovx.f16 s23, s3 +; CHECK-NEXT: vmovx.f16 s10, s11 ; CHECK-NEXT: vmovx.f16 s22, s15 ; CHECK-NEXT: vins.f16 s15, s27 -; CHECK-NEXT: vmovx.f16 s8, s27 ; CHECK-NEXT: vins.f16 s12, s24 ; CHECK-NEXT: vins.f16 s13, s25 +; CHECK-NEXT: vmovx.f16 s5, 
s25 ; CHECK-NEXT: vins.f16 s3, s11 ; CHECK-NEXT: vins.f16 s1, s9 -; CHECK-NEXT: vins.f16 s2, s10 -; CHECK-NEXT: vins.f16 s22, s8 -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vmov.f32 s17, s0 -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov q6, q0 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vins.f16 s23, s10 +; CHECK-NEXT: vmovx.f16 s10, s27 +; CHECK-NEXT: vmov q6, q3 +; CHECK-NEXT: vins.f16 s6, s5 +; CHECK-NEXT: vmov.f32 s26, s4 +; CHECK-NEXT: vins.f16 s22, s10 +; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmov.f32 s25, s0 ; CHECK-NEXT: vmov.f32 s17, s2 ; CHECK-NEXT: vmov.f32 s16, s14 ; CHECK-NEXT: vmov.f32 s21, s3 ; CHECK-NEXT: vstrh.16 q4, [r1, #32] ; CHECK-NEXT: vmov.f32 s20, s15 -; CHECK-NEXT: vmov.f32 s7, s5 ; CHECK-NEXT: vstrh.16 q5, [r1, #48] -; CHECK-NEXT: vstrh.16 q2, [r1], #64 +; CHECK-NEXT: vstrh.16 q6, [r1], #64 +; CHECK-NEXT: vmov.f32 s7, s8 ; CHECK-NEXT: vmov.f32 s4, s13 -; CHECK-NEXT: vmov.f32 s5, s25 +; CHECK-NEXT: vmov.f32 s5, s29 ; CHECK-NEXT: vstrh.16 q1, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll index 898380760bd4d2..91bf9ac30406aa 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll @@ -495,20 +495,20 @@ define arm_aapcs_vfpcc i64 @uminv2i64(<2 x i64> %vec, i64 %min) { ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: subs.w r4, r2, r12 -; CHECK-NEXT: sbcs.w r4, r3, lr +; CHECK-NEXT: vmov r12, r3, d1 +; CHECK-NEXT: vmov lr, r2, d0 +; CHECK-NEXT: subs.w r4, lr, r12 +; CHECK-NEXT: sbcs.w r4, r2, r3 ; CHECK-NEXT: cset r4, lo ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r2, r2, r12, ne -; CHECK-NEXT: csel r3, r3, lr, ne -; CHECK-NEXT: subs r4, r2, r0 -; CHECK-NEXT: sbcs.w r4, r3, r1 +; CHECK-NEXT: csel r2, r2, r3, ne +; CHECK-NEXT: csel r3, lr, r12, ne +; CHECK-NEXT: subs r4, r3, r0 +; CHECK-NEXT: sbcs.w r4, r2, r1 ; CHECK-NEXT: cset r4, lo ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r0, r2, r0, ne -; CHECK-NEXT: csel r1, r3, r1, ne +; CHECK-NEXT: csel r0, r3, r0, ne +; CHECK-NEXT: csel r1, r2, r1, ne ; CHECK-NEXT: pop {r4, pc} %x = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %vec) %cmp = icmp ult i64 %x, %min @@ -521,20 +521,20 @@ define arm_aapcs_vfpcc i64 @sminv2i64(<2 x i64> %vec, i64 %min) { ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: subs.w r4, r2, r12 -; CHECK-NEXT: sbcs.w r4, r3, lr +; CHECK-NEXT: vmov r12, r3, d1 +; CHECK-NEXT: vmov lr, r2, d0 +; CHECK-NEXT: subs.w r4, lr, r12 +; CHECK-NEXT: sbcs.w r4, r2, r3 ; CHECK-NEXT: cset r4, lt ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r2, r2, r12, ne -; CHECK-NEXT: csel r3, r3, lr, ne -; CHECK-NEXT: subs r4, r2, r0 -; CHECK-NEXT: sbcs.w r4, r3, r1 +; CHECK-NEXT: csel r2, r2, r3, ne +; CHECK-NEXT: csel r3, lr, r12, ne +; CHECK-NEXT: subs r4, r3, r0 +; CHECK-NEXT: sbcs.w r4, r2, r1 ; CHECK-NEXT: cset r4, lt ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r0, r2, r0, ne -; CHECK-NEXT: csel r1, r3, r1, ne +; 
CHECK-NEXT: csel r0, r3, r0, ne +; CHECK-NEXT: csel r1, r2, r1, ne ; CHECK-NEXT: pop {r4, pc} %x = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %vec) %cmp = icmp slt i64 %x, %min @@ -547,20 +547,20 @@ define arm_aapcs_vfpcc i64 @umaxv2i64(<2 x i64> %vec, i64 %max) { ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r12, lr, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: subs.w r4, r2, r12 -; CHECK-NEXT: sbcs.w r4, r3, lr +; CHECK-NEXT: vmov r12, r3, d0 +; CHECK-NEXT: vmov lr, r2, d1 +; CHECK-NEXT: subs.w r4, lr, r12 +; CHECK-NEXT: sbcs.w r4, r2, r3 ; CHECK-NEXT: cset r4, lo ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r2, r12, r2, ne -; CHECK-NEXT: csel r3, lr, r3, ne -; CHECK-NEXT: subs r4, r0, r2 -; CHECK-NEXT: sbcs.w r4, r1, r3 +; CHECK-NEXT: csel r2, r3, r2, ne +; CHECK-NEXT: csel r3, r12, lr, ne +; CHECK-NEXT: subs r4, r0, r3 +; CHECK-NEXT: sbcs.w r4, r1, r2 ; CHECK-NEXT: cset r4, lo ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r0, r2, r0, ne -; CHECK-NEXT: csel r1, r3, r1, ne +; CHECK-NEXT: csel r0, r3, r0, ne +; CHECK-NEXT: csel r1, r2, r1, ne ; CHECK-NEXT: pop {r4, pc} %x = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %vec) %cmp = icmp ugt i64 %x, %max @@ -573,20 +573,20 @@ define arm_aapcs_vfpcc i64 @smaxv2i64(<2 x i64> %vec, i64 %max) { ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r12, lr, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: subs.w r4, r2, r12 -; CHECK-NEXT: sbcs.w r4, r3, lr +; CHECK-NEXT: vmov r12, r3, d0 +; CHECK-NEXT: vmov lr, r2, d1 +; CHECK-NEXT: subs.w r4, lr, r12 +; CHECK-NEXT: sbcs.w r4, r2, r3 ; CHECK-NEXT: cset r4, lt ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r2, r12, r2, ne -; CHECK-NEXT: csel r3, lr, r3, ne -; CHECK-NEXT: subs r4, r0, r2 -; CHECK-NEXT: sbcs.w r4, r1, r3 +; CHECK-NEXT: csel r2, r3, r2, ne +; CHECK-NEXT: csel r3, r12, lr, ne +; CHECK-NEXT: subs r4, r0, r3 +; CHECK-NEXT: sbcs.w r4, r1, r2 ; CHECK-NEXT: cset r4, lt ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r0, r2, r0, ne -; CHECK-NEXT: csel r1, r3, r1, ne +; CHECK-NEXT: csel r0, r3, r0, ne +; CHECK-NEXT: csel r1, r2, r1, ne ; CHECK-NEXT: pop {r4, pc} %x = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %vec) %cmp = icmp sgt i64 %x, %max diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll b/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll index ac4abdbf45643f..3effe87b879d78 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll @@ -156,19 +156,19 @@ define void @sunken_vmovl(i8* noalias %pTarget, i16 signext %iTargetStride, i8* ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: ldrsh.w r1, [sp, #8] ; CHECK-NEXT: vmov.i16 q0, #0x100 -; CHECK-NEXT: vldrb.u16 q1, [r2], #8 -; CHECK-NEXT: vldrb.u16 q2, [r0], #8 +; CHECK-NEXT: vldrb.u16 q2, [r2], #8 +; CHECK-NEXT: vldrb.u16 q1, [r0], #8 ; CHECK-NEXT: ldr r3, [sp, #12] ; CHECK-NEXT: dlstp.16 lr, r1 ; CHECK-NEXT: .LBB3_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vsub.i16 q3, q0, q1 ; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vmul.i16 q3, q2, q3 -; CHECK-NEXT: vldrb.u16 q2, [r0], #8 -; CHECK-NEXT: vmla.i16 q3, q1, r3 -; CHECK-NEXT: vldrb.u16 q1, [r2], #8 +; CHECK-NEXT: vsub.i16 q3, q0, q2 +; CHECK-NEXT: vmovlb.u8 q1, q1 +; CHECK-NEXT: vmul.i16 q3, q1, q3 +; CHECK-NEXT: vldrb.u16 q1, [r0], #8 +; CHECK-NEXT: vmla.i16 q3, q2, r3 +; CHECK-NEXT: vldrb.u16 q2, [r2], #8 ; CHECK-NEXT: vshr.u16 q3, q3, #8 ; CHECK-NEXT: vstrb.16 q3, [r0, #-16] ; 
CHECK-NEXT: letp lr, .LBB3_1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll index b005cb92dc5169..e32d370b2f716c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll @@ -519,23 +519,23 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-MVE-LABEL: vmovn16_b2: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[1] -; CHECK-MVE-NEXT: vmov.16 q0[0], r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q2[0] -; CHECK-MVE-NEXT: vmov.16 q0[1], r0 +; CHECK-MVE-NEXT: vmov.16 q2[0], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[0] +; CHECK-MVE-NEXT: vmov.16 q2[1], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[3] -; CHECK-MVE-NEXT: vmov.16 q0[2], r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q2[2] -; CHECK-MVE-NEXT: vmov.16 q0[3], r0 +; CHECK-MVE-NEXT: vmov.16 q2[2], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[2] +; CHECK-MVE-NEXT: vmov.16 q2[3], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[5] -; CHECK-MVE-NEXT: vmov.16 q0[4], r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q2[4] -; CHECK-MVE-NEXT: vmov.16 q0[5], r0 +; CHECK-MVE-NEXT: vmov.16 q2[4], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[4] +; CHECK-MVE-NEXT: vmov.16 q2[5], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[7] -; CHECK-MVE-NEXT: vmov.16 q0[6], r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q2[6] -; CHECK-MVE-NEXT: vmov.16 q0[7], r0 +; CHECK-MVE-NEXT: vmov.16 q2[6], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[6] +; CHECK-MVE-NEXT: vmov.16 q2[7], r0 +; CHECK-MVE-NEXT: vmov q0, q2 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vmovn16_b2: @@ -574,22 +574,22 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-MVE-LABEL: vmovn16_b3: ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vmov.u16 r0, q0[1] -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vmov.16 q0[0], r0 +; CHECK-MVE-NEXT: vmov.16 q2[0], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[0] -; CHECK-MVE-NEXT: vmov.16 q0[1], r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q2[3] -; CHECK-MVE-NEXT: vmov.16 q0[2], r0 +; CHECK-MVE-NEXT: vmov.16 q2[1], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[3] +; CHECK-MVE-NEXT: vmov.16 q2[2], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[2] -; CHECK-MVE-NEXT: vmov.16 q0[3], r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q2[5] -; CHECK-MVE-NEXT: vmov.16 q0[4], r0 +; CHECK-MVE-NEXT: vmov.16 q2[3], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[5] +; CHECK-MVE-NEXT: vmov.16 q2[4], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[4] -; CHECK-MVE-NEXT: vmov.16 q0[5], r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q2[7] -; CHECK-MVE-NEXT: vmov.16 q0[6], r0 +; CHECK-MVE-NEXT: vmov.16 q2[5], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[7] +; CHECK-MVE-NEXT: vmov.16 q2[6], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[6] -; CHECK-MVE-NEXT: vmov.16 q0[7], r0 +; CHECK-MVE-NEXT: vmov.16 q2[7], r0 +; CHECK-MVE-NEXT: vmov q0, q2 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vmovn16_b3: @@ -718,39 +718,39 @@ entry: define arm_aapcs_vfpcc <16 x i8> @vmovn8_t2(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: vmovn8_t2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov.8 q2[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.8 q2[1], r0 ; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov.8 q2[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.8 q2[3], r0 ; CHECK-NEXT: vmov.u8 r0, q1[5] 
-; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov.8 q2[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.8 q2[5], r0 ; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov.8 q2[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.8 q2[7], r0 ; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q2[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.8 q2[9], r0 ; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.8 q0[11], r0 +; CHECK-NEXT: vmov.8 q2[10], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.8 q2[11], r0 ; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] -; CHECK-NEXT: vmov.8 q0[13], r0 +; CHECK-NEXT: vmov.8 q2[12], r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.8 q2[13], r0 ; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.8 q2[14], r0 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.8 q2[15], r0 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn8_t2: diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll index 75f7350fcd5b15..a1960a1bd0f0a3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll @@ -169,32 +169,32 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) { ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: subs.w r0, r0, r12 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: bfi r3, r1, #0, #8 -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: subs.w r1, r1, r12 -; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r3, r1, #8, #8 -; CHECK-NEXT: adr r1, .LCPI12_0 -; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r3, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: subs.w r0, r0, r12 +; CHECK-NEXT: sbcs r0, r2, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: adr r0, .LCPI12_0 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: mov.w r3, #-1 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r3, r2 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r0, r1, #0, #8 -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r3, r2 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r0, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, r2, d0 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r3, r2 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r3, r2 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: adr r0, .LCPI12_1 ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 @@ -225,32 +225,32 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) { ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: rsbs.w 
r0, r0, #-2147483648 ; CHECK-NEXT: sbcs.w r0, r12, r1 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: bfi r3, r1, #0, #8 -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r12, r2 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r3, r1, #8, #8 -; CHECK-NEXT: adr r1, .LCPI13_0 -; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r3, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r12, r2 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: adr r0, .LCPI13_0 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: mvn r3, #-2147483648 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: subs r1, r1, r3 -; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r0, r1, #0, #8 -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: subs r1, r1, r3 -; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r0, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, r2, d0 +; CHECK-NEXT: subs r0, r0, r3 +; CHECK-NEXT: sbcs r0, r2, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: subs r0, r0, r3 +; CHECK-NEXT: sbcs r0, r2, #0 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: adr r0, .LCPI13_1 ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll index f78d36222c3121..178dc9bed32249 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll @@ -245,25 +245,25 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_sminmax(<2 x i64> %so) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov r2, r1, d0 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: asrl r2, r1, #3 +; CHECK-NEXT: asrl r0, r1, #3 ; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: rsbs.w r0, r2, #-2147483648 +; CHECK-NEXT: rsbs.w r3, r0, #-2147483648 ; CHECK-NEXT: asrl r4, r5, #3 -; CHECK-NEXT: sbcs.w r0, r12, r1 -; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: sbcs.w r3, r12, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: csetm lr, lt -; CHECK-NEXT: rsbs.w r0, r4, #-2147483648 -; CHECK-NEXT: sbcs.w r0, r12, r5 -; CHECK-NEXT: bfi r3, lr, #0, #8 -; CHECK-NEXT: csetm r0, lt -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 -; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: rsbs.w r2, r4, #-2147483648 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: sbcs.w r2, r12, r5 ; CHECK-NEXT: adr r0, .LCPI13_0 +; CHECK-NEXT: bfi r3, lr, #0, #8 +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: movs r6, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll index 57d08a7f3c4b25..bf1ff8670590b5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -6,10 +6,10 @@ define void @vst2_v2i32(ptr %src, ptr %dst) { ; CHECK-LABEL: vst2_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldm.w r0, {r2, r3, r12} -; CHECK-NEXT: 
ldr r0, [r0, #12] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r12, r0 +; CHECK-NEXT: ldrd r12, r3, [r0] +; CHECK-NEXT: ldrd r2, r0, [r0, #8] +; CHECK-NEXT: vmov q0[2], q0[0], r12, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -123,12 +123,12 @@ entry: define void @vst2_v2i16(ptr %src, ptr %dst) { ; CHECK-LABEL: vst2_v2i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: ldrh r3, [r0] -; CHECK-NEXT: ldrh.w r12, [r0, #6] +; CHECK-NEXT: ldrh.w r12, [r0, #2] +; CHECK-NEXT: ldrh r2, [r0] +; CHECK-NEXT: ldrh r3, [r0, #6] ; CHECK-NEXT: ldrh r0, [r0, #4] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r3 ; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -199,28 +199,28 @@ define void @vst2_v8i16_align1(ptr %src, ptr %dst) { ; CHECK-LABEL: vst2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmovx.f16 s1, s10 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vins.f16 s10, s6 -; CHECK-NEXT: vmovx.f16 s3, s11 -; CHECK-NEXT: vmovx.f16 s6, s7 -; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vins.f16 s3, s6 -; CHECK-NEXT: vmovx.f16 s6, s8 -; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmovx.f16 s5, s10 +; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmovx.f16 s7, s11 +; CHECK-NEXT: vins.f16 s5, s2 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vins.f16 s11, s3 +; CHECK-NEXT: vins.f16 s7, s2 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vins.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vins.f16 s2, s0 ; CHECK-NEXT: vmovx.f16 s15, s9 -; CHECK-NEXT: vins.f16 s9, s5 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vins.f16 s1, s0 -; CHECK-NEXT: vmov.f32 s0, s10 -; CHECK-NEXT: vins.f16 s15, s4 -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmov.f32 s13, s6 -; CHECK-NEXT: vstrb.8 q0, [r1, #16] +; CHECK-NEXT: vins.f16 s9, s1 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vins.f16 s15, s0 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov.f32 s13, s2 +; CHECK-NEXT: vstrb.8 q1, [r1, #16] ; CHECK-NEXT: vmov.f32 s14, s9 ; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr @@ -238,12 +238,12 @@ entry: define void @vst2_v2i8(ptr %src, ptr %dst) { ; CHECK-LABEL: vst2_v2i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: ldrb.w r12, [r0] ; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: ldrb r2, [r0, #2] ; CHECK-NEXT: ldrb r0, [r0, #3] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r12, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 ; CHECK-NEXT: vstrb.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -334,19 +334,19 @@ define void @vst2_v4i64(ptr %src, ptr %dst) { ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vmov.f64 d8, d2 ; CHECK-NEXT: vmov.f64 d9, d0 -; CHECK-NEXT: vmov.f64 d0, d5 +; CHECK-NEXT: vmov.f64 d0, d3 ; 
CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: vmov.f64 d3, d4 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f64 d4, d6 -; CHECK-NEXT: vmov.f64 d2, d7 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vmov.f64 d2, d6 +; CHECK-NEXT: vmov.f64 d4, d7 +; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -503,24 +503,24 @@ entry: define void @vst2_v4f16(ptr %src, ptr %dst) { ; CHECK-LABEL: vst2_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r2, r12, [r0] -; CHECK-NEXT: ldrd r3, r0, [r0, #8] +; CHECK-NEXT: ldrd r12, r3, [r0] +; CHECK-NEXT: ldrd r2, r0, [r0, #8] +; CHECK-NEXT: vmov.32 q1[0], r12 ; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.32 q0[1], r12 -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmovx.f16 s2, s0 -; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vmovx.f16 s4, s4 -; CHECK-NEXT: vins.f16 s2, s4 -; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vins.f16 s1, s5 -; CHECK-NEXT: vmovx.f16 s6, s5 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vins.f16 s5, s1 +; CHECK-NEXT: vins.f16 s0, s6 +; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s11, s4 +; CHECK-NEXT: vmov.f32 s10, s5 +; CHECK-NEXT: vmov.f32 s11, s0 ; CHECK-NEXT: vstrh.16 q2, [r1] ; CHECK-NEXT: bx lr entry: @@ -637,19 +637,19 @@ define void @vst2_v4f64(ptr %src, ptr %dst) { ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vmov.f64 d8, d2 ; CHECK-NEXT: vmov.f64 d9, d0 -; CHECK-NEXT: vmov.f64 d0, d5 +; CHECK-NEXT: vmov.f64 d0, d3 ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: vmov.f64 d3, d4 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f64 d4, d6 -; CHECK-NEXT: vmov.f64 d2, d7 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vmov.f64 d2, d6 +; CHECK-NEXT: vmov.f64 d4, d7 +; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index d80dd5a673e20f..f680513d85af48 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -8,18 +8,19 @@ define void @vst3_v2i32(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: str r2, [r1, #16] +; CHECK-NEXT: add.w r4, r0, #8 +; CHECK-NEXT: ldrd r12, lr, [r0] +; CHECK-NEXT: ldr r0, [r0, #20] +; CHECK-NEXT: ldm r4, {r2, r3, r4} ; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 ; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.f32 s10, s0 +; CHECK-NEXT: str r3, [r1, 
#16] +; CHECK-NEXT: vmov q1[2], q1[0], r12, r2 +; CHECK-NEXT: str r0, [r1, #20] +; CHECK-NEXT: vmov q1[3], q1[1], lr, r3 ; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: str r0, [r1, #20] -; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s5 ; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: pop {r4, pc} @@ -41,23 +42,23 @@ define void @vst3_v4i32(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.f32 s4, s8 +; CHECK-NEXT: vmov.f32 s12, s4 ; CHECK-NEXT: vmov r0, r2, d0 -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vmov.f32 s7, s9 -; CHECK-NEXT: vmov.f32 s16, s13 -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s13, s8 +; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov.f32 s19, s10 ; CHECK-NEXT: vmov.f32 s0, s2 ; CHECK-NEXT: vmov.32 q4[1], r2 -; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vmov.f32 s1, s7 ; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vmov.f32 s2, s15 +; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -141,101 +142,101 @@ define void @vst3_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #160 ; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q3, [r0, #160] +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vstrw.32 q3, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vmov r12, r3, d10 +; CHECK-NEXT: vldrw.u32 q1, [r0, #128] +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: vstrw.32 q2, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #144] ; CHECK-NEXT: vldrw.u32 q7, [r0, #176] -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #96] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vldrw.u32 q4, [r0, #112] -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vmov.f32 s20, s22 -; CHECK-NEXT: vmov.f32 s22, s3 +; CHECK-NEXT: vldrw.u32 q6, [r0, #112] +; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vmov r12, r3, d2 +; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f32 s4, s6 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s5, s23 +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill +; CHECK-NEXT: 
vmov.f32 s12, s1 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vmov.f32 s15, s2 +; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vmov.f32 s0, s30 -; CHECK-NEXT: vmov.f32 s1, s15 -; CHECK-NEXT: vmov.f32 s2, s19 +; CHECK-NEXT: vmov.f32 s1, s19 +; CHECK-NEXT: vmov.f32 s2, s27 ; CHECK-NEXT: vmov.f32 s3, s31 ; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: vmov.f32 s0, s17 -; CHECK-NEXT: vstrw.32 q2, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s3, s18 -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s4, s20 +; CHECK-NEXT: vmov.f32 s7, s21 +; CHECK-NEXT: vmov.f32 s0, s25 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s3, s26 +; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vmov.f32 s23, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vmov.f64 d14, d4 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s5, s27 -; CHECK-NEXT: vmov.f32 s8, s24 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s24, s1 -; CHECK-NEXT: vmov.f32 s27, s2 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s13, s23 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s13, s24 +; CHECK-NEXT: vmov.f32 s24, s20 +; CHECK-NEXT: vmov.f32 s27, s21 +; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s12, s16 +; CHECK-NEXT: vmov.f64 d14, d10 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s19, s6 +; CHECK-NEXT: vmov.f32 s25, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vmov.f32 s3, s23 +; CHECK-NEXT: vmov.f32 s0, s22 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vstrw.32 q0, [r1, #80] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov r0, r3, d14 ; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s20, s8 ; CHECK-NEXT: vstrw.32 q0, [r1, #128] -; CHECK-NEXT: vmov.f32 s11, s25 +; CHECK-NEXT: vmov.f32 s21, s4 ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s20, s12 -; CHECK-NEXT: vmov.32 q6[1], r3 -; CHECK-NEXT: vmov.f32 s12, s4 -; CHECK-NEXT: vstrw.32 q6, [r1, #64] -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vmov.32 q5[2], r0 ; CHECK-NEXT: vmov r0, lr, d14 -; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload ; 
CHECK-NEXT: vmov.32 q0[1], lr -; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1, #160] ; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov r2, r4, d14 -; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s15, s17 +; CHECK-NEXT: vstrw.32 q5, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1, #176] ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r2, r4, d14 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.f32 s11, s6 ; CHECK-NEXT: vmov.32 q4[1], r4 +; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vstrw.32 q1, [r1, #80] -; CHECK-NEXT: vstrw.32 q3, [r1, #96] +; CHECK-NEXT: vstrw.32 q2, [r1, #64] +; CHECK-NEXT: vstrw.32 q6, [r1, #96] ; CHECK-NEXT: vstrw.32 q4, [r1, #112] -; CHECK-NEXT: vstrw.32 q5, [r1, #144] +; CHECK-NEXT: vstrw.32 q3, [r1, #144] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -296,28 +297,28 @@ define void @vst3_v4i16(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q1, [r0] ; CHECK-NEXT: vldrh.u32 q0, [r0, #8] -; CHECK-NEXT: vldrh.u32 q2, [r0, #16] -; CHECK-NEXT: vmov r0, r5, d2 -; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vldrh.u32 q1, [r0] ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov lr, r4, d1 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vldrh.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r0, r5, d2 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vmov r12, s6 ; CHECK-NEXT: vmov.32 q1[2], r4 -; CHECK-NEXT: vmov r0, r4, d4 +; CHECK-NEXT: vmov r0, r4, d0 ; CHECK-NEXT: vstrh.32 q1, [r1, #16] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.16 q0[3], r5 -; CHECK-NEXT: vmov.16 q0[4], r3 -; CHECK-NEXT: vmov.16 q0[5], r4 -; CHECK-NEXT: vmov.16 q0[6], r12 -; CHECK-NEXT: vmov.16 q0[7], lr -; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov.16 q2[3], r5 +; CHECK-NEXT: vmov.16 q2[4], r3 +; CHECK-NEXT: vmov.16 q2[5], r4 +; CHECK-NEXT: vmov.16 q2[6], r12 +; CHECK-NEXT: vmov.16 q2[7], lr +; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %l1 = load <4 x i16>, ptr %src, align 4 @@ -403,113 +404,109 @@ define void @vst3_v16i16(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q7, [r0, #80] -; CHECK-NEXT: vmov.f32 s0, s11 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vins.f16 s0, s7 -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.f64 d12, d4 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s26, s10 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vmov.f32 s13, s0 -; 
CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s12, s0 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.f32 s14, s11 -; CHECK-NEXT: vins.f16 s14, s0 -; CHECK-NEXT: vmov.f32 s20, s7 -; CHECK-NEXT: vmov q0, q3 -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vins.f16 s20, s15 +; CHECK-NEXT: vldrw.u32 q7, [r0, #80] +; CHECK-NEXT: vmov.f32 s4, s15 +; CHECK-NEXT: vmov.u16 r2, q0[5] ; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.f32 s17, s20 -; CHECK-NEXT: vmovx.f16 s20, s31 +; CHECK-NEXT: vins.f16 s4, s3 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmovx.f16 s7, s30 -; CHECK-NEXT: vins.f16 s16, s7 -; CHECK-NEXT: vmovx.f16 s7, s18 -; CHECK-NEXT: vins.f16 s31, s7 -; CHECK-NEXT: vmovx.f16 s7, s11 -; CHECK-NEXT: vins.f16 s3, s7 -; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vins.f16 s16, s4 +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmovx.f16 s4, s15 +; CHECK-NEXT: vmov.f64 d12, d6 +; CHECK-NEXT: vins.f16 s18, s4 +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s11, s11 +; CHECK-NEXT: vmov.u16 r2, q1[5] +; CHECK-NEXT: vins.f16 s19, s11 +; CHECK-NEXT: vmov.f32 s26, s14 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.f32 s20, s15 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vins.f16 s20, s7 +; CHECK-NEXT: vmov.f32 s11, s25 +; CHECK-NEXT: vmov.f32 s1, s20 +; CHECK-NEXT: vmovx.f16 s20, s31 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.f32 s2, s15 +; CHECK-NEXT: vmovx.f16 s15, s30 +; CHECK-NEXT: vins.f16 s0, s15 +; CHECK-NEXT: vmovx.f16 s15, s2 +; CHECK-NEXT: vins.f16 s31, s15 +; CHECK-NEXT: vins.f16 s3, s20 +; CHECK-NEXT: vmov.f32 s2, s31 +; CHECK-NEXT: vmovx.f16 s15, s8 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s20, s24 -; CHECK-NEXT: vmovx.f16 s11, s8 -; CHECK-NEXT: vmov.f32 s7, s25 -; CHECK-NEXT: vins.f16 s20, s0 ; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vins.f16 s7, s1 +; CHECK-NEXT: vins.f16 s20, s0 ; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.f32 s23, s7 -; CHECK-NEXT: vmovx.f16 s7, s24 -; CHECK-NEXT: vmov.f32 s24, s4 -; CHECK-NEXT: vins.f16 s8, s7 -; CHECK-NEXT: vins.f16 s24, s12 +; CHECK-NEXT: vins.f16 s11, s1 +; CHECK-NEXT: vmov.f32 s23, s11 +; CHECK-NEXT: vmovx.f16 s11, s24 +; CHECK-NEXT: vmov.f32 s24, s12 +; CHECK-NEXT: vins.f16 s8, s11 ; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vins.f16 s24, s4 +; CHECK-NEXT: vmov.f32 s8, s13 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.f32 s1, s25 +; CHECK-NEXT: vins.f16 s8, s5 +; CHECK-NEXT: vmov.f32 s2, s26 ; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vins.f16 s8, s13 -; CHECK-NEXT: vmovx.f16 s4, s4 ; CHECK-NEXT: vmov.f32 s27, s8 -; CHECK-NEXT: vmovx.f16 s8, s28 -; CHECK-NEXT: vins.f16 s28, s4 -; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vins.f16 s4, s14 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vins.f16 s26, s8 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmovx.f16 s4, s29 -; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: 
vmovx.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vins.f16 s30, s4 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vins.f16 s29, s0 -; CHECK-NEXT: vmov.f32 s0, s29 -; CHECK-NEXT: vins.f16 s22, s11 -; CHECK-NEXT: vmov.f32 s3, s30 -; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vstrw.32 q0, [r1, #64] -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s31 +; CHECK-NEXT: vmovx.f16 s8, s12 +; CHECK-NEXT: vmovx.f16 s12, s28 +; CHECK-NEXT: vins.f16 s28, s8 +; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vins.f16 s8, s6 ; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.f32 s25, s28 ; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmovx.f16 s4, s29 +; CHECK-NEXT: vmovx.f16 s8, s29 +; CHECK-NEXT: vins.f16 s22, s15 +; CHECK-NEXT: vmovx.f16 s4, s13 ; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmovx.f16 s8, s30 -; CHECK-NEXT: vins.f16 s9, s4 +; CHECK-NEXT: vmovx.f16 s8, s14 +; CHECK-NEXT: vins.f16 s26, s12 +; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s30, s8 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vins.f16 s8, s14 +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vins.f16 s29, s4 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s9, s8 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vins.f16 s10, s8 -; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmov.f32 s25, s28 +; CHECK-NEXT: vins.f16 s13, s12 +; CHECK-NEXT: vmov.f32 s4, s29 +; CHECK-NEXT: vmov.f32 s7, s30 ; CHECK-NEXT: vstrw.32 q6, [r1, #48] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q4, [r1, #80] -; CHECK-NEXT: vstrw.32 q1, [r1, #32] -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vmov.f32 s12, s9 +; CHECK-NEXT: vstrw.32 q1, [r1, #64] +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vstrw.32 q0, [r1, #80] +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -905,28 +902,28 @@ define void @vst3_v4i64(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f64 d6, d15 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov.f64 d15, d13 -; CHECK-NEXT: vmov.f64 d7, d1 -; CHECK-NEXT: vmov.f64 d10, d2 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f64 d11, d12 -; CHECK-NEXT: vmov.f64 d2, d8 -; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f64 d1, d5 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.f64 d8, d15 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.f64 d12, d5 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vmov.f64 d5, d9 +; CHECK-NEXT: vmov.f64 d13, d1 +; CHECK-NEXT: vmov.f64 d14, d6 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: 
vmov.f64 d15, d8 +; CHECK-NEXT: vmov.f64 d6, d10 +; CHECK-NEXT: vstrw.32 q7, [r1] +; CHECK-NEXT: vmov.f64 d1, d3 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f64 d10, d5 ; CHECK-NEXT: vstrw.32 q0, [r1, #64] -; CHECK-NEXT: vmov.f64 d12, d4 -; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.f64 d13, d14 -; CHECK-NEXT: vstrw.32 q6, [r1, #48] +; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vmov.f64 d9, d4 +; CHECK-NEXT: vstrw.32 q4, [r1, #48] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -975,22 +972,22 @@ define void @vst3_v4f32(ptr %src, ptr %dst) { ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s13, s5 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vmov.f32 s14, s10 +; CHECK-NEXT: vmov.f32 s13, s9 +; CHECK-NEXT: vmov.f32 s18, s8 +; CHECK-NEXT: vmov.f32 s8, s10 +; CHECK-NEXT: vmov.f32 s14, s6 ; CHECK-NEXT: vmov.f32 s15, s2 -; CHECK-NEXT: vmov.f32 s16, s8 +; CHECK-NEXT: vmov.f32 s16, s4 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] ; CHECK-NEXT: vmov.f32 s17, s0 -; CHECK-NEXT: vmov.f32 s19, s9 -; CHECK-NEXT: vmov.f32 s5, s11 +; CHECK-NEXT: vmov.f32 s19, s5 +; CHECK-NEXT: vmov.f32 s9, s7 ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -1011,46 +1008,50 @@ define void @vst3_v8f32(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s1, s15 -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vldrw.u32 q7, [r0, #64] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s0, s12 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vmov.f32 s2, s24 -; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s20, s4 -; CHECK-NEXT: vmov.f32 s23, s5 -; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: vmov.f32 s12, s9 -; CHECK-NEXT: vmov.f32 s15, s10 -; CHECK-NEXT: vmov.f32 s13, s25 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vstrw.32 q3, [r1, #64] -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s22, s28 -; CHECK-NEXT: vmov.f32 s8, s30 -; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f32 s10, s19 -; CHECK-NEXT: vmov.f32 s11, s31 -; CHECK-NEXT: vmov.f32 s5, s29 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vmov.f32 s4, s17 -; CHECK-NEXT: vmov.f32 s7, s18 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: 
vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vmov.f64 d15, d3 +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s29, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s3 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s26, s19 +; CHECK-NEXT: vmov.f32 s5, s16 +; CHECK-NEXT: vmov.f32 s0, s17 +; CHECK-NEXT: vmov.f32 s3, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmov.f32 s21, s17 +; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: vmov.f32 s24, s22 +; CHECK-NEXT: vmov.f32 s27, s23 +; CHECK-NEXT: vmov.f64 d11, d9 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f32 s12, s8 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s6, s20 +; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov.f32 s16, s18 +; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vmov.f32 s17, s11 +; CHECK-NEXT: vmov.f32 s18, s31 +; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s9, s21 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov.f32 s8, s29 +; CHECK-NEXT: vmov.f32 s11, s30 +; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1073,95 +1074,95 @@ define void @vst3_v16f32(ptr %src, ptr %dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #128 ; CHECK-NEXT: sub sp, #128 -; CHECK-NEXT: vldrw.u32 q3, [r0, #176] -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #176] +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vldrw.u32 q7, [r0, #64] ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: vldrw.u32 q0, [r0, #128] -; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #160] -; CHECK-NEXT: vmov.f32 s24, s9 -; CHECK-NEXT: vldrw.u32 q5, [r0, #144] -; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #96] -; CHECK-NEXT: vmov.f32 s26, s6 -; CHECK-NEXT: vldrw.u32 q7, [r0, #112] -; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.f32 s27, s10 -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] ; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s25, s1 -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #112] ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vstrw.32 q6, [r1, #16] -; CHECK-NEXT: vmov.f32 s24, s2 +; CHECK-NEXT: vmov.f32 s16, s29 +; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vmov.f32 s18, s6 ; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s27, s3 +; CHECK-NEXT: vmov.f32 s19, s30 +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vldrw.u32 q6, [r0, #144] +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f32 s16, s2 +; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 
16-byte Reload ; CHECK-NEXT: vmov.f32 s12, s4 ; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s13, s8 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s13, s28 ; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s25, s7 -; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s14, s0 +; CHECK-NEXT: vmov.f32 s12, s20 +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vmov.f32 s15, s21 +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s13, s1 ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov.f32 s4, s16 -; CHECK-NEXT: vmov.f32 s5, s28 -; CHECK-NEXT: vmov.f32 s7, s17 -; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s2, s31 +; CHECK-NEXT: vmov.f32 s1, s23 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s6 ; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s11 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s15, s30 -; CHECK-NEXT: vstrw.32 q6, [r1, #32] -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vldrw.u32 q6, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s31 +; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov.f32 s21, s1 +; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s30, s0 +; CHECK-NEXT: vstrw.32 q3, [r1, #160] ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov.f32 s1, s11 -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s20, s5 +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vmov.f32 s23, s6 ; CHECK-NEXT: vmov.f32 s28, s8 +; CHECK-NEXT: vstrw.32 q5, [r1, #112] +; CHECK-NEXT: vmov.f32 s29, s4 ; CHECK-NEXT: vmov.f32 s31, s9 +; CHECK-NEXT: vmov.f32 s1, s11 ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s12, s29 -; CHECK-NEXT: vmov.f32 s29, s4 -; CHECK-NEXT: vstrw.32 q3, [r1, #160] -; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s0, s8 +; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vstrw.32 q7, [r1, #96] -; CHECK-NEXT: vmov.f32 s19, s6 -; CHECK-NEXT: vmov.f32 s4, s8 -; CHECK-NEXT: vstrw.32 q4, [r1, #112] -; CHECK-NEXT: vmov.f32 s6, s20 -; CHECK-NEXT: vmov.f32 s20, s22 -; CHECK-NEXT: vmov.f32 s5, s0 -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov.f32 s22, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vmov.f32 s2, s24 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #48] +; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s9, s25 ; CHECK-NEXT: vstrw.32 q0, [r1, #128] ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s9, s21 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vmov.f32 s24, s26 ; CHECK-NEXT: vstrw.32 q0, [r1, #144] -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s21, s27 -; CHECK-NEXT: vstrw.32 
q2, [r1, #64] +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s8, s5 ; CHECK-NEXT: vstrw.32 q0, [r1, #176] ; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [r1, #80] +; CHECK-NEXT: vmov.f32 s11, s6 +; CHECK-NEXT: vmov.f32 s25, s19 +; CHECK-NEXT: vstrw.32 q2, [r1, #64] +; CHECK-NEXT: vmov.f32 s26, s7 ; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q6, [r1, #80] ; CHECK-NEXT: add sp, #128 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -1217,22 +1218,23 @@ define void @vst3_v4f16(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r3 +; CHECK-NEXT: add.w r4, r0, #8 +; CHECK-NEXT: ldrd r12, lr, [r0] +; CHECK-NEXT: ldr r0, [r0, #20] +; CHECK-NEXT: ldm r4, {r2, r3, r4} ; CHECK-NEXT: vmov.32 q1[0], r4 -; CHECK-NEXT: vmov q0[3], q0[1], r12, r2 ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmovx.f16 s9, s3 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r2 +; CHECK-NEXT: vmov q0[3], q0[1], lr, r3 ; CHECK-NEXT: vmovx.f16 s6, s0 ; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s8, s4 ; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s6, s1 ; CHECK-NEXT: vins.f16 s2, s8 ; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vmovx.f16 s9, s3 ; CHECK-NEXT: vins.f16 s5, s6 ; CHECK-NEXT: vins.f16 s9, s8 ; CHECK-NEXT: vmov.f32 s8, s5 @@ -1263,55 +1265,56 @@ define void @vst3_v8f16(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vmov.f32 s4, s15 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov.f32 s8, s7 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vins.f16 s4, s19 +; CHECK-NEXT: vins.f16 s8, s15 ; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmovx.f16 s10, s16 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmovx.f16 s4, s19 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmovx.f16 s8, s15 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmovx.f16 s2, s15 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vins.f16 s3, s8 -; CHECK-NEXT: vmov.f32 s8, s12 -; CHECK-NEXT: vins.f16 s8, s16 -; CHECK-NEXT: vins.f16 s7, s2 -; CHECK-NEXT: vmov.f32 s2, s13 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vins.f16 s2, s17 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmovx.f16 s2, s12 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vins.f16 s4, s2 -; CHECK-NEXT: vins.f16 s10, s12 -; CHECK-NEXT: vmovx.f16 s12, s17 -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vins.f16 s2, s18 -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmovx.f16 s2, s13 -; CHECK-NEXT: vins.f16 s5, s2 -; CHECK-NEXT: vmovx.f16 s2, s14 -; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vins.f16 s16, s12 ; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: 
vmov.f32 s16, s5 +; CHECK-NEXT: vmovx.f16 s7, s10 +; CHECK-NEXT: vmovx.f16 s12, s12 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vmovx.f16 s7, s11 +; CHECK-NEXT: vins.f16 s11, s2 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vins.f16 s2, s13 +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vins.f16 s3, s7 +; CHECK-NEXT: vmov.f32 s19, s2 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s8 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vins.f16 s18, s4 +; CHECK-NEXT: vmovx.f16 s4, s13 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vins.f16 s2, s14 +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmovx.f16 s4, s9 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s9, s2 +; CHECK-NEXT: vmovx.f16 s2, s6 +; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmov.f32 s17, s8 +; CHECK-NEXT: vins.f16 s13, s4 +; CHECK-NEXT: vmov.f32 s12, s9 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s19, s6 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vstrw.32 q3, [r1, #16] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -1334,125 +1337,119 @@ define void @vst3_v16f16(ptr %src, ptr %dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #48 ; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q7, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] ; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vmov.f32 s8, s12 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmovx.f16 s2, s28 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s8, s28 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vins.f16 s0, s5 +; CHECK-NEXT: vins.f16 s0, s29 ; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmovx.f16 s2, s16 ; CHECK-NEXT: vmov.f32 s11, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmov.f32 s12, s8 -; CHECK-NEXT: vmov.f64 d11, d9 -; CHECK-NEXT: vmov.f32 s21, s17 -; CHECK-NEXT: vmov.f64 d7, d5 -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vmovx.f16 s2, s8 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vins.f16 s14, s2 +; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmovx.f16 s0, s4 ; CHECK-NEXT: vmovx.f16 s2, s24 -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.f32 s4, s8 +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vins.f16 s16, s0 +; CHECK-NEXT: vmov.f32 s0, s9 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.f32 s16, s12 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vins.f16 s16, s24 -; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vins.f16 s4, s24 +; CHECK-NEXT: vmov.f64 d7, d3 ; CHECK-NEXT: vins.f16 s0, s25 -; CHECK-NEXT: vmov.f32 s19, s0 -; CHECK-NEXT: vmovx.f16 s0, s12 -; CHECK-NEXT: vmov.f64 d15, d13 -; CHECK-NEXT: vmov.f32 s17, s13 -; CHECK-NEXT: vmov.f32 s24, s16 -; CHECK-NEXT: vmov.f64 d13, d9 -; CHECK-NEXT: vmov.f64 d9, d7 -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmovx.f16 s2, s12 -; CHECK-NEXT: vins.f16 s12, s0 -; CHECK-NEXT: vins.f16 s26, s2 -; CHECK-NEXT: vmovx.f16 s2, s30 -; CHECK-NEXT: vmov.f32 s0, s19 
-; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s2, s20 +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s2, s26 +; CHECK-NEXT: vins.f16 s20, s0 +; CHECK-NEXT: vmov.f32 s0, s11 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vins.f16 s0, s31 -; CHECK-NEXT: vmov.f32 s29, s25 -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.f32 s25, s0 -; CHECK-NEXT: vmovx.f16 s0, s31 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d3, d5 +; CHECK-NEXT: vins.f16 s0, s27 +; CHECK-NEXT: vmovx.f16 s2, s23 +; CHECK-NEXT: vmov.f32 s5, s9 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmovx.f16 s0, s27 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmovx.f16 s2, s15 -; CHECK-NEXT: vins.f16 s24, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vins.f16 s15, s0 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vmov.f32 s4, s23 -; CHECK-NEXT: vins.f16 s27, s2 +; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vins.f16 s23, s0 +; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vins.f16 s11, s2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vins.f16 s4, s7 +; CHECK-NEXT: vins.f16 s10, s31 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vmov.f64 d3, d7 +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmovx.f16 s10, s31 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmovx.f16 s10, s18 ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vmovx.f16 s4, s11 -; CHECK-NEXT: vmovx.f16 s2, s23 -; CHECK-NEXT: vins.f16 s3, s4 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vins.f16 s11, s2 -; CHECK-NEXT: vmov.f32 s2, s22 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.f32 s29, s12 -; CHECK-NEXT: vmovx.f16 s4, s21 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vins.f16 s9, s4 -; CHECK-NEXT: vmovx.f16 s4, s22 -; CHECK-NEXT: vins.f16 s10, s4 -; CHECK-NEXT: vmov.f32 s21, s17 -; CHECK-NEXT: vmov.f32 s22, s18 -; CHECK-NEXT: vins.f16 s5, s12 -; CHECK-NEXT: vmov.f32 s4, s18 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmovx.f16 s12, s17 -; CHECK-NEXT: vins.f16 s4, s18 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s13 +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vmovx.f16 s2, s15 +; CHECK-NEXT: vins.f16 s0, s10 +; CHECK-NEXT: vmovx.f16 s10, s19 +; CHECK-NEXT: vins.f16 s19, s2 +; CHECK-NEXT: vmov.f32 s2, s14 +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s13, s20 +; CHECK-NEXT: vmovx.f16 s7, s6 +; CHECK-NEXT: vins.f16 s3, s10 +; CHECK-NEXT: vmovx.f16 s10, s29 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s13, s5 +; CHECK-NEXT: vins.f16 s18, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: vins.f16 s17, s13 +; CHECK-NEXT: 
vmovx.f16 s13, s25 +; CHECK-NEXT: vmov.f32 s7, s6 +; CHECK-NEXT: vins.f16 s2, s30 ; CHECK-NEXT: vmov.16 q7[2], r0 -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmov.f32 s30, s4 -; CHECK-NEXT: vmovx.f16 s4, s21 -; CHECK-NEXT: vins.f16 s13, s4 -; CHECK-NEXT: vmovx.f16 s4, s22 -; CHECK-NEXT: vins.f16 s14, s4 -; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vins.f16 s7, s26 +; CHECK-NEXT: vmov.16 q6[2], r0 +; CHECK-NEXT: vmov.f32 s26, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s30, s2 +; CHECK-NEXT: vins.f16 s29, s20 +; CHECK-NEXT: vmov.f32 s2, s19 +; CHECK-NEXT: vmovx.f16 s7, s6 +; CHECK-NEXT: vmovx.f16 s13, s5 +; CHECK-NEXT: vmovx.f16 s20, s21 +; CHECK-NEXT: vins.f16 s22, s7 +; CHECK-NEXT: vins.f16 s21, s13 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1, #80] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vins.f16 s29, s12 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vstrw.32 q6, [r1, #32] -; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vstrw.32 q5, [r1, #48] -; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmov.f32 s10, s23 +; CHECK-NEXT: vmov.f32 s5, s16 +; CHECK-NEXT: vins.f16 s25, s20 +; CHECK-NEXT: vmov.f32 s28, s17 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] +; CHECK-NEXT: vmov.f32 s31, s18 +; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vmov.f32 s24, s21 +; CHECK-NEXT: vstrw.32 q7, [r1, #64] +; CHECK-NEXT: vmov.f32 s27, s22 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov.f32 s28, s13 -; CHECK-NEXT: vstrw.32 q1, [r1, #64] -; CHECK-NEXT: vmov.f32 s31, s14 -; CHECK-NEXT: vstrw.32 q7, [r1, #16] +; CHECK-NEXT: vstrw.32 q6, [r1, #16] ; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -1503,28 +1500,28 @@ define void @vst3_v4f64(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f64 d6, d15 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov.f64 d15, d13 -; CHECK-NEXT: vmov.f64 d7, d1 -; CHECK-NEXT: vmov.f64 d10, d2 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f64 d11, d12 -; CHECK-NEXT: vmov.f64 d2, d8 -; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f64 d1, d5 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.f64 d8, d15 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.f64 d12, d5 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vmov.f64 d5, d9 +; CHECK-NEXT: vmov.f64 d13, d1 +; CHECK-NEXT: vmov.f64 d14, d6 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f64 d15, d8 +; CHECK-NEXT: vmov.f64 d6, d10 +; CHECK-NEXT: vstrw.32 q7, [r1] +; CHECK-NEXT: vmov.f64 d1, d3 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f64 d10, d5 ; CHECK-NEXT: vstrw.32 q0, [r1, #64] -; CHECK-NEXT: vmov.f64 d12, d4 -; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.f64 d13, d14 -; CHECK-NEXT: vstrw.32 q6, [r1, #48] +; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vmov.f64 d9, d4 +; CHECK-NEXT: vstrw.32 q4, [r1, #48] ; CHECK-NEXT: vpop {d8, d9, 
d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll index 869c9cb7afce8a..fab8c311794d3c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll @@ -102,19 +102,19 @@ define ptr @vst4_v2i64(ptr %src, ptr %dst) { ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d2, d6 -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f64 d0, d7 -; CHECK-NEXT: vmov.f64 d7, d4 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vmov.f64 d9, d0 +; CHECK-NEXT: vmov.f64 d0, d3 +; CHECK-NEXT: vmov.f64 d3, d4 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.f64 d4, d9 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f64 d2, d6 +; CHECK-NEXT: vmov.f64 d4, d7 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vstrw.32 q1, [r1], #64 +; CHECK-NEXT: vstrw.32 q4, [r1], #64 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -204,19 +204,19 @@ define ptr @vst4_v2f64(ptr %src, ptr %dst) { ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d2, d6 -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f64 d0, d7 -; CHECK-NEXT: vmov.f64 d7, d4 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vmov.f64 d9, d0 +; CHECK-NEXT: vmov.f64 d0, d3 +; CHECK-NEXT: vmov.f64 d3, d4 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.f64 d4, d9 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f64 d2, d6 +; CHECK-NEXT: vmov.f64 d4, d7 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vstrw.32 q1, [r1], #64 +; CHECK-NEXT: vstrw.32 q4, [r1], #64 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index f3a65c40031af3..6e7bee7be20eb8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -8,23 +8,22 @@ define void @vst4_v2i32(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: add.w r6, r0, #16 -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldm r6, {r4, r5, r6} -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 +; CHECK-NEXT: add.w r6, r0, #8 +; CHECK-NEXT: ldrd r12, lr, [r0] ; CHECK-NEXT: ldr r0, [r0, #28] -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 +; CHECK-NEXT: ldm r6, {r2, r3, r4, r5, r6} ; CHECK-NEXT: vmov q0[2], q0[0], r4, r6 -; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r0 +; CHECK-NEXT: vmov.f32 s10, s0 +; CHECK-NEXT: vmov q1[2], q1[0], r12, r2 +; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vmov q1[3], q1[1], lr, r3 +; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov.f32 s9, s6 ; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, 
s2 ; CHECK-NEXT: vmov.f32 s6, s1 -; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -115,34 +114,37 @@ define void @vst4_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #192 -; CHECK-NEXT: sub sp, #192 +; CHECK-NEXT: .pad #256 +; CHECK-NEXT: sub sp, #256 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q4, [r0, #176] ; CHECK-NEXT: vldrw.u32 q3, [r0, #208] ; CHECK-NEXT: vldrw.u32 q2, [r0, #144] ; CHECK-NEXT: vldrw.u32 q1, [r0, #80] ; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [r0, #176] +; CHECK-NEXT: add r2, sp, #192 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #128] -; CHECK-NEXT: vldrw.u32 q5, [r0, #240] -; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vldrw.u32 q3, [r0, #192] +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: vldrw.u32 q5, [r0, #240] +; CHECK-NEXT: vldrw.u32 q7, [r0, #224] +; CHECK-NEXT: vldrw.u32 q6, [r0, #160] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vmov q7, q5 -; CHECK-NEXT: vldrw.u32 q3, [r0, #224] -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: vmov q6, q2 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: add r2, sp, #192 ; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #64 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: mov r0, r1 @@ -167,7 +169,7 @@ define void @vst4_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: add sp, #192 +; CHECK-NEXT: add sp, #256 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr @@ -193,25 +195,25 @@ define void @vst4_v4i32_align1(ptr %src, ptr %dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmov.f32 s22, s0 ; CHECK-NEXT: vmov.f32 s26, s3 -; CHECK-NEXT: vmov.f32 s12, s17 -; CHECK-NEXT: vmov.f32 s13, s9 -; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vmov.f32 s8, s17 +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vmov.f32 s11, s5 ; CHECK-NEXT: vmov.f32 s20, s16 -; CHECK-NEXT: vstrb.8 q3, [r1, #16] -; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vstrb.8 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s21, s12 ; CHECK-NEXT: vmov.f32 s23, s4 ; CHECK-NEXT: 
vmov.f32 s24, s19 ; CHECK-NEXT: vstrb.8 q5, [r1] -; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s25, s15 ; CHECK-NEXT: vmov.f32 s27, s7 ; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vstrb.8 q6, [r1, #48] -; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s1, s14 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrb.8 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} @@ -239,15 +241,15 @@ define void @vst4_v2i16(ptr %src, ptr %dst) { ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: ldrh r3, [r0, #2] -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: ldrh.w r12, [r0, #10] +; CHECK-NEXT: ldrh.w r12, [r0] +; CHECK-NEXT: ldrh r2, [r0, #10] ; CHECK-NEXT: ldrh.w lr, [r0, #4] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[2], q1[0], r12, r3 ; CHECK-NEXT: ldrh r4, [r0, #12] ; CHECK-NEXT: ldrh r5, [r0, #6] ; CHECK-NEXT: ldrh r6, [r0, #14] ; CHECK-NEXT: ldrh r0, [r0, #8] -; CHECK-NEXT: vmov q0[2], q0[0], r0, r12 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.16 q1[0], r0 ; CHECK-NEXT: vmov r0, s0 @@ -256,7 +258,7 @@ define void @vst4_v2i16(ptr %src, ptr %dst) { ; CHECK-NEXT: vmov.16 q1[3], r4 ; CHECK-NEXT: vmov.16 q1[4], r3 ; CHECK-NEXT: vmov.16 q1[5], r5 -; CHECK-NEXT: vmov.16 q1[6], r12 +; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov.16 q1[7], r6 ; CHECK-NEXT: vstrh.16 q1, [r1] ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -278,42 +280,45 @@ entry: define void @vst4_v4i16(ptr %src, ptr %dst) { ; CHECK-LABEL: vst4_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrh.u32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q3, [r0, #8] +; CHECK-NEXT: vldrh.u32 q2, [r0, #8] ; CHECK-NEXT: vldrh.u32 q1, [r0, #16] -; CHECK-NEXT: vldrh.u32 q4, [r0, #24] -; CHECK-NEXT: vmov r3, r4, d1 -; CHECK-NEXT: vmov r5, r12, d0 -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmov.16 q0[0], r5 -; CHECK-NEXT: vmov r0, r5, d7 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r2, lr, d3 -; CHECK-NEXT: vmov r0, r3, d9 -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, r6, d8 -; CHECK-NEXT: vmov.16 q2[4], r4 -; CHECK-NEXT: vmov.16 q2[5], r5 -; CHECK-NEXT: vmov r4, r5, d6 -; CHECK-NEXT: vmov.16 q2[6], lr +; CHECK-NEXT: vldrh.u32 q3, [r0, #24] +; CHECK-NEXT: vmov r3, lr, d1 +; CHECK-NEXT: vmov r4, r7, d0 +; CHECK-NEXT: vmov.16 q4[0], r3 +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: vmov r0, r4, d5 +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov r2, r12, d3 +; CHECK-NEXT: vmov r0, r3, d7 +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov r0, r2, d6 +; CHECK-NEXT: vmov.16 q4[4], lr +; CHECK-NEXT: vmov.16 q4[5], r4 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmov.16 q4[6], r12 ; CHECK-NEXT: vmov.16 q0[1], r4 -; CHECK-NEXT: vmov.16 q2[7], r3 -; CHECK-NEXT: vmov r3, r2, d2 +; CHECK-NEXT: vmov.16 q4[7], r3 +; CHECK-NEXT: vmov r3, r6, d2 ; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: vstrh.16 q2, [r1, #16] +; CHECK-NEXT: vstrh.16 q4, [r1, #16] ; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.16 q0[4], r12 +; CHECK-NEXT: vmov.16 q0[4], r7 ; CHECK-NEXT: vmov.16 q0[5], r5 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r6 +; CHECK-NEXT: vmov.16 q0[6], r6 +; CHECK-NEXT: vmov.16 q0[7], r2 
; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %l1 = load <4 x i16>, ptr %src, align 4 %s2 = getelementptr <4 x i16>, ptr %src, i32 1 @@ -521,31 +526,31 @@ define void @vst4_v4i8(ptr %src, ptr %dst) { ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrb.u32 q2, [r0] -; CHECK-NEXT: vldrb.u32 q3, [r0, #4] -; CHECK-NEXT: vldrb.u32 q1, [r0, #8] -; CHECK-NEXT: vldrb.u32 q4, [r0, #12] -; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vldrb.u32 q3, [r0] +; CHECK-NEXT: vldrb.u32 q2, [r0, #4] +; CHECK-NEXT: vldrb.u32 q4, [r0, #8] +; CHECK-NEXT: vldrb.u32 q1, [r0, #12] +; CHECK-NEXT: vmov r4, r5, d6 ; CHECK-NEXT: vmov.8 q0[0], r4 -; CHECK-NEXT: vmov r2, lr, d6 -; CHECK-NEXT: vmov.8 q0[1], r2 -; CHECK-NEXT: vmov r0, r4, d2 -; CHECK-NEXT: vmov r3, r12, d8 +; CHECK-NEXT: vmov r3, lr, d4 +; CHECK-NEXT: vmov r0, r4, d8 +; CHECK-NEXT: vmov.8 q0[1], r3 ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.8 q0[3], r3 -; CHECK-NEXT: vmov r2, r7, d9 +; CHECK-NEXT: vmov r2, r12, d2 +; CHECK-NEXT: vmov.8 q0[3], r2 +; CHECK-NEXT: vmov r0, r7, d3 ; CHECK-NEXT: vmov.8 q0[4], r5 -; CHECK-NEXT: vmov r3, r5, d7 +; CHECK-NEXT: vmov r2, r5, d5 ; CHECK-NEXT: vmov.8 q0[5], lr ; CHECK-NEXT: vmov.8 q0[6], r4 -; CHECK-NEXT: vmov r4, r0, d5 +; CHECK-NEXT: vmov r4, r3, d7 ; CHECK-NEXT: vmov.8 q0[7], r12 ; CHECK-NEXT: vmov.8 q0[8], r4 -; CHECK-NEXT: vmov r4, r6, d3 -; CHECK-NEXT: vmov.8 q0[9], r3 +; CHECK-NEXT: vmov r4, r6, d9 +; CHECK-NEXT: vmov.8 q0[9], r2 ; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: vmov.8 q0[11], r2 -; CHECK-NEXT: vmov.8 q0[12], r0 +; CHECK-NEXT: vmov.8 q0[11], r0 +; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r5 ; CHECK-NEXT: vmov.8 q0[14], r6 ; CHECK-NEXT: vmov.8 q0[15], r7 @@ -695,19 +700,19 @@ define void @vst4_v2i64(ptr %src, ptr %dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vmov.f64 d9, d0 -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.f64 d11, d2 +; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vmov.f64 d11, d4 ; CHECK-NEXT: vstrw.32 q4, [r1, #16] ; CHECK-NEXT: vmov.f64 d10, d6 -; CHECK-NEXT: vmov.f64 d0, d5 +; CHECK-NEXT: vmov.f64 d0, d3 ; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f64 d2, d7 +; CHECK-NEXT: vmov.f64 d4, d7 ; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vstrw.32 q2, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: @@ -730,45 +735,43 @@ define void @vst4_v4i64(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vldrw.u32 q7, [r0, #80] -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d15, d10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #112] +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] 
+; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d9, d0 +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vldrw.u32 q6, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #112] +; CHECK-NEXT: vmov.f64 d0, d15 +; CHECK-NEXT: vmov.f64 d8, d14 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d12 -; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d4 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d8, d10 +; CHECK-NEXT: vmov.f64 d9, d4 +; CHECK-NEXT: vmov.f64 d4, d11 +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d11, d12 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f64 d10, d6 ; CHECK-NEXT: vmov.f64 d15, d2 -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d4, d0 +; CHECK-NEXT: vstrw.32 q5, [r1, #64] +; CHECK-NEXT: vmov.f64 d14, d0 +; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [r1, #80] +; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d10, d13 -; CHECK-NEXT: vmov.f64 d2, d5 -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vmov.f64 d5, d6 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vmov.f64 d13, d8 -; CHECK-NEXT: vstrw.32 q2, [r1, #64] -; CHECK-NEXT: vmov.f64 d12, d0 -; CHECK-NEXT: vmov.f64 d8, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f64 d12, d7 +; CHECK-NEXT: vstrw.32 q1, [r1, #112] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d6, d15 -; CHECK-NEXT: vstrw.32 q4, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vstrw.32 q3, [r1, #96] -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: vstrw.32 q6, [r1, #96] +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -889,34 +892,37 @@ define void @vst4_v16f32(ptr %src, ptr %dst) { ; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #192 -; CHECK-NEXT: sub sp, #192 +; CHECK-NEXT: .pad #256 +; CHECK-NEXT: sub sp, #256 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q4, [r0, #176] ; CHECK-NEXT: vldrw.u32 q3, [r0, #208] ; CHECK-NEXT: vldrw.u32 q2, [r0, #144] ; CHECK-NEXT: vldrw.u32 q1, [r0, #80] ; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [r0, #176] +; CHECK-NEXT: add r2, sp, #192 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #128] -; CHECK-NEXT: vldrw.u32 q5, [r0, #240] -; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vldrw.u32 q3, [r0, #192] +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: vldrw.u32 q5, [r0, #240] +; CHECK-NEXT: vldrw.u32 q7, [r0, #224] +; CHECK-NEXT: vldrw.u32 q6, [r0, #160] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vmov q6, q4 ; 
CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vmov q7, q5 -; CHECK-NEXT: vldrw.u32 q3, [r0, #224] -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: vmov q6, q2 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: add r2, sp, #192 ; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #64 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: mov r0, r1 @@ -941,7 +947,7 @@ define void @vst4_v16f32(ptr %src, ptr %dst) { ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: add sp, #192 +; CHECK-NEXT: add sp, #256 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr @@ -967,25 +973,25 @@ define void @vst4_v4f32_align1(ptr %src, ptr %dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmov.f32 s22, s0 ; CHECK-NEXT: vmov.f32 s26, s3 -; CHECK-NEXT: vmov.f32 s12, s17 -; CHECK-NEXT: vmov.f32 s13, s9 -; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vmov.f32 s8, s17 +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vmov.f32 s11, s5 ; CHECK-NEXT: vmov.f32 s20, s16 -; CHECK-NEXT: vstrb.8 q3, [r1, #16] -; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vstrb.8 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s21, s12 ; CHECK-NEXT: vmov.f32 s23, s4 ; CHECK-NEXT: vmov.f32 s24, s19 ; CHECK-NEXT: vstrb.8 q5, [r1] -; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s25, s15 ; CHECK-NEXT: vmov.f32 s27, s7 ; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vstrb.8 q6, [r1, #48] -; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s1, s14 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrb.8 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} @@ -1047,38 +1053,37 @@ define void @vst4_v4f16(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: add.w r6, r0, #16 -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldm r6, {r4, r5, r6} -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 +; CHECK-NEXT: add.w r6, r0, #8 +; CHECK-NEXT: ldrd r12, lr, [r0] ; CHECK-NEXT: ldr r0, [r0, #28] -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 +; CHECK-NEXT: ldm r6, {r2, r3, r4, r5, r6} ; CHECK-NEXT: vmov q0[2], q0[0], r4, r6 -; CHECK-NEXT: vmovx.f16 s10, s5 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r0 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov q3[2], q3[0], r12, r2 ; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmov q3[3], q3[1], lr, r3 +; CHECK-NEXT: vins.f16 s4, s2 ; CHECK-NEXT: vmovx.f16 s11, s1 -; CHECK-NEXT: vins.f16 s12, s2 ; CHECK-NEXT: vmovx.f16 s2, s3 ; CHECK-NEXT: vins.f16 s11, s2 -; CHECK-NEXT: vmovx.f16 s2, 
s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmovx.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s6, s14 ; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s6, s7 -; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vmovx.f16 s10, s13 +; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vmovx.f16 s6, s15 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmov.f32 s8, s13 ; CHECK-NEXT: vins.f16 s10, s6 ; CHECK-NEXT: vmov.f32 s9, s1 -; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s12 -; CHECK-NEXT: vstrh.16 q1, [r1] +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s15, s4 +; CHECK-NEXT: vstrh.16 q3, [r1] ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %l1 = load <4 x half>, ptr %src, align 4 @@ -1163,61 +1168,61 @@ entry: define void @vst4_v8f16_align1(ptr %src, ptr %dst) { ; CHECK-LABEL: vst4_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] ; CHECK-NEXT: vldrw.u32 q6, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmovx.f16 s2, s21 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmovx.f16 s6, s17 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s9 ; CHECK-NEXT: vmovx.f16 s12, s25 -; CHECK-NEXT: vmovx.f16 s19, s4 -; CHECK-NEXT: vins.f16 s2, s12 -; CHECK-NEXT: vmovx.f16 s12, s20 -; CHECK-NEXT: vins.f16 s19, s12 +; CHECK-NEXT: vmovx.f16 s15, s0 +; CHECK-NEXT: vins.f16 s6, s12 +; CHECK-NEXT: vmovx.f16 s12, s16 +; CHECK-NEXT: vins.f16 s15, s12 ; CHECK-NEXT: vmovx.f16 s12, s8 ; CHECK-NEXT: vmovx.f16 s14, s24 -; CHECK-NEXT: vmovx.f16 s15, s7 +; CHECK-NEXT: vmovx.f16 s23, s3 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vmovx.f16 s14, s23 -; CHECK-NEXT: vins.f16 s15, s14 -; CHECK-NEXT: vmovx.f16 s14, s11 -; CHECK-NEXT: vmovx.f16 s1, s27 -; CHECK-NEXT: vins.f16 s7, s23 -; CHECK-NEXT: vins.f16 s14, s1 -; CHECK-NEXT: vmovx.f16 s23, s6 -; CHECK-NEXT: vmovx.f16 s1, s22 -; CHECK-NEXT: vins.f16 s6, s22 -; CHECK-NEXT: vins.f16 s5, s21 -; CHECK-NEXT: vins.f16 s4, s20 -; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s22, s10 +; CHECK-NEXT: vmovx.f16 s14, s19 +; CHECK-NEXT: vins.f16 s23, s14 +; CHECK-NEXT: vmovx.f16 s22, s11 +; CHECK-NEXT: vmovx.f16 s14, s27 +; CHECK-NEXT: vins.f16 s3, s19 +; CHECK-NEXT: vins.f16 s22, s14 +; CHECK-NEXT: vmovx.f16 s19, s2 +; CHECK-NEXT: vmovx.f16 s14, s18 +; CHECK-NEXT: vins.f16 s2, s18 +; CHECK-NEXT: vins.f16 s1, s17 +; CHECK-NEXT: vins.f16 s0, s16 +; CHECK-NEXT: vins.f16 s11, s27 +; CHECK-NEXT: vins.f16 s19, s14 +; CHECK-NEXT: vmovx.f16 s18, s10 ; CHECK-NEXT: vins.f16 s10, s26 -; CHECK-NEXT: vmovx.f16 s1, s26 +; CHECK-NEXT: vmovx.f16 s14, s26 ; CHECK-NEXT: vins.f16 s9, s25 ; CHECK-NEXT: vins.f16 s8, s24 -; CHECK-NEXT: vins.f16 s11, s27 -; CHECK-NEXT: vmov q6, q1 -; CHECK-NEXT: vins.f16 s22, s1 -; CHECK-NEXT: vmov.f32 s1, s25 +; CHECK-NEXT: vmov q6, q0 +; CHECK-NEXT: vmov.f32 s5, s25 ; CHECK-NEXT: vmov q6, q2 -; CHECK-NEXT: vmov.f32 s3, s0 -; CHECK-NEXT: vmov.f32 s0, s9 +; CHECK-NEXT: vmov.f32 s7, s4 +; CHECK-NEXT: vins.f16 s18, s14 +; CHECK-NEXT: vmov.f32 s4, s9 
; CHECK-NEXT: vmov.f32 s26, s12 -; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s25, s4 -; CHECK-NEXT: vmov.f32 s27, s19 -; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vstrb.8 q1, [r1, #16] +; CHECK-NEXT: vmov.f32 s25, s0 +; CHECK-NEXT: vmov.f32 s27, s15 +; CHECK-NEXT: vmov.f32 s21, s3 ; CHECK-NEXT: vstrb.8 q6, [r1] -; CHECK-NEXT: vmov.f32 s12, s11 -; CHECK-NEXT: vmov.f32 s21, s6 -; CHECK-NEXT: vstrb.8 q3, [r1, #48] -; CHECK-NEXT: vmov.f32 s20, s10 -; CHECK-NEXT: vstrb.8 q5, [r1, #32] -; CHECK-NEXT: vpop {d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov.f32 s20, s11 +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vstrb.8 q5, [r1, #48] +; CHECK-NEXT: vmov.f32 s16, s10 +; CHECK-NEXT: vstrb.8 q4, [r1, #32] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %l1 = load <8 x half>, ptr %src, align 4 @@ -1242,19 +1247,19 @@ define void @vst4_v2f64(ptr %src, ptr %dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vmov.f64 d9, d0 -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.f64 d11, d2 +; CHECK-NEXT: vmov.f64 d8, d2 +; CHECK-NEXT: vmov.f64 d11, d4 ; CHECK-NEXT: vstrw.32 q4, [r1, #16] ; CHECK-NEXT: vmov.f64 d10, d6 -; CHECK-NEXT: vmov.f64 d0, d5 +; CHECK-NEXT: vmov.f64 d0, d3 ; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f64 d2, d7 +; CHECK-NEXT: vmov.f64 d4, d7 ; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vstrw.32 q2, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: @@ -1277,45 +1282,43 @@ define void @vst4_v4f64(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vldrw.u32 q7, [r0, #80] -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d15, d10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #112] +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d9, d0 +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vldrw.u32 q6, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #112] +; CHECK-NEXT: vmov.f64 d0, d15 +; CHECK-NEXT: vmov.f64 d8, d14 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d12 -; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d4 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d8, d10 +; CHECK-NEXT: vmov.f64 d9, d4 +; CHECK-NEXT: vmov.f64 d4, d11 +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d11, d12 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f64 d10, d6 ; CHECK-NEXT: vmov.f64 d15, d2 -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 
d4, d0 +; CHECK-NEXT: vstrw.32 q5, [r1, #64] +; CHECK-NEXT: vmov.f64 d14, d0 +; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [r1, #80] +; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d10, d13 -; CHECK-NEXT: vmov.f64 d2, d5 -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vmov.f64 d5, d6 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vmov.f64 d13, d8 -; CHECK-NEXT: vstrw.32 q2, [r1, #64] -; CHECK-NEXT: vmov.f64 d12, d0 -; CHECK-NEXT: vmov.f64 d8, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f64 d12, d7 +; CHECK-NEXT: vstrw.32 q1, [r1, #112] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d6, d15 -; CHECK-NEXT: vstrw.32 q4, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vstrw.32 q3, [r1, #96] -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: vstrw.32 q6, [r1, #96] +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/pr52817.ll b/llvm/test/CodeGen/Thumb2/pr52817.ll index 87615f0a1f7ef4..7ac0746a6e0508 100644 --- a/llvm/test/CodeGen/Thumb2/pr52817.ll +++ b/llvm/test/CodeGen/Thumb2/pr52817.ll @@ -39,6 +39,7 @@ define i32 @test(ptr %arg, ptr %arg1, ptr %arg2) #0 !dbg !6 { ; CHECK-NEXT: .loc 1 28 24 prologue_end @ test.cpp:28:24 ; CHECK-NEXT: strne.w r6, [r8] ; CHECK-NEXT: moveq r6, #1 +; CHECK-NEXT: @DEBUG_VALUE: test:this <- [DW_OP_LLVM_arg 0, DW_OP_plus_uconst 135168, DW_OP_LLVM_arg 1, DW_OP_constu 4, DW_OP_mul, DW_OP_plus, DW_OP_plus_uconst 4, DW_OP_stack_value] $r0, $r5 ; CHECK-NEXT: ldr r4, [r4, #4] ; CHECK-NEXT: orrs r4, r6 ; CHECK-NEXT: str.w r4, [r8] diff --git a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll index 58bafebd5b702f..e59493c15baffb 100644 --- a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll @@ -67,39 +67,39 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: and r0, r3, #1 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: rsbs r1, r0, #0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: movs r2, #9 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: and r0, r5, #1 -; CHECK-NEXT: mov r7, r2 +; CHECK-NEXT: and r0, r4, #1 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: rsbs r1, r0, #0 -; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: movs r2, #9 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: ldr r1, [sp, #44] ; CHECK-NEXT: vmov.32 d8[0], r2 ; CHECK-NEXT: ldr r0, [sp, #40] -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r4, r3 ; CHECK-NEXT: and r1, r1, #1 ; CHECK-NEXT: mvn r2, #8 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: vmov.32 d9[0], r7 +; CHECK-NEXT: vmov.32 d9[0], r6 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: vmov.32 d16[0], r2 ; CHECK-NEXT: adr r0, .LCPI3_0 -; CHECK-NEXT: vmov.32 d9[1], r4 +; CHECK-NEXT: vmov.32 d9[1], r7 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128] ; CHECK-NEXT: adr r0, .LCPI3_1 ; CHECK-NEXT: vmov.32 
d16[1], r3 -; CHECK-NEXT: vmov.32 d8[1], r5 +; CHECK-NEXT: vmov.32 d8[1], r4 ; CHECK-NEXT: vand q8, q8, q9 ; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] ; CHECK-NEXT: adr r0, .LCPI3_2 diff --git a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll index cff16c300e7036..d5ab73dfbeada1 100644 --- a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll @@ -8,118 +8,115 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; THUMBV7-NEXT: .pad #44 ; THUMBV7-NEXT: sub sp, #44 -; THUMBV7-NEXT: ldr.w lr, [sp, #88] -; THUMBV7-NEXT: mov r11, r0 ; THUMBV7-NEXT: ldr r4, [sp, #96] +; THUMBV7-NEXT: mov r11, r0 ; THUMBV7-NEXT: ldr.w r12, [sp, #80] -; THUMBV7-NEXT: umull r1, r5, r2, lr -; THUMBV7-NEXT: umull r7, r6, r3, r4 -; THUMBV7-NEXT: str r1, [sp, #40] @ 4-byte Spill -; THUMBV7-NEXT: ldr r1, [sp, #100] +; THUMBV7-NEXT: ldr r6, [sp, #92] +; THUMBV7-NEXT: umull r7, lr, r3, r4 +; THUMBV7-NEXT: ldr.w r9, [sp, #88] ; THUMBV7-NEXT: umull r4, r0, r4, r2 ; THUMBV7-NEXT: str r7, [sp, #32] @ 4-byte Spill -; THUMBV7-NEXT: umull r7, r1, r1, r2 +; THUMBV7-NEXT: umull r1, r8, r2, r9 ; THUMBV7-NEXT: str r4, [sp, #24] @ 4-byte Spill -; THUMBV7-NEXT: str r0, [sp, #12] @ 4-byte Spill -; THUMBV7-NEXT: ldr r0, [sp, #84] -; THUMBV7-NEXT: str r7, [sp, #20] @ 4-byte Spill -; THUMBV7-NEXT: ldr r7, [sp, #92] -; THUMBV7-NEXT: umull r10, r8, r0, lr -; THUMBV7-NEXT: umull r4, r9, r7, r12 -; THUMBV7-NEXT: str r4, [sp, #8] @ 4-byte Spill -; THUMBV7-NEXT: umull r4, r0, r12, lr +; THUMBV7-NEXT: str r0, [sp, #4] @ 4-byte Spill +; THUMBV7-NEXT: umull r4, r5, r6, r12 +; THUMBV7-NEXT: str r1, [sp, #40] @ 4-byte Spill +; THUMBV7-NEXT: ldr r1, [sp, #100] +; THUMBV7-NEXT: str r4, [sp, #12] @ 4-byte Spill +; THUMBV7-NEXT: umull r4, r0, r12, r9 ; THUMBV7-NEXT: mov.w r12, #0 -; THUMBV7-NEXT: umlal r5, r12, r3, lr +; THUMBV7-NEXT: umull r7, r1, r1, r2 ; THUMBV7-NEXT: str r4, [sp, #16] @ 4-byte Spill -; THUMBV7-NEXT: str r0, [sp, #4] @ 4-byte Spill -; THUMBV7-NEXT: umull r4, r2, r2, r7 +; THUMBV7-NEXT: str r0, [sp, #8] @ 4-byte Spill +; THUMBV7-NEXT: umull r4, r2, r2, r6 ; THUMBV7-NEXT: ldr r0, [sp, #40] @ 4-byte Reload +; THUMBV7-NEXT: str r7, [sp, #20] @ 4-byte Spill +; THUMBV7-NEXT: ldr r7, [sp, #84] +; THUMBV7-NEXT: umlal r8, r12, r3, r9 ; THUMBV7-NEXT: str r4, [sp, #28] @ 4-byte Spill ; THUMBV7-NEXT: str r2, [sp, #36] @ 4-byte Spill ; THUMBV7-NEXT: str.w r0, [r11] +; THUMBV7-NEXT: umull r10, r7, r7, r9 ; THUMBV7-NEXT: ldr r0, [sp, #32] @ 4-byte Reload ; THUMBV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload ; THUMBV7-NEXT: add r2, r0 -; THUMBV7-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; THUMBV7-NEXT: adds.w lr, r0, r2 +; THUMBV7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; THUMBV7-NEXT: adds.w r9, r0, r2 ; THUMBV7-NEXT: mov.w r2, #0 ; THUMBV7-NEXT: adc r0, r2, #0 ; THUMBV7-NEXT: str r0, [sp, #32] @ 4-byte Spill -; THUMBV7-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; THUMBV7-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; THUMBV7-NEXT: add.w r4, r10, r0 -; THUMBV7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; THUMBV7-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; THUMBV7-NEXT: adds r4, r4, r0 ; THUMBV7-NEXT: adc r0, r2, #0 ; THUMBV7-NEXT: str r0, [sp, #40] @ 4-byte Spill ; THUMBV7-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; THUMBV7-NEXT: ldr r2, [sp, #16] @ 4-byte Reload ; THUMBV7-NEXT: adds.w r10, r2, r0 -; THUMBV7-NEXT: mov r2, r3 -; THUMBV7-NEXT: 
adc.w r0, r4, lr -; THUMBV7-NEXT: ldr.w lr, [sp, #100] +; THUMBV7-NEXT: ldr r0, [sp, #100] +; THUMBV7-NEXT: adc.w r9, r9, r4 ; THUMBV7-NEXT: cmp r1, #0 -; THUMBV7-NEXT: str r0, [sp, #24] @ 4-byte Spill ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r1, #1 ; THUMBV7-NEXT: cmp r3, #0 -; THUMBV7-NEXT: mov r0, lr +; THUMBV7-NEXT: mov r4, r3 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r2, #1 -; THUMBV7-NEXT: cmp.w lr, #0 +; THUMBV7-NEXT: movne r4, #1 +; THUMBV7-NEXT: cmp r0, #0 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r0, #1 -; THUMBV7-NEXT: ldr r4, [sp, #28] @ 4-byte Reload -; THUMBV7-NEXT: ands r0, r2 -; THUMBV7-NEXT: orrs r1, r0 -; THUMBV7-NEXT: adds r5, r5, r4 -; THUMBV7-NEXT: str.w r5, [r11, #4] -; THUMBV7-NEXT: ldr r0, [sp, #36] @ 4-byte Reload -; THUMBV7-NEXT: mov.w r5, #0 -; THUMBV7-NEXT: adcs.w r0, r0, r12 -; THUMBV7-NEXT: adc r2, r5, #0 +; THUMBV7-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; THUMBV7-NEXT: ands r0, r4 +; THUMBV7-NEXT: ldr r4, [sp, #84] +; THUMBV7-NEXT: adds.w r2, r2, r8 +; THUMBV7-NEXT: str.w r2, [r11, #4] +; THUMBV7-NEXT: orr.w r0, r0, r1 +; THUMBV7-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; THUMBV7-NEXT: adcs.w r1, r1, r12 +; THUMBV7-NEXT: mov.w r12, #0 +; THUMBV7-NEXT: adc r2, r12, #0 +; THUMBV7-NEXT: cmp.w lr, #0 +; THUMBV7-NEXT: it ne +; THUMBV7-NEXT: movne.w lr, #1 +; THUMBV7-NEXT: umlal r1, r2, r3, r6 +; THUMBV7-NEXT: ldr r3, [sp, #32] @ 4-byte Reload +; THUMBV7-NEXT: orr.w r0, r0, lr ; THUMBV7-NEXT: cmp r6, #0 +; THUMBV7-NEXT: orr.w r0, r0, r3 +; THUMBV7-NEXT: mov r3, r4 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r6, #1 -; THUMBV7-NEXT: orrs r1, r6 -; THUMBV7-NEXT: ldr r6, [sp, #84] -; THUMBV7-NEXT: umlal r0, r2, r3, r7 -; THUMBV7-NEXT: ldr r3, [sp, #32] @ 4-byte Reload +; THUMBV7-NEXT: cmp r4, #0 +; THUMBV7-NEXT: it ne +; THUMBV7-NEXT: movne r3, #1 ; THUMBV7-NEXT: cmp r7, #0 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r7, #1 -; THUMBV7-NEXT: orrs r1, r3 -; THUMBV7-NEXT: mov r3, r6 -; THUMBV7-NEXT: cmp r6, #0 +; THUMBV7-NEXT: ands r3, r6 +; THUMBV7-NEXT: cmp r5, #0 +; THUMBV7-NEXT: orr.w r3, r3, r7 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r3, #1 -; THUMBV7-NEXT: cmp.w r8, #0 -; THUMBV7-NEXT: and.w r3, r3, r7 +; THUMBV7-NEXT: movne r5, #1 ; THUMBV7-NEXT: ldr r7, [sp, #80] -; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne.w r8, #1 -; THUMBV7-NEXT: cmp.w r9, #0 -; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne.w r9, #1 -; THUMBV7-NEXT: orrs r7, r6 -; THUMBV7-NEXT: ldr r6, [sp, #96] +; THUMBV7-NEXT: orrs r3, r5 +; THUMBV7-NEXT: ldrd r6, r5, [sp, #96] +; THUMBV7-NEXT: orrs r7, r4 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r7, #1 -; THUMBV7-NEXT: orr.w r3, r3, r8 -; THUMBV7-NEXT: orrs.w r6, r6, lr -; THUMBV7-NEXT: orr.w r3, r3, r9 +; THUMBV7-NEXT: orrs r6, r5 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r6, #1 -; THUMBV7-NEXT: adds.w r0, r0, r10 -; THUMBV7-NEXT: str.w r0, [r11, #8] -; THUMBV7-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; THUMBV7-NEXT: adcs r0, r2 -; THUMBV7-NEXT: str.w r0, [r11, #12] -; THUMBV7-NEXT: ldr r0, [sp, #40] @ 4-byte Reload +; THUMBV7-NEXT: adds.w r1, r1, r10 +; THUMBV7-NEXT: str.w r1, [r11, #8] +; THUMBV7-NEXT: adcs.w r1, r2, r9 +; THUMBV7-NEXT: str.w r1, [r11, #12] ; THUMBV7-NEXT: and.w r2, r7, r6 -; THUMBV7-NEXT: orr.w r0, r0, r3 -; THUMBV7-NEXT: orr.w r0, r0, r2 +; THUMBV7-NEXT: ldr r1, [sp, #40] @ 4-byte Reload +; THUMBV7-NEXT: orr.w r1, r1, r3 +; THUMBV7-NEXT: orr.w r1, r1, r2 ; THUMBV7-NEXT: orr.w r0, r0, r1 -; THUMBV7-NEXT: adc r1, r5, #0 +; THUMBV7-NEXT: adc r1, r12, #0 ; THUMBV7-NEXT: orrs r0, r1 ; THUMBV7-NEXT: and r0, r0, #1 ; 
THUMBV7-NEXT: strb.w r0, [r11, #16] diff --git a/llvm/test/CodeGen/VE/Scalar/atomic.ll b/llvm/test/CodeGen/VE/Scalar/atomic.ll index 2fa6b0d7bcc1da..dd68b8e8c764dc 100644 --- a/llvm/test/CodeGen/VE/Scalar/atomic.ll +++ b/llvm/test/CodeGen/VE/Scalar/atomic.ll @@ -124,16 +124,16 @@ define signext i8 @test_atomic_fetch_xor_1() { ; CHECK-NEXT: lea %s0, c@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s0, c@hi(, %s0) -; CHECK-NEXT: and %s1, -4, %s0 -; CHECK-NEXT: ldl.sx %s0, (, %s1) +; CHECK-NEXT: and %s0, -4, %s0 +; CHECK-NEXT: ldl.sx %s1, (, %s0) ; CHECK-NEXT: .LBB4_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or %s2, 0, %s0 -; CHECK-NEXT: xor %s0, 1, %s2 -; CHECK-NEXT: cas.w %s0, (%s1), %s2 -; CHECK-NEXT: brne.w %s0, %s2, .LBB4_1 +; CHECK-NEXT: or %s2, 0, %s1 +; CHECK-NEXT: xor %s1, 1, %s2 +; CHECK-NEXT: cas.w %s1, (%s0), %s2 +; CHECK-NEXT: brne.w %s1, %s2, .LBB4_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sll %s0, %s1, 56 ; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: fencem 3 ; CHECK-NEXT: b.l.t (, %s10) @@ -151,22 +151,22 @@ define signext i16 @test_atomic_fetch_nand_2() { ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s0, s@hi(, %s0) ; CHECK-NEXT: and %s0, -4, %s0 -; CHECK-NEXT: ldl.sx %s2, (, %s0) +; CHECK-NEXT: ldl.sx %s3, (, %s0) ; CHECK-NEXT: lea %s1, 65534 -; CHECK-NEXT: lea %s3, -65536 -; CHECK-NEXT: and %s3, %s3, (32)0 +; CHECK-NEXT: lea %s2, -65536 +; CHECK-NEXT: and %s2, %s2, (32)0 ; CHECK-NEXT: .LBB5_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or %s4, 0, %s2 -; CHECK-NEXT: xor %s2, -1, %s4 -; CHECK-NEXT: or %s2, %s2, %s1 -; CHECK-NEXT: and %s2, %s2, (48)0 -; CHECK-NEXT: and %s5, %s4, %s3 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: cas.w %s2, (%s0), %s4 -; CHECK-NEXT: brne.w %s2, %s4, .LBB5_1 +; CHECK-NEXT: or %s4, 0, %s3 +; CHECK-NEXT: xor %s3, -1, %s4 +; CHECK-NEXT: or %s3, %s3, %s1 +; CHECK-NEXT: and %s3, %s3, (48)0 +; CHECK-NEXT: and %s5, %s4, %s2 +; CHECK-NEXT: or %s3, %s5, %s3 +; CHECK-NEXT: cas.w %s3, (%s0), %s4 +; CHECK-NEXT: brne.w %s3, %s4, .LBB5_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: sll %s0, %s2, 48 +; CHECK-NEXT: sll %s0, %s3, 48 ; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: fencem 3 ; CHECK-NEXT: b.l.t (, %s10) @@ -182,16 +182,16 @@ define signext i32 @test_atomic_fetch_max_4() { ; CHECK-NEXT: fencem 3 ; CHECK-NEXT: lea %s0, i@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s1, i@hi(, %s0) -; CHECK-NEXT: ldl.sx %s0, (, %s1) +; CHECK-NEXT: lea.sl %s0, i@hi(, %s0) +; CHECK-NEXT: ldl.sx %s1, (, %s0) ; CHECK-NEXT: .LBB6_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or %s2, 0, %s0 -; CHECK-NEXT: maxs.w.sx %s0, 1, %s0 -; CHECK-NEXT: cas.w %s0, (%s1), %s2 -; CHECK-NEXT: brne.w %s0, %s2, .LBB6_1 +; CHECK-NEXT: or %s2, 0, %s1 +; CHECK-NEXT: maxs.w.sx %s1, 1, %s1 +; CHECK-NEXT: cas.w %s1, (%s0), %s2 +; CHECK-NEXT: brne.w %s1, %s2, .LBB6_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: fencem 3 ; CHECK-NEXT: b.l.t (, %s10) entry: @@ -206,16 +206,16 @@ define signext i32 @test_atomic_fetch_min_4() { ; CHECK-NEXT: fencem 3 ; CHECK-NEXT: lea %s0, i@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s1, i@hi(, %s0) -; CHECK-NEXT: ldl.sx %s0, (, %s1) +; CHECK-NEXT: lea.sl %s0, i@hi(, %s0) +; CHECK-NEXT: ldl.sx %s1, (, %s0) ; CHECK-NEXT: .LBB7_1: # %atomicrmw.start 
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or %s2, 0, %s0 -; CHECK-NEXT: mins.w.sx %s0, 1, %s0 -; CHECK-NEXT: cas.w %s0, (%s1), %s2 -; CHECK-NEXT: brne.w %s0, %s2, .LBB7_1 +; CHECK-NEXT: or %s2, 0, %s1 +; CHECK-NEXT: mins.w.sx %s1, 1, %s1 +; CHECK-NEXT: cas.w %s1, (%s0), %s2 +; CHECK-NEXT: brne.w %s1, %s2, .LBB7_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: fencem 3 ; CHECK-NEXT: b.l.t (, %s10) entry: diff --git a/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll b/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll index b70f0ea602d0b1..c0ea8a39aac840 100644 --- a/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll +++ b/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll @@ -85,25 +85,25 @@ define zeroext i1 @_Z26atomic_cmp_swap_relaxed_i1RNSt3__16atomicIbEERbb(ptr noca ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: ldl.sx %s5, (, %s3) -; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0 +; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s7, (56)0, %s0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s6, %s5 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 -; CHECK-NEXT: brne.w 0, %s3, .LBB0_2 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s7, %s6 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 +; CHECK-NEXT: brne.w 0, %s2, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %bb7 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st1b %s0, (, %s1) ; CHECK-NEXT: .LBB0_2: # %bb9 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i = zext i1 %arg2 to i8 @@ -129,26 +129,26 @@ define signext i8 @_Z26atomic_cmp_swap_relaxed_i8RNSt3__16atomicIcEERcc(ptr noca ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: sla.w.sx %s5, (56)0, %s0 -; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0 +; CHECK-NEXT: ldl.sx %s7, (, %s3) ; CHECK-NEXT: and %s2, %s2, (56)0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s5, %s6 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 -; CHECK-NEXT: brne.w 0, %s3, .LBB1_2 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s6, %s7 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 +; CHECK-NEXT: brne.w 0, %s2, .LBB1_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st1b 
%s0, (, %s1) ; CHECK-NEXT: .LBB1_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i8, ptr %arg1, align 1 @@ -174,25 +174,25 @@ define zeroext i8 @_Z26atomic_cmp_swap_relaxed_u8RNSt3__16atomicIhEERhh(ptr noca ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: ldl.sx %s5, (, %s3) -; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0 +; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s7, (56)0, %s0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s6, %s5 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 -; CHECK-NEXT: brne.w 0, %s3, .LBB2_2 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s7, %s6 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 +; CHECK-NEXT: brne.w 0, %s2, .LBB2_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st1b %s0, (, %s1) ; CHECK-NEXT: .LBB2_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i8, ptr %arg1, align 1 @@ -218,26 +218,26 @@ define signext i16 @_Z27atomic_cmp_swap_relaxed_i16RNSt3__16atomicIsEERss(ptr no ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: sla.w.sx %s5, (48)0, %s0 -; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s6, (48)0, %s0 +; CHECK-NEXT: ldl.sx %s7, (, %s3) ; CHECK-NEXT: and %s2, %s2, (48)0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s5, %s6 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 -; CHECK-NEXT: brne.w 0, %s3, .LBB3_2 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s6, %s7 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 +; CHECK-NEXT: brne.w 0, %s2, .LBB3_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st2b %s0, (, %s1) ; CHECK-NEXT: .LBB3_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i16, ptr %arg1, align 2 @@ -263,25 +263,25 @@ define zeroext i16 @_Z27atomic_cmp_swap_relaxed_u16RNSt3__16atomicItEERtt(ptr no ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: ldl.sx %s5, (, %s3) -; CHECK-NEXT: sla.w.sx %s6, (48)0, %s0 +; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s7, (48)0, %s0 ; CHECK-NEXT: sla.w.sx %s2, %s2, 
%s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s6, %s5 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 -; CHECK-NEXT: brne.w 0, %s3, .LBB4_2 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s7, %s6 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 +; CHECK-NEXT: brne.w 0, %s2, .LBB4_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st2b %s0, (, %s1) ; CHECK-NEXT: .LBB4_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i16, ptr %arg1, align 2 @@ -525,26 +525,26 @@ define zeroext i1 @_Z26atomic_cmp_swap_acquire_i1RNSt3__16atomicIbEERbb(ptr noca ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: ldl.sx %s5, (, %s3) -; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0 +; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s7, (56)0, %s0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s6, %s5 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s7, %s6 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 2 -; CHECK-NEXT: brne.w 0, %s3, .LBB11_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB11_2 ; CHECK-NEXT: # %bb.1: # %bb7 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st1b %s0, (, %s1) ; CHECK-NEXT: .LBB11_2: # %bb9 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i = zext i1 %arg2 to i8 @@ -570,27 +570,27 @@ define signext i8 @_Z26atomic_cmp_swap_acquire_i8RNSt3__16atomicIcEERcc(ptr noca ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: sla.w.sx %s5, (56)0, %s0 -; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0 +; CHECK-NEXT: ldl.sx %s7, (, %s3) ; CHECK-NEXT: and %s2, %s2, (56)0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s5, %s6 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s6, %s7 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx 
%s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 2 -; CHECK-NEXT: brne.w 0, %s3, .LBB12_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB12_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st1b %s0, (, %s1) ; CHECK-NEXT: .LBB12_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i8, ptr %arg1, align 1 @@ -616,26 +616,26 @@ define zeroext i8 @_Z26atomic_cmp_swap_acquire_u8RNSt3__16atomicIhEERhh(ptr noca ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: ldl.sx %s5, (, %s3) -; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0 +; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s7, (56)0, %s0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s6, %s5 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s7, %s6 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 2 -; CHECK-NEXT: brne.w 0, %s3, .LBB13_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB13_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st1b %s0, (, %s1) ; CHECK-NEXT: .LBB13_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i8, ptr %arg1, align 1 @@ -661,27 +661,27 @@ define signext i16 @_Z27atomic_cmp_swap_acquire_i16RNSt3__16atomicIsEERss(ptr no ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: sla.w.sx %s5, (48)0, %s0 -; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s6, (48)0, %s0 +; CHECK-NEXT: ldl.sx %s7, (, %s3) ; CHECK-NEXT: and %s2, %s2, (48)0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s5, %s6 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s6, %s7 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 2 -; CHECK-NEXT: brne.w 0, %s3, .LBB14_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB14_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st2b %s0, (, %s1) ; CHECK-NEXT: .LBB14_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, 
%s10) bb: %i3 = load i16, ptr %arg1, align 2 @@ -707,26 +707,26 @@ define zeroext i16 @_Z27atomic_cmp_swap_acquire_u16RNSt3__16atomicItEERtt(ptr no ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: ldl.sx %s5, (, %s3) -; CHECK-NEXT: sla.w.sx %s6, (48)0, %s0 +; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s7, (48)0, %s0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s6, %s5 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s7, %s6 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 2 -; CHECK-NEXT: brne.w 0, %s3, .LBB15_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB15_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st2b %s0, (, %s1) ; CHECK-NEXT: .LBB15_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i16, ptr %arg1, align 2 @@ -975,26 +975,26 @@ define zeroext i1 @_Z26atomic_cmp_swap_seq_cst_i1RNSt3__16atomicIbEERbb(ptr noca ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: ldl.sx %s5, (, %s3) -; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0 +; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s7, (56)0, %s0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s6, %s5 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s7, %s6 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 3 -; CHECK-NEXT: brne.w 0, %s3, .LBB22_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB22_2 ; CHECK-NEXT: # %bb.1: # %bb7 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st1b %s0, (, %s1) ; CHECK-NEXT: .LBB22_2: # %bb9 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i = zext i1 %arg2 to i8 @@ -1021,27 +1021,27 @@ define signext i8 @_Z26atomic_cmp_swap_seq_cst_i8RNSt3__16atomicIcEERcc(ptr noca ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: sla.w.sx %s5, (56)0, %s0 -; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0 +; CHECK-NEXT: ldl.sx %s7, (, %s3) ; CHECK-NEXT: and %s2, %s2, (56)0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s5, %s6 -; CHECK-NEXT: and %s5, 
%s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s6, %s7 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 3 -; CHECK-NEXT: brne.w 0, %s3, .LBB23_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB23_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st1b %s0, (, %s1) ; CHECK-NEXT: .LBB23_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i8, ptr %arg1, align 1 @@ -1068,26 +1068,26 @@ define zeroext i8 @_Z26atomic_cmp_swap_seq_cst_u8RNSt3__16atomicIhEERhh(ptr noca ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: ldl.sx %s5, (, %s3) -; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0 +; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s7, (56)0, %s0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s6, %s5 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s7, %s6 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 3 -; CHECK-NEXT: brne.w 0, %s3, .LBB24_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB24_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st1b %s0, (, %s1) ; CHECK-NEXT: .LBB24_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i8, ptr %arg1, align 1 @@ -1114,27 +1114,27 @@ define signext i16 @_Z27atomic_cmp_swap_seq_cst_i16RNSt3__16atomicIsEERss(ptr no ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: sla.w.sx %s5, (48)0, %s0 -; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s6, (48)0, %s0 +; CHECK-NEXT: ldl.sx %s7, (, %s3) ; CHECK-NEXT: and %s2, %s2, (48)0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s5, %s6 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s6, %s7 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: 
cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 3 -; CHECK-NEXT: brne.w 0, %s3, .LBB25_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB25_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st2b %s0, (, %s1) ; CHECK-NEXT: .LBB25_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i16, ptr %arg1, align 2 @@ -1161,26 +1161,26 @@ define zeroext i16 @_Z27atomic_cmp_swap_seq_cst_u16RNSt3__16atomicItEERtt(ptr no ; CHECK-NEXT: and %s3, -4, %s0 ; CHECK-NEXT: and %s0, 3, %s0 ; CHECK-NEXT: sla.w.sx %s0, %s0, 3 -; CHECK-NEXT: ldl.sx %s5, (, %s3) -; CHECK-NEXT: sla.w.sx %s6, (48)0, %s0 +; CHECK-NEXT: ldl.sx %s6, (, %s3) +; CHECK-NEXT: sla.w.sx %s7, (48)0, %s0 ; CHECK-NEXT: sla.w.sx %s2, %s2, %s0 -; CHECK-NEXT: sla.w.sx %s4, %s4, %s0 -; CHECK-NEXT: nnd %s5, %s6, %s5 -; CHECK-NEXT: and %s5, %s5, (32)0 -; CHECK-NEXT: or %s2, %s5, %s2 -; CHECK-NEXT: or %s4, %s5, %s4 -; CHECK-NEXT: cas.w %s2, (%s3), %s4 -; CHECK-NEXT: cmps.w.sx %s4, %s2, %s4 -; CHECK-NEXT: or %s3, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s4 +; CHECK-NEXT: sla.w.sx %s5, %s4, %s0 +; CHECK-NEXT: nnd %s4, %s7, %s6 +; CHECK-NEXT: and %s6, %s4, (32)0 +; CHECK-NEXT: or %s4, %s6, %s2 +; CHECK-NEXT: or %s2, %s6, %s5 +; CHECK-NEXT: cas.w %s4, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s4, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: fencem 3 -; CHECK-NEXT: brne.w 0, %s3, .LBB26_2 +; CHECK-NEXT: brne.w 0, %s2, .LBB26_2 ; CHECK-NEXT: # %bb.1: # %bb6 -; CHECK-NEXT: and %s2, %s2, (32)0 -; CHECK-NEXT: srl %s0, %s2, %s0 +; CHECK-NEXT: and %s3, %s4, (32)0 +; CHECK-NEXT: srl %s0, %s3, %s0 ; CHECK-NEXT: st2b %s0, (, %s1) ; CHECK-NEXT: .LBB26_2: # %bb8 -; CHECK-NEXT: adds.w.zx %s0, %s3, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i3 = load i16, ptr %arg1, align 2 @@ -1441,17 +1441,17 @@ define zeroext i1 @_Z30atomic_cmp_swap_relaxed_stk_i1Rbb(ptr nocapture nonnull a ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB33_4: # %bb ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: ld1b.zx %s3, (, %s0) +; CHECK-NEXT: ld1b.zx %s2, (, %s0) ; CHECK-NEXT: ldl.zx %s4, 8(, %s11) -; CHECK-NEXT: lea %s2, 8(, %s11) +; CHECK-NEXT: lea %s3, 8(, %s11) ; CHECK-NEXT: lea %s5, -256 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 ; CHECK-NEXT: and %s4, %s4, (32)0 ; CHECK-NEXT: or %s1, %s4, %s1 -; CHECK-NEXT: or %s3, %s4, %s3 -; CHECK-NEXT: cas.w %s1, (%s2), %s3 -; CHECK-NEXT: cmps.w.sx %s3, %s1, %s3 +; CHECK-NEXT: or %s2, %s4, %s2 +; CHECK-NEXT: cas.w %s1, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s1, %s2 ; CHECK-NEXT: or %s2, 0, (0)1 ; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: brne.w 0, %s2, .LBB33_2 @@ -1560,17 +1560,17 @@ define zeroext i8 @_Z30atomic_cmp_swap_relaxed_stk_u8Rhh(ptr nocapture nonnull a ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB35_4: # %bb ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: ld1b.zx %s3, (, %s0) +; CHECK-NEXT: ld1b.zx %s2, (, %s0) ; CHECK-NEXT: ldl.zx %s4, 8(, %s11) -; CHECK-NEXT: lea %s2, 8(, %s11) +; CHECK-NEXT: lea %s3, 8(, %s11) ; CHECK-NEXT: lea %s5, -256 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 ; CHECK-NEXT: and %s4, %s4, (32)0 ; CHECK-NEXT: or %s1, %s4, %s1 -; CHECK-NEXT: or %s3, %s4, %s3 -; CHECK-NEXT: cas.w %s1, (%s2), %s3 -; CHECK-NEXT: cmps.w.sx %s3, %s1, %s3 +; CHECK-NEXT: or %s2, %s4, %s2 +; CHECK-NEXT: 
cas.w %s1, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s1, %s2 ; CHECK-NEXT: or %s2, 0, (0)1 ; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: brne.w 0, %s2, .LBB35_2 @@ -1673,17 +1673,17 @@ define zeroext i16 @_Z31atomic_cmp_swap_relaxed_stk_u16Rtt(ptr nocapture nonnull ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB37_4: # %bb ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: ld2b.zx %s3, (, %s0) +; CHECK-NEXT: ld2b.zx %s2, (, %s0) ; CHECK-NEXT: ldl.zx %s4, 8(, %s11) -; CHECK-NEXT: lea %s2, 8(, %s11) +; CHECK-NEXT: lea %s3, 8(, %s11) ; CHECK-NEXT: lea %s5, -65536 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 ; CHECK-NEXT: and %s4, %s4, (32)0 ; CHECK-NEXT: or %s1, %s4, %s1 -; CHECK-NEXT: or %s3, %s4, %s3 -; CHECK-NEXT: cas.w %s1, (%s2), %s3 -; CHECK-NEXT: cmps.w.sx %s3, %s1, %s3 +; CHECK-NEXT: or %s2, %s4, %s2 +; CHECK-NEXT: cas.w %s1, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s1, %s2 ; CHECK-NEXT: or %s2, 0, (0)1 ; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 ; CHECK-NEXT: brne.w 0, %s2, .LBB37_2 @@ -2004,28 +2004,28 @@ bb: define zeroext i1 @_Z29atomic_cmp_swap_relaxed_gv_i1Rbb(ptr nocapture nonnull align 1 dereferenceable(1) %arg, i1 zeroext %arg1) { ; CHECK-LABEL: _Z29atomic_cmp_swap_relaxed_gv_i1Rbb: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: and %s2, %s1, (32)0 -; CHECK-NEXT: lea %s1, gv_i1@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s1, gv_i1@hi(, %s1) -; CHECK-NEXT: and %s1, -4, %s1 -; CHECK-NEXT: ldl.zx %s4, (, %s1) +; CHECK-NEXT: lea %s2, gv_i1@lo +; CHECK-NEXT: and %s2, %s2, (32)0 +; CHECK-NEXT: lea.sl %s2, gv_i1@hi(, %s2) +; CHECK-NEXT: and %s2, -4, %s2 +; CHECK-NEXT: ldl.zx %s4, (, %s2) ; CHECK-NEXT: ld1b.zx %s3, (, %s0) ; CHECK-NEXT: lea %s5, -256 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 ; CHECK-NEXT: and %s4, %s4, (32)0 -; CHECK-NEXT: or %s2, %s4, %s2 +; CHECK-NEXT: or %s1, %s4, %s1 ; CHECK-NEXT: or %s3, %s4, %s3 -; CHECK-NEXT: cas.w %s2, (%s1), %s3 -; CHECK-NEXT: cmps.w.sx %s3, %s2, %s3 -; CHECK-NEXT: or %s1, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s3 -; CHECK-NEXT: brne.w 0, %s1, .LBB44_2 +; CHECK-NEXT: cas.w %s1, (%s2), %s3 +; CHECK-NEXT: cmps.w.sx %s3, %s1, %s3 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 +; CHECK-NEXT: brne.w 0, %s2, .LBB44_2 ; CHECK-NEXT: # %bb.1: # %bb5 -; CHECK-NEXT: st1b %s2, (, %s0) +; CHECK-NEXT: st1b %s1, (, %s0) ; CHECK-NEXT: .LBB44_2: # %bb7 -; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i = zext i1 %arg1 to i8 @@ -2091,28 +2091,28 @@ bb6: ; preds = %bb4, %bb define zeroext i8 @_Z29atomic_cmp_swap_relaxed_gv_u8Rhh(ptr nocapture nonnull align 1 dereferenceable(1) %arg, i8 zeroext %arg1) { ; CHECK-LABEL: _Z29atomic_cmp_swap_relaxed_gv_u8Rhh: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: and %s2, %s1, (32)0 -; CHECK-NEXT: lea %s1, gv_u8@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s1, gv_u8@hi(, %s1) -; CHECK-NEXT: and %s1, -4, %s1 -; CHECK-NEXT: ldl.zx %s4, (, %s1) +; CHECK-NEXT: lea %s2, gv_u8@lo +; CHECK-NEXT: and %s2, %s2, (32)0 +; CHECK-NEXT: lea.sl %s2, gv_u8@hi(, %s2) +; CHECK-NEXT: and %s2, -4, %s2 +; CHECK-NEXT: ldl.zx %s4, (, %s2) ; CHECK-NEXT: ld1b.zx %s3, (, %s0) ; CHECK-NEXT: lea %s5, -256 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 ; CHECK-NEXT: and %s4, %s4, (32)0 -; CHECK-NEXT: or %s2, %s4, %s2 +; CHECK-NEXT: or %s1, %s4, %s1 ; CHECK-NEXT: or %s3, %s4, %s3 -; CHECK-NEXT: cas.w %s2, (%s1), %s3 -; CHECK-NEXT: cmps.w.sx %s3, %s2, %s3 -; CHECK-NEXT: or 
%s1, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s3 -; CHECK-NEXT: brne.w 0, %s1, .LBB46_2 +; CHECK-NEXT: cas.w %s1, (%s2), %s3 +; CHECK-NEXT: cmps.w.sx %s3, %s1, %s3 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 +; CHECK-NEXT: brne.w 0, %s2, .LBB46_2 ; CHECK-NEXT: # %bb.1: # %bb4 -; CHECK-NEXT: st1b %s2, (, %s0) +; CHECK-NEXT: st1b %s1, (, %s0) ; CHECK-NEXT: .LBB46_2: # %bb6 -; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i = load i8, ptr %arg, align 1 diff --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll index bc7b26abe7e046..f195564d76fb07 100644 --- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll +++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -mtriple=ve | FileCheck %s ; RUN: llc < %s -mtriple=ve -relocation-model=pic \ ; RUN: | FileCheck %s -check-prefix=PIC @@ -11,22 +12,22 @@ define signext i32 @br_jt3(i32 signext %0) { ; CHECK-LABEL: br_jt3: ; CHECK: # %bb.0: ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: breq.w 1, %s0, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: breq.w 1, %s0, .LBB0_1 ; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: breq.w 4, %s0, .LBB{{[0-9]+}}_5 +; CHECK-NEXT: breq.w 4, %s0, .LBB0_5 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: brne.w 2, %s0, .LBB{{[0-9]+}}_6 +; CHECK-NEXT: brne.w 2, %s0, .LBB0_6 ; CHECK-NEXT: # %bb.4: ; CHECK-NEXT: or %s0, 0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: .LBB0_1: ; CHECK-NEXT: or %s0, 3, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_5: +; CHECK-NEXT: .LBB0_5: ; CHECK-NEXT: or %s0, 7, (0)1 -; CHECK-NEXT: .LBB{{[0-9]+}}_6: +; CHECK-NEXT: .LBB0_6: ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) ; @@ -78,7 +79,7 @@ define signext i32 @br_jt4(i32 signext %0) { ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: adds.w.sx %s1, -1, %s0 ; CHECK-NEXT: cmpu.w %s2, 3, %s1 -; CHECK-NEXT: brgt.w 0, %s2, .LBB{{[0-9]+}}_2 +; CHECK-NEXT: brgt.w 0, %s2, .LBB1_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: sll %s0, %s0, 2 @@ -87,7 +88,7 @@ define signext i32 @br_jt4(i32 signext %0) { ; CHECK-NEXT: lea.sl %s1, .Lswitch.table.br_jt4@hi(, %s1) ; CHECK-NEXT: ldl.sx %s0, (%s0, %s1) ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) ; @@ -138,18 +139,18 @@ define signext i32 @br_jt7(i32 signext %0) { ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: adds.w.sx %s1, -1, %s0 ; CHECK-NEXT: cmpu.w %s2, 8, %s1 -; CHECK-NEXT: brgt.w 0, %s2, .LBB{{[0-9]+}}_3 +; CHECK-NEXT: brgt.w 0, %s2, .LBB2_3 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: and %s2, %s1, (48)0 ; CHECK-NEXT: lea %s3, 463 ; CHECK-NEXT: and %s3, %s3, (32)0 ; CHECK-NEXT: srl %s2, %s3, %s2 ; CHECK-NEXT: and %s2, 1, %s2 -; CHECK-NEXT: brne.w 0, %s2, .LBB{{[0-9]+}}_2 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: brne.w 0, %s2, .LBB2_2 +; CHECK-NEXT: .LBB2_3: ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: sll %s0, %s0, 2 ; CHECK-NEXT: lea %s1, .Lswitch.table.br_jt7@lo @@ -219,18 +220,18 @@ define signext i32 @br_jt8(i32 signext %0) { ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: 
adds.w.sx %s1, -1, %s0 ; CHECK-NEXT: cmpu.w %s2, 8, %s1 -; CHECK-NEXT: brgt.w 0, %s2, .LBB{{[0-9]+}}_3 +; CHECK-NEXT: brgt.w 0, %s2, .LBB3_3 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: and %s2, %s1, (48)0 ; CHECK-NEXT: lea %s3, 495 ; CHECK-NEXT: and %s3, %s3, (32)0 ; CHECK-NEXT: srl %s2, %s3, %s2 ; CHECK-NEXT: and %s2, 1, %s2 -; CHECK-NEXT: brne.w 0, %s2, .LBB{{[0-9]+}}_2 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: brne.w 0, %s2, .LBB3_2 +; CHECK-NEXT: .LBB3_3: ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: sll %s0, %s0, 2 ; CHECK-NEXT: lea %s1, .Lswitch.table.br_jt8@lo @@ -298,23 +299,23 @@ define signext i32 @br_jt3_m(i32 signext %0, i32 signext %1) { ; CHECK-LABEL: br_jt3_m: ; CHECK: # %bb.0: ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: breq.w 1, %s0, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: breq.w 1, %s0, .LBB4_1 ; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: breq.w 4, %s0, .LBB{{[0-9]+}}_5 +; CHECK-NEXT: breq.w 4, %s0, .LBB4_5 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: brne.w 2, %s0, .LBB{{[0-9]+}}_6 +; CHECK-NEXT: brne.w 2, %s0, .LBB4_6 ; CHECK-NEXT: # %bb.4: ; CHECK-NEXT: or %s0, 0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: .LBB4_1: ; CHECK-NEXT: or %s0, 3, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_5: +; CHECK-NEXT: .LBB4_5: ; CHECK-NEXT: and %s0, %s1, (32)0 ; CHECK-NEXT: adds.w.sx %s0, 3, %s0 -; CHECK-NEXT: .LBB{{[0-9]+}}_6: +; CHECK-NEXT: .LBB4_6: ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) ; @@ -368,7 +369,7 @@ define signext i32 @br_jt4_m(i32 signext %0, i32 signext %1) { ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: adds.w.sx %s2, -1, %s0 ; CHECK-NEXT: cmpu.w %s3, 3, %s2 -; CHECK-NEXT: brgt.w 0, %s3, .LBB{{[0-9]+}}_5 +; CHECK-NEXT: brgt.w 0, %s3, .LBB5_5 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: sll %s0, %s0, 3 @@ -378,18 +379,18 @@ define signext i32 @br_jt4_m(i32 signext %0, i32 signext %1) { ; CHECK-NEXT: ld %s2, (%s2, %s0) ; CHECK-NEXT: or %s0, 3, (0)1 ; CHECK-NEXT: b.l.t (, %s2) -; CHECK-NEXT: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: or %s0, 0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB5_3: ; CHECK-NEXT: or %s0, 4, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: .LBB5_4: ; CHECK-NEXT: and %s0, %s1, (32)0 ; CHECK-NEXT: adds.w.sx %s0, 3, %s0 -; CHECK-NEXT: .LBB{{[0-9]+}}_5: +; CHECK-NEXT: .LBB5_5: ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) ; @@ -452,48 +453,48 @@ define signext i32 @br_jt4_m(i32 signext %0, i32 signext %1) { define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) { ; CHECK-LABEL: br_jt7_m: ; CHECK: # %bb.0: -; CHECK-NEXT: and %s2, %s0, (32)0 -; CHECK-NEXT: adds.w.sx %s0, -1, %s2 -; CHECK-NEXT: cmpu.w %s3, 8, %s0 -; CHECK-NEXT: brgt.w 0, %s3, .LBB{{[0-9]+}}_8 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: adds.w.sx %s2, -1, %s0 +; CHECK-NEXT: cmpu.w %s3, 8, %s2 +; CHECK-NEXT: brgt.w 0, %s3, .LBB6_8 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 -; CHECK-NEXT: sll %s0, %s0, 3 +; CHECK-NEXT: adds.w.zx %s2, %s2, (0)1 +; CHECK-NEXT: sll %s2, %s2, 3 ; CHECK-NEXT: lea %s3, .LJTI6_0@lo ; CHECK-NEXT: and %s3, %s3, (32)0 ; CHECK-NEXT: lea.sl %s3, 
.LJTI6_0@hi(, %s3) -; CHECK-NEXT: ld %s3, (%s3, %s0) -; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: or %s0, 3, (0)1 +; CHECK-NEXT: ld %s3, (%s3, %s2) +; CHECK-NEXT: and %s2, %s1, (32)0 +; CHECK-NEXT: or %s1, 3, (0)1 ; CHECK-NEXT: b.l.t (, %s3) -; CHECK-NEXT: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: or %s0, 4, (0)1 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: .LBB6_3: +; CHECK-NEXT: or %s1, 4, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_4: -; CHECK-NEXT: adds.w.sx %s0, 3, %s1 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: .LBB6_4: +; CHECK-NEXT: adds.w.sx %s1, 3, %s2 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_8: -; CHECK-NEXT: or %s0, 0, %s2 -; CHECK-NEXT: .LBB{{[0-9]+}}_9: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: .LBB6_8: +; CHECK-NEXT: or %s1, 0, %s0 +; CHECK-NEXT: .LBB6_9: +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_7: -; CHECK-NEXT: or %s0, 11, (0)1 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: .LBB6_7: +; CHECK-NEXT: or %s1, 11, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_6: -; CHECK-NEXT: or %s0, 10, (0)1 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: .LBB6_6: +; CHECK-NEXT: or %s1, 10, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_5: -; CHECK-NEXT: adds.w.sx %s0, -2, %s1 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: .LBB6_5: +; CHECK-NEXT: adds.w.sx %s1, -2, %s2 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 ; CHECK-NEXT: b.l.t (, %s10) ; ; PIC-LABEL: br_jt7_m: @@ -591,7 +592,7 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) { ; CHECK-NEXT: and %s2, %s0, (32)0 ; CHECK-NEXT: adds.w.sx %s0, -1, %s2 ; CHECK-NEXT: cmpu.w %s3, 8, %s0 -; CHECK-NEXT: brgt.w 0, %s3, .LBB{{[0-9]+}}_9 +; CHECK-NEXT: brgt.w 0, %s3, .LBB7_9 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: sll %s0, %s0, 3 @@ -602,36 +603,36 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) { ; CHECK-NEXT: and %s1, %s1, (32)0 ; CHECK-NEXT: or %s0, 3, (0)1 ; CHECK-NEXT: b.l.t (, %s3) -; CHECK-NEXT: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: .LBB7_2: ; CHECK-NEXT: or %s0, 0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: or %s0, 4, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: adds.w.sx %s0, 3, %s1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_9: +; CHECK-NEXT: .LBB7_9: ; CHECK-NEXT: or %s0, 0, %s2 -; CHECK-NEXT: .LBB{{[0-9]+}}_10: +; CHECK-NEXT: .LBB7_10: ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_5: +; CHECK-NEXT: .LBB7_5: ; CHECK-NEXT: adds.w.sx %s0, -5, %s1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_6: +; CHECK-NEXT: .LBB7_6: ; CHECK-NEXT: adds.w.sx %s0, -2, %s1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_8: +; CHECK-NEXT: 
.LBB7_8: ; CHECK-NEXT: or %s0, 11, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; CHECK-NEXT: .LBB{{[0-9]+}}_7: +; CHECK-NEXT: .LBB7_7: ; CHECK-NEXT: or %s0, 10, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) @@ -640,53 +641,53 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) { ; PIC: # %bb.0: ; PIC-NEXT: st %s15, 24(, %s11) ; PIC-NEXT: st %s16, 32(, %s11) -; PIC-NEXT: and %s2, %s0, (32)0 -; PIC-NEXT: adds.w.sx %s0, -1, %s2 -; PIC-NEXT: cmpu.w %s3, 8, %s0 +; PIC-NEXT: and %s0, %s0, (32)0 +; PIC-NEXT: adds.w.sx %s3, -1, %s0 +; PIC-NEXT: cmpu.w %s2, 8, %s3 ; PIC-NEXT: lea %s15, _GLOBAL_OFFSET_TABLE_@pc_lo(-24) ; PIC-NEXT: and %s15, %s15, (32)0 ; PIC-NEXT: sic %s16 ; PIC-NEXT: lea.sl %s15, _GLOBAL_OFFSET_TABLE_@pc_hi(%s16, %s15) -; PIC-NEXT: brgt.w 0, %s3, .LBB7_9 +; PIC-NEXT: brgt.w 0, %s2, .LBB7_9 ; PIC-NEXT: # %bb.1: -; PIC-NEXT: and %s1, %s1, (32)0 -; PIC-NEXT: adds.w.zx %s0, %s0, (0)1 -; PIC-NEXT: sll %s0, %s0, 2 +; PIC-NEXT: and %s2, %s1, (32)0 +; PIC-NEXT: adds.w.zx %s1, %s3, (0)1 +; PIC-NEXT: sll %s1, %s1, 2 ; PIC-NEXT: lea %s3, .LJTI7_0@gotoff_lo ; PIC-NEXT: and %s3, %s3, (32)0 ; PIC-NEXT: lea.sl %s3, .LJTI7_0@gotoff_hi(%s3, %s15) -; PIC-NEXT: ldl.sx %s0, (%s3, %s0) +; PIC-NEXT: ldl.sx %s1, (%s3, %s1) ; PIC-NEXT: lea %s3, br_jt8_m@gotoff_lo ; PIC-NEXT: and %s3, %s3, (32)0 ; PIC-NEXT: lea.sl %s3, br_jt8_m@gotoff_hi(%s3, %s15) -; PIC-NEXT: adds.l %s3, %s0, %s3 -; PIC-NEXT: or %s0, 3, (0)1 +; PIC-NEXT: adds.l %s3, %s1, %s3 +; PIC-NEXT: or %s1, 3, (0)1 ; PIC-NEXT: b.l.t (, %s3) ; PIC-NEXT: .LBB7_2: -; PIC-NEXT: or %s0, 0, (0)1 +; PIC-NEXT: or %s1, 0, (0)1 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_3: -; PIC-NEXT: or %s0, 4, (0)1 +; PIC-NEXT: or %s1, 4, (0)1 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_4: -; PIC-NEXT: adds.w.sx %s0, 3, %s1 +; PIC-NEXT: adds.w.sx %s1, 3, %s2 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_9: -; PIC-NEXT: or %s0, 0, %s2 +; PIC-NEXT: or %s1, 0, %s0 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_5: -; PIC-NEXT: adds.w.sx %s0, -5, %s1 +; PIC-NEXT: adds.w.sx %s1, -5, %s2 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_6: -; PIC-NEXT: adds.w.sx %s0, -2, %s1 +; PIC-NEXT: adds.w.sx %s1, -2, %s2 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_8: -; PIC-NEXT: or %s0, 11, (0)1 +; PIC-NEXT: or %s1, 11, (0)1 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_7: -; PIC-NEXT: or %s0, 10, (0)1 +; PIC-NEXT: or %s1, 10, (0)1 ; PIC-NEXT: .LBB7_10: -; PIC-NEXT: adds.w.sx %s0, %s0, (0)1 +; PIC-NEXT: adds.w.sx %s0, %s1, (0)1 ; PIC-NEXT: ld %s16, 32(, %s11) ; PIC-NEXT: ld %s15, 24(, %s11) ; PIC-NEXT: b.l.t (, %s10) diff --git a/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll b/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll index d77d4352f8336c..6107ed2b70fd24 100644 --- a/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll +++ b/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll @@ -4,16 +4,16 @@ define fastcc void @fht(ptr %fz, i16 signext %n) { ; CHECK-LABEL: fht: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: subss %xmm3, %xmm1 -; CHECK-NEXT: movaps %xmm3, %xmm4 +; CHECK-NEXT: subss %xmm2, %xmm1 +; CHECK-NEXT: movaps %xmm2, %xmm4 ; CHECK-NEXT: mulss %xmm0, %xmm4 -; CHECK-NEXT: addss %xmm3, %xmm4 -; CHECK-NEXT: movaps %xmm3, %xmm2 -; CHECK-NEXT: subss %xmm4, %xmm2 -; CHECK-NEXT: addss %xmm3, %xmm4 +; CHECK-NEXT: addss %xmm2, %xmm4 +; 
CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: subss %xmm4, %xmm3 +; CHECK-NEXT: addss %xmm2, %xmm4 ; CHECK-NEXT: xorps %xmm5, %xmm5 ; CHECK-NEXT: subss %xmm1, %xmm5 ; CHECK-NEXT: addss %xmm0, %xmm1 @@ -22,14 +22,14 @@ define fastcc void @fht(ptr %fz, i16 signext %n) { ; CHECK-NEXT: addss %xmm4, %xmm5 ; CHECK-NEXT: addss %xmm0, %xmm5 ; CHECK-NEXT: movss %xmm5, 0 -; CHECK-NEXT: movss %xmm3, (%ecx) -; CHECK-NEXT: addss %xmm0, %xmm3 -; CHECK-NEXT: movss %xmm3, 0 -; CHECK-NEXT: mulss %xmm0, %xmm1 -; CHECK-NEXT: mulss %xmm0, %xmm2 -; CHECK-NEXT: addss %xmm1, %xmm2 -; CHECK-NEXT: addss %xmm0, %xmm2 ; CHECK-NEXT: movss %xmm2, (%ecx) +; CHECK-NEXT: addss %xmm0, %xmm2 +; CHECK-NEXT: movss %xmm2, 0 +; CHECK-NEXT: mulss %xmm0, %xmm1 +; CHECK-NEXT: mulss %xmm0, %xmm3 +; CHECK-NEXT: addss %xmm1, %xmm3 +; CHECK-NEXT: addss %xmm0, %xmm3 +; CHECK-NEXT: movss %xmm3, (%ecx) ; CHECK-NEXT: retl entry: br i1 true, label %bb171.preheader, label %bb431 diff --git a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll index 5c78092e9f2c46..753e82496c37c8 100644 --- a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll +++ b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll @@ -18,13 +18,13 @@ define fastcc i64 @foo() nounwind { ; CHECK-NEXT: movq X(%rip), %rdi ; CHECK-NEXT: movq X(%rip), %rsi ; CHECK-NEXT: movq X(%rip), %rdx -; CHECK-NEXT: movq X(%rip), %rbx +; CHECK-NEXT: movq X(%rip), %r11 ; CHECK-NEXT: movq X(%rip), %rax ; CHECK-NEXT: addq %rsi, %rax ; CHECK-NEXT: movq X(%rip), %r10 -; CHECK-NEXT: movq X(%rip), %r11 +; CHECK-NEXT: movq X(%rip), %rbx ; CHECK-NEXT: bswapq %r10 -; CHECK-NEXT: leaq (%rbx,%rdx), %r14 +; CHECK-NEXT: leaq (%r11,%rdx), %r14 ; CHECK-NEXT: addq %rsi, %r14 ; CHECK-NEXT: addq %rax, %r14 ; CHECK-NEXT: addq %r10, %r14 @@ -32,18 +32,18 @@ define fastcc i64 @foo() nounwind { ; CHECK-NEXT: leaq (%rax,%r8), %r10 ; CHECK-NEXT: addq %r14, %rdi ; CHECK-NEXT: addq %r10, %r10 -; CHECK-NEXT: bswapq %r11 +; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: addq %r14, %r10 -; CHECK-NEXT: addq %rbx, %r11 -; CHECK-NEXT: leaq (%rsi,%rdx), %rbx -; CHECK-NEXT: addq %rdi, %rbx +; CHECK-NEXT: addq %r11, %rbx +; CHECK-NEXT: leaq (%rsi,%rdx), %r11 ; CHECK-NEXT: addq %rdi, %r11 -; CHECK-NEXT: addq %rbx, %r11 +; CHECK-NEXT: addq %rdi, %rbx +; CHECK-NEXT: addq %r11, %rbx ; CHECK-NEXT: addq %rax, %rax ; CHECK-NEXT: addq %r10, %rax -; CHECK-NEXT: addq %r11, %r8 +; CHECK-NEXT: addq %rbx, %r8 ; CHECK-NEXT: addq %r10, %rax -; CHECK-NEXT: addq %r11, %rax +; CHECK-NEXT: addq %rbx, %rax ; CHECK-NEXT: movq X(%rip), %r11 ; CHECK-NEXT: bswapq %r11 ; CHECK-NEXT: addq %rdx, %r11 @@ -187,32 +187,32 @@ define fastcc i64 @foo() nounwind { ; CHECK-NEXT: addq %rbx, %r10 ; CHECK-NEXT: addq %rax, %rsi ; CHECK-NEXT: addq %rbx, %rsi -; CHECK-NEXT: movq X(%rip), %rbx -; CHECK-NEXT: bswapq %rbx -; CHECK-NEXT: addq %r8, %rbx -; CHECK-NEXT: leaq (%rcx,%r9), %rdi +; CHECK-NEXT: movq X(%rip), %rdi +; CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: addq %r8, %rdi +; CHECK-NEXT: leaq (%rcx,%r9), %r8 +; CHECK-NEXT: addq %r10, %r8 ; CHECK-NEXT: addq %r10, %rdi -; CHECK-NEXT: addq %r10, %rbx -; CHECK-NEXT: addq %rdi, %rbx -; CHECK-NEXT: leaq (%rax,%rdx), %rdi -; CHECK-NEXT: addq %rdi, %rdi -; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: addq %rbx, %r11 -; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: addq %rbx, %rdi -; CHECK-NEXT: movq X(%rip), %r8 -; CHECK-NEXT: bswapq %r8 -; CHECK-NEXT: addq %r9, %r8 +; CHECK-NEXT: addq %r8, %rdi +; CHECK-NEXT: leaq (%rax,%rdx), %r8 +; CHECK-NEXT: addq %r8, %r8 +; CHECK-NEXT: addq %rsi, %r8 
+; CHECK-NEXT: addq %rdi, %r11 +; CHECK-NEXT: addq %rsi, %r8 +; CHECK-NEXT: addq %rdi, %r8 +; CHECK-NEXT: movq X(%rip), %rdi +; CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: addq %r9, %rdi ; CHECK-NEXT: leaq (%r10,%rcx), %r9 ; CHECK-NEXT: addq %r11, %r9 -; CHECK-NEXT: addq %r11, %r8 -; CHECK-NEXT: addq %r9, %r8 +; CHECK-NEXT: addq %r11, %rdi +; CHECK-NEXT: addq %r9, %rdi ; CHECK-NEXT: addq %rax, %rsi ; CHECK-NEXT: addq %rsi, %rsi -; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: addq %r8, %rdx ; CHECK-NEXT: addq %r8, %rsi +; CHECK-NEXT: addq %r8, %rsi +; CHECK-NEXT: addq %rdi, %rdx +; CHECK-NEXT: addq %rdi, %rsi ; CHECK-NEXT: movq X(%rip), %rax ; CHECK-NEXT: bswapq %rax ; CHECK-NEXT: addq %r10, %r11 diff --git a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll index 705a1cc861e021..a7e518fe7e71b6 100644 --- a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll +++ b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll @@ -307,16 +307,16 @@ define i64 @t5_cse(i64 %val, i64 %shamt, ptr%dst) nounwind { ; X32-NOBMI2-NEXT: pushl %esi ; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NOBMI2-NEXT: movl %eax, %ebx -; X32-NOBMI2-NEXT: addl $32, %ebx -; X32-NOBMI2-NEXT: adcl $0, %edi -; X32-NOBMI2-NEXT: movl %ebx, (%ecx) -; X32-NOBMI2-NEXT: movl %edi, 4(%ecx) +; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOBMI2-NEXT: movl %ebx, %edi +; X32-NOBMI2-NEXT: addl $32, %edi +; X32-NOBMI2-NEXT: adcl $0, %ecx +; X32-NOBMI2-NEXT: movl %edi, (%eax) +; X32-NOBMI2-NEXT: movl %ecx, 4(%eax) ; X32-NOBMI2-NEXT: movb $32, %cl -; X32-NOBMI2-NEXT: subb %al, %cl +; X32-NOBMI2-NEXT: subb %bl, %cl ; X32-NOBMI2-NEXT: movl %esi, %eax ; X32-NOBMI2-NEXT: shll %cl, %eax ; X32-NOBMI2-NEXT: shldl %cl, %esi, %edx diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll index 0c349c3aa8ec16..8cba9603cf4b69 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll @@ -61,16 +61,16 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16) ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_1]] :: (store (s64) into %stack.5) ; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[MOV32rm3]] - ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[COPY6]] + ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm3]] ; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]] ; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8) ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY7]], [[MOVSX64rr32_2]], implicit-def dead $eflags - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY7]], [[MOVSX64rm32_]], implicit-def dead $eflags - ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = 
LEA64r [[COPY]], 4, [[COPY7]], 0, $noreg + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY6]], [[MOVSX64rr32_2]], implicit-def dead $eflags + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY6]], [[MOVSX64rm32_]], implicit-def dead $eflags + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY6]], 0, $noreg ; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr32 = COPY [[MOV32rm3]] ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = IMUL64rr [[COPY8]], [[MOVSX64rr32_2]], implicit-def dead $eflags @@ -87,22 +87,21 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) ; CHECK-NEXT: CMP32rm [[MOV32rm4]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16) - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) ; CHECK-NEXT: JCC_1 %bb.5, 13, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.for.body17.lr.ph: ; CHECK-NEXT: successors: %bb.6(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) - ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm2]], [[MOVSX64rr32_]], implicit-def dead $eflags - ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm2]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1) - ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm2]] :: (store (s64) into %stack.13) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm1]], [[MOVSX64rr32_]], implicit-def dead $eflags + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm1]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1) + ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm1]] :: (store (s64) into %stack.13) ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12) ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm5]] - ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) - ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) + ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) ; CHECK-NEXT: JMP_1 %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.for.cond.cleanup: @@ -111,52 +110,51 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, 
i32 %c_row ; CHECK-NEXT: bb.5.for.cond.cleanup16: ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) - ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm5]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7) - ; CHECK-NEXT: [[MOV64rm6:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) - ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm6]], implicit-def dead $eflags :: (store (s64) into %stack.9) - ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOV64rm5]] :: (store (s64) into %stack.6) - ; CHECK-NEXT: CMP64rm [[MOV64rm5]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8) + ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) + ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm4]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7) + ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) + ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm5]], implicit-def dead $eflags :: (store (s64) into %stack.9) + ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOV64rm4]] :: (store (s64) into %stack.6) + ; CHECK-NEXT: CMP64rm [[MOV64rm4]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8) ; CHECK-NEXT: JCC_1 %bb.2, 12, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6.for.body17: ; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit - ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY7]].sub_16bit, [[MOV32rm2]].sub_16bit + ; CHECK-NEXT: [[MOV64rm6:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY7]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm6]], 1, [[MOVSX64rr32_]], 0, $noreg ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64 = COPY [[MOV32rm2]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64 = COPY [[COPY1]] - ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY18]], 1, [[COPY9]], 0, $noreg - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] + 
; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOV32rm2]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[COPY1]] + ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY15]], 1, [[COPY9]], 0, $noreg + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64 = COPY [[MOV64rm]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr32 = COPY [[COPY4]] ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[COPY19]], 0, $noreg - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY19]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY18]] - ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY17]] - ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY16]] - ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY15]] - ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY14]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[COPY13]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY12]] - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = COPY [[COPY11]] - ; CHECK-NEXT: [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) - ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY10]] - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY6]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]] - ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] - ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm4]], [[MOVSX64rr32_3]], implicit-def dead $eflags - ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def dead $eflags + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]] + ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64_nosp = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY18]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm7]], 0, $noreg + ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY18]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY17]] + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY16]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY15]] + ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY14]] + ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY13]] + ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY12]] + ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY11]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr32 = COPY [[COPY10]] + ; CHECK-NEXT: [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY7]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]] + ; CHECK-NEXT: PTILESTOREDV [[COPY7]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm2]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOVSX64rr32_3]], implicit-def dead $eflags + ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm2]], [[MOV64rm]], implicit-def dead $eflags ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY9]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags - ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOV64rm1]], 
implicit-def $eflags + ; CHECK-NEXT: CMP64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def $eflags ; CHECK-NEXT: JCC_1 %bb.6, 12, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.5 entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll b/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll index 46b5f62456cdcd..3693b0454fc930 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll @@ -24,14 +24,14 @@ define dso_local void @test_chain(ptr %A_mem, ptr %B_mem, ptr %C_mem) { ; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm0 ; CHECK-NEXT: addq $1024, %rdi # imm = 0x400 ; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm1 -; CHECK-NEXT: tileloadd (%rdx,%rax), %tmm3 +; CHECK-NEXT: tileloadd (%rdx,%rax), %tmm2 ; CHECK-NEXT: leaq 1024(%rdx), %rdi -; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm2 +; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm3 ; CHECK-NEXT: tileloadd (%rsi,%rax), %tmm4 -; CHECK-NEXT: tdpbssd %tmm4, %tmm0, %tmm3 -; CHECK-NEXT: tilestored %tmm3, (%rdx,%rax) -; CHECK-NEXT: tdpbssd %tmm4, %tmm1, %tmm2 -; CHECK-NEXT: tilestored %tmm2, (%rdi,%rax) +; CHECK-NEXT: tdpbssd %tmm4, %tmm0, %tmm2 +; CHECK-NEXT: tilestored %tmm2, (%rdx,%rax) +; CHECK-NEXT: tdpbssd %tmm4, %tmm1, %tmm3 +; CHECK-NEXT: tilestored %tmm3, (%rdi,%rax) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill.ll b/llvm/test/CodeGen/X86/AMX/amx-spill.ll index a04715bd613227..fcdfbd305cf516 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-spill.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill.ll @@ -20,25 +20,25 @@ define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) nounwind ; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf, %ecx ; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1 -; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0 ; CHECK-NEXT: movabsq $64, %r8 -; CHECK-NEXT: tilestored %tmm1, -64(%rsp,%r8) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm0, -64(%rsp,%r8) # 1024-byte Folded Spill +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1 ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm3 ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm4 -; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm2 ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm5 -; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm2 ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: @@ -46,16 +46,16 @@ define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) nounwind ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm6 ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm7 -; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1 -; CHECK-NEXT: tdpbssd %tmm7, %tmm6, %tmm1 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0 +; CHECK-NEXT: tdpbssd %tmm7, %tmm6, %tmm0 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd -64(%rsp,%rax), %tmm7 # 1024-byte Folded Reload -; CHECK-NEXT: tdpbssd %tmm7, %tmm1, %tmm3 -; 
CHECK-NEXT: tdpbssd %tmm4, %tmm3, %tmm2 -; CHECK-NEXT: tdpbssd %tmm5, %tmm2, %tmm0 +; CHECK-NEXT: tdpbssd %tmm7, %tmm0, %tmm1 +; CHECK-NEXT: tdpbssd %tmm3, %tmm1, %tmm4 +; CHECK-NEXT: tdpbssd %tmm5, %tmm4, %tmm2 ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %ecx -; CHECK-NEXT: tilestored %tmm0, (%rax,%rcx) +; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx) ; CHECK-NEXT: addq $968, %rsp # imm = 0x3C8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/GlobalISel/callingconv.ll b/llvm/test/CodeGen/X86/GlobalISel/callingconv.ll index 33d4de16c97727..b0d215aae8f0da 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/callingconv.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/callingconv.ll @@ -259,13 +259,13 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) { ; X32: # %bb.0: ; X32-NEXT: subl $44, %esp ; X32-NEXT: .cfi_def_cfa_offset 48 -; X32-NEXT: movaps %xmm0, (%esp) # 16-byte Spill -; X32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: movaps %xmm1, (%esp) # 16-byte Spill ; X32-NEXT: movdqa %xmm2, %xmm0 ; X32-NEXT: movdqa {{[0-9]+}}(%esp), %xmm1 ; X32-NEXT: calll split_return_callee -; X32-NEXT: paddd (%esp), %xmm0 # 16-byte Folded Reload -; X32-NEXT: paddd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X32-NEXT: paddd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X32-NEXT: paddd (%esp), %xmm1 # 16-byte Folded Reload ; X32-NEXT: addl $44, %esp ; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl @@ -274,13 +274,13 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) { ; X64: # %bb.0: ; X64-NEXT: subq $40, %rsp ; X64-NEXT: .cfi_def_cfa_offset 48 -; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; X64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; X64-NEXT: movdqa %xmm2, %xmm0 ; X64-NEXT: movdqa %xmm3, %xmm1 ; X64-NEXT: callq split_return_callee -; X64-NEXT: paddd (%rsp), %xmm0 # 16-byte Folded Reload -; X64-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload ; X64-NEXT: addq $40, %rsp ; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index daed1125e9deeb..7b690583c5245a 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -191,23 +191,23 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sarl $31, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: subl %ecx, %eax ; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %esi, %edi +; 
X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %edi, %eax +; X86-NEXT: subl %edi, %eax +; X86-NEXT: sbbl %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl @@ -233,23 +233,23 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sarl $31, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: subl %ecx, %eax ; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %edi, %eax +; X86-NEXT: subl %edi, %eax +; X86-NEXT: sbbl %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index 5969aae43f82e8..6d6abee1c64043 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -486,6 +486,7 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind { ; X86-LABEL: test_v16i8: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh ; X86-NEXT: movb {{[0-9]+}}(%esp), %bl @@ -541,12 +542,6 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind { ; X86-NEXT: xorb %al, %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarb $7, %al -; X86-NEXT: xorb %al, %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh ; X86-NEXT: movb %bh, %al ; X86-NEXT: sarb $7, %al @@ -577,34 +572,40 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind { ; X86-NEXT: sarb $7, %al ; X86-NEXT: xorb %al, %cl ; X86-NEXT: subb %al, %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movb %cl, 15(%eax) -; X86-NEXT: movb %dl, 14(%eax) -; X86-NEXT: movb %ch, 13(%eax) -; X86-NEXT: movb %dh, 12(%eax) -; X86-NEXT: movb %bl, 11(%eax) -; X86-NEXT: movb %bh, 10(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 9(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 8(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 7(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 6(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 5(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 4(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 3(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 2(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
1-byte Folded Reload -; X86-NEXT: movb %cl, 1(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %al, %ah +; X86-NEXT: sarb $7, %ah +; X86-NEXT: xorb %ah, %al +; X86-NEXT: subb %ah, %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movb %al, 15(%esi) +; X86-NEXT: movb %cl, 14(%esi) +; X86-NEXT: movb %dl, 13(%esi) +; X86-NEXT: movb %ch, 12(%esi) +; X86-NEXT: movb %dh, 11(%esi) +; X86-NEXT: movb %bl, 10(%esi) +; X86-NEXT: movb %bh, 9(%esi) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, 8(%esi) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, 7(%esi) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, 6(%esi) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, 5(%esi) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, 4(%esi) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, 3(%esi) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, 2(%esi) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, 1(%esi) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, (%esi) +; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $12, %esp +; X86-NEXT: popl %esi ; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 %r = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 false) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 6d5f8a78cb1d70..64e2afc1753ccf 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -5095,16 +5095,16 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq 16(%rdi), %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: movq %rcx, %r9 -; AVX-NEXT: movq %rcx, %r10 -; AVX-NEXT: movl %ecx, %r11d -; AVX-NEXT: movl %ecx, %ebx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: shrl $8, %ecx -; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movq 16(%rdi), %rax +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: movq %rax, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: movl %eax, %ebx +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: shrl $16, %ebx ; AVX-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 ; AVX-NEXT: shrl $24, %r11d @@ -5115,74 +5115,74 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 ; AVX-NEXT: shrq $48, %r8 ; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 -; AVX-NEXT: movq 24(%rdi), %rcx -; AVX-NEXT: shrq $56, %rax -; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: shrl $8, %eax -; AVX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: shrl $16, %eax -; AVX-NEXT: vpinsrb $10, %eax, 
%xmm0, %xmm0 -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: shrl $24, %eax -; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $32, %rax -; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $40, %rax -; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $48, %rax -; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq 24(%rdi), %rax ; AVX-NEXT: shrq $56, %rcx -; AVX-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: shrl $8, %ecx -; AVX-NEXT: vmovd %eax, %xmm1 -; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: shrl $24, %ecx -; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $40, %rcx -; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $48, %rcx -; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 -; AVX-NEXT: movq 8(%rdi), %rcx +; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movq (%rdi), %rcx ; AVX-NEXT: shrq $56, %rax -; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: shrl $8, %eax -; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: shrl $16, %eax -; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: shrl $24, %eax -; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $32, %rax -; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $40, %rax -; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $48, %rax -; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX-NEXT: movq 8(%rdi), %rax ; AVX-NEXT: shrq $56, %rcx -; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $8, %ecx +; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $24, %ecx +; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq $40, %rcx +; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq $48, %rcx +; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX-NEXT: shrq $56, %rax +; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpaddb 
48(%rsi), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 @@ -5197,16 +5197,16 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: movq 16(%rdi), %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: movq %rcx, %r8 -; AVX2-NEXT: movq %rcx, %r9 -; AVX2-NEXT: movq %rcx, %r10 -; AVX2-NEXT: movl %ecx, %r11d -; AVX2-NEXT: movl %ecx, %ebx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: shrl $8, %ecx -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq 16(%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX2-NEXT: shrl $16, %ebx ; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 ; AVX2-NEXT: shrl $24, %r11d @@ -5217,74 +5217,74 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX2-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 ; AVX2-NEXT: shrq $48, %r8 ; AVX2-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: movq 24(%rdi), %rcx -; AVX2-NEXT: shrq $56, %rax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: shrl $8, %eax -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: shrl $24, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: shrq $40, %rax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: shrq $48, %rax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: movq 24(%rdi), %rax ; AVX2-NEXT: shrq $56, %rcx -; AVX2-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %ecx -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $24, %ecx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $40, %rcx -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq (%rdi), %rcx ; AVX2-NEXT: shrq $56, %rax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $8, %eax -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; 
AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $24, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $40, %rax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $48, %rax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq 8(%rdi), %rax ; AVX2-NEXT: shrq $56, %rcx -; AVX2-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $8, %ecx +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $24, %ecx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $40, %rcx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $48, %rcx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shrq $56, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll index 5594d13a234d02..fe1d189c10cf51 100644 --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll @@ -705,22 +705,22 @@ define zeroext i16 @atomic_shl1_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind ; X86-LABEL: atomic_shl1_xor_16_gpr_val: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzwl (%esi), %eax +; X86-NEXT: movl $1, %esi +; X86-NEXT: shll %cl, %esi +; X86-NEXT: movzwl (%edx), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB12_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: xorl %esi, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%esi) +; X86-NEXT: lock cmpxchgw %cx, (%edx) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB12_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %edx, %eax +; X86-NEXT: andl %esi, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -953,23 +953,23 @@ define zeroext i16 @atomic_blsi_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind ; X86-LABEL: atomic_blsi_xor_16_gpr_val: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: 
andl %eax, %ecx -; X86-NEXT: movzwl (%edx), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: negl %edx +; X86-NEXT: andl %eax, %edx +; X86-NEXT: movzwl (%ecx), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB17_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: xorl %edx, %esi ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %si, (%edx) +; X86-NEXT: lock cmpxchgw %si, (%ecx) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB17_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: andl %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -1270,25 +1270,25 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_valz(ptr %v, i16 zeroext %c) n ; X86-LABEL: atomic_shl1_mask01_xor_16_gpr_valz: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $15, %cl -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl (%edx), %eax +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzwl (%esi), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB22_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%edx) +; X86-NEXT: lock cmpxchgw %cx, (%esi) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB22_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end ; X86-NEXT: movzwl %ax, %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testl %ecx, %esi +; X86-NEXT: testl %ecx, %edx ; X86-NEXT: sete %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi @@ -1663,25 +1663,25 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_valnz(ptr %v, i16 zeroext %c) ; X86-LABEL: atomic_shl1_mask01_xor_16_gpr_valnz: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $15, %cl -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl (%edx), %eax +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzwl (%esi), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB28_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%edx) +; X86-NEXT: lock cmpxchgw %cx, (%esi) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB28_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end ; X86-NEXT: movzwl %ax, %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testl %ecx, %esi +; X86-NEXT: testl %ecx, %edx ; X86-NEXT: setne %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi @@ -2207,19 +2207,19 @@ define zeroext i16 @atomic_blsi_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nounwind ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: negl %esi -; 
X86-NEXT: andl %ecx, %esi -; X86-NEXT: movzwl (%edx), %eax +; X86-NEXT: andl %edx, %esi +; X86-NEXT: movzwl (%ecx), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB35_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %edi ; X86-NEXT: xorl %esi, %edi ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %di, (%edx) +; X86-NEXT: lock cmpxchgw %di, (%ecx) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB35_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end @@ -2228,8 +2228,8 @@ define zeroext i16 @atomic_blsi_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nounwind ; X86-NEXT: testl %edi, %esi ; X86-NEXT: jne .LBB35_4 ; X86-NEXT: # %bb.3: # %if.then -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl (%edx,%eax,2), %eax +; X86-NEXT: movzwl %dx, %eax +; X86-NEXT: movzwl (%ecx,%eax,2), %eax ; X86-NEXT: .LBB35_4: # %return ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -2288,24 +2288,24 @@ define zeroext i16 @atomic_shl1_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl $1, %esi +; X86-NEXT: shll %cl, %esi ; X86-NEXT: movl $-2, %edi ; X86-NEXT: roll %cl, %edi -; X86-NEXT: movzwl (%esi), %eax +; X86-NEXT: movzwl (%edx), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB36_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl %edi, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%esi) +; X86-NEXT: lock cmpxchgw %cx, (%edx) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB36_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %edx, %eax +; X86-NEXT: andl %esi, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -2544,25 +2544,25 @@ define zeroext i16 @atomic_blsi_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: andl %eax, %ecx -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: negl %edx +; X86-NEXT: andl %eax, %edx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: notl %esi -; X86-NEXT: movzwl (%edx), %eax +; X86-NEXT: movzwl (%ecx), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB41_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %edi ; X86-NEXT: andl %esi, %edi ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %di, (%edx) +; X86-NEXT: lock cmpxchgw %di, (%ecx) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB41_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: andl %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -2873,27 +2873,27 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_valnz(ptr %v, i16 zeroext %c) ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $15, %cl -; X86-NEXT: movl $1, 
%esi -; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl $-2, %edi ; X86-NEXT: roll %cl, %edi -; X86-NEXT: movzwl (%edx), %eax +; X86-NEXT: movzwl (%esi), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB46_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl %edi, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%edx) +; X86-NEXT: lock cmpxchgw %cx, (%esi) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB46_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end ; X86-NEXT: movzwl %ax, %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testl %ecx, %esi +; X86-NEXT: testl %ecx, %edx ; X86-NEXT: setne %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi @@ -3453,21 +3453,21 @@ define zeroext i16 @atomic_blsi_and_16_gpr_brnz(ptr %v, i16 zeroext %c) nounwind ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: negl %esi -; X86-NEXT: andl %ecx, %esi +; X86-NEXT: andl %edx, %esi ; X86-NEXT: movl %esi, %edi ; X86-NEXT: notl %edi -; X86-NEXT: movzwl (%edx), %eax +; X86-NEXT: movzwl (%ecx), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB53_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: andl %edi, %ebx ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %bx, (%edx) +; X86-NEXT: lock cmpxchgw %bx, (%ecx) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB53_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end @@ -3475,8 +3475,8 @@ define zeroext i16 @atomic_blsi_and_16_gpr_brnz(ptr %v, i16 zeroext %c) nounwind ; X86-NEXT: testl %eax, %esi ; X86-NEXT: je .LBB53_3 ; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl (%edx,%eax,2), %eax +; X86-NEXT: movzwl %dx, %eax +; X86-NEXT: movzwl (%ecx,%eax,2), %eax ; X86-NEXT: jmp .LBB53_5 ; X86-NEXT: .LBB53_3: ; X86-NEXT: movw $123, %ax @@ -3916,21 +3916,21 @@ define i32 @atomic_blsi_or_32_gpr_val(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_blsi_or_32_gpr_val: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: andl %eax, %ecx -; X86-NEXT: movl (%edx), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: negl %edx +; X86-NEXT: andl %eax, %edx +; X86-NEXT: movl (%ecx), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB65_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: lock cmpxchgl %esi, (%edx) +; X86-NEXT: orl %edx, %esi +; X86-NEXT: lock cmpxchgl %esi, (%ecx) ; X86-NEXT: jne .LBB65_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: andl %edx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 3e7d1138132c4e..59301c88f56978 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1739,16 +1739,16 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: 
pushq %rbx -; SSE2-NEXT: movaps (%rdi), %xmm1 -; SSE2-NEXT: movaps (%rsi), %xmm0 -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps (%rsi), %xmm1 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d @@ -1758,14 +1758,17 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: addq %rdx, %rbp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: addq %rax, %rdi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE2-NEXT: addq %rcx, %rdx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%rbp,%rcx), %rbp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE2-NEXT: leaq -1(%r13,%rcx), %r13 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE2-NEXT: leaq -1(%r12,%rcx), %r12 @@ -1784,96 +1787,95 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE2-NEXT: leaq -1(%r8,%rcx), %r8 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%rdi,%rcx), %rdi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE2-NEXT: leaq -1(%rsi,%rcx), %rsi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: leaq -1(%rax,%rcx), %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: leaq -1(%rcx,%rax), %rax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rcx), %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: leaq -1(%rcx,%rax), %rax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: leaq -1(%rax,%rcx), %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: addq $-1, %rbp +; SSE2-NEXT: addq $-1, %rdi ; SSE2-NEXT: movl $0, %eax ; SSE2-NEXT: adcq $-1, %rax ; SSE2-NEXT: addq $-1, %rdx ; SSE2-NEXT: adcq $-1, %rcx ; SSE2-NEXT: shldq $63, %rdx, %rcx -; SSE2-NEXT: shldq $63, %rbp, %rax -; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: shldq $63, %rdi, %rax +; SSE2-NEXT: movq %rax, %xmm4 ; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: shrq %rbp +; SSE2-NEXT: movq %rbp, %xmm5 ; SSE2-NEXT: shrq %r13 -; SSE2-NEXT: movq %r13, %xmm3 +; 
SSE2-NEXT: movq %r13, %xmm6 ; SSE2-NEXT: shrq %r12 -; SSE2-NEXT: movq %r12, %xmm2 +; SSE2-NEXT: movq %r12, %xmm7 ; SSE2-NEXT: shrq %r15 -; SSE2-NEXT: movq %r15, %xmm5 +; SSE2-NEXT: movq %r15, %xmm2 ; SSE2-NEXT: shrq %r14 -; SSE2-NEXT: movq %r14, %xmm4 +; SSE2-NEXT: movq %r14, %xmm8 ; SSE2-NEXT: shrq %rbx -; SSE2-NEXT: movq %rbx, %xmm6 +; SSE2-NEXT: movq %rbx, %xmm9 ; SSE2-NEXT: shrq %r11 -; SSE2-NEXT: movq %r11, %xmm7 +; SSE2-NEXT: movq %r11, %xmm10 ; SSE2-NEXT: shrq %r10 -; SSE2-NEXT: movq %r10, %xmm9 +; SSE2-NEXT: movq %r10, %xmm1 ; SSE2-NEXT: shrq %r9 -; SSE2-NEXT: movq %r9, %xmm8 +; SSE2-NEXT: movq %r9, %xmm11 ; SSE2-NEXT: shrq %r8 -; SSE2-NEXT: movq %r8, %xmm11 -; SSE2-NEXT: shrq %rdi -; SSE2-NEXT: movq %rdi, %xmm12 +; SSE2-NEXT: movq %r8, %xmm12 ; SSE2-NEXT: shrq %rsi ; SSE2-NEXT: movq %rsi, %xmm13 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm10 +; SSE2-NEXT: movq %rax, %xmm3 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax ; SSE2-NEXT: movq %rax, %xmm14 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax ; SSE2-NEXT: movq %rax, %xmm15 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: psllq $48, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pslld $16, %xmm6 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: psllq $48, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklbw 
{{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm8, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: movupd %xmm2, (%rax) +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,1,0,1] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] +; SSE2-NEXT: movupd %xmm1, (%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -1894,71 +1896,68 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpextrw $4, %xmm0, %eax +; AVX1-NEXT: vpextrw $4, %xmm0, %ecx +; AVX1-NEXT: vpextrw $5, %xmm0, %edx +; AVX1-NEXT: vpextrw $6, %xmm0, %r8d +; AVX1-NEXT: vpextrw $7, %xmm0, %eax +; AVX1-NEXT: vpextrw $0, %xmm3, %r9d +; AVX1-NEXT: vpextrw $1, %xmm3, %r10d +; AVX1-NEXT: vpextrw $2, %xmm3, %r11d +; AVX1-NEXT: vpextrw $3, %xmm3, %ebx +; AVX1-NEXT: vpextrw 
$4, %xmm3, %r14d +; AVX1-NEXT: vpextrw $5, %xmm3, %r15d +; AVX1-NEXT: vpextrw $6, %xmm3, %r12d +; AVX1-NEXT: vpextrw $7, %xmm3, %r13d +; AVX1-NEXT: vpextrw $1, %xmm0, %esi +; AVX1-NEXT: vpextrw $0, %xmm0, %ebp +; AVX1-NEXT: vpextrw $1, %xmm1, %edi +; AVX1-NEXT: addq %rsi, %rdi +; AVX1-NEXT: vpextrw $0, %xmm1, %esi +; AVX1-NEXT: addq %rbp, %rsi +; AVX1-NEXT: vpextrw $7, %xmm2, %ebp +; AVX1-NEXT: leaq -1(%r13,%rbp), %r13 +; AVX1-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm2, %r13d +; AVX1-NEXT: leaq -1(%r12,%r13), %rbp +; AVX1-NEXT: vpextrw $5, %xmm2, %r12d +; AVX1-NEXT: leaq -1(%r15,%r12), %r13 +; AVX1-NEXT: vpextrw $4, %xmm2, %r15d +; AVX1-NEXT: leaq -1(%r14,%r15), %r12 +; AVX1-NEXT: vpextrw $3, %xmm2, %r14d +; AVX1-NEXT: leaq -1(%rbx,%r14), %r15 +; AVX1-NEXT: vpextrw $2, %xmm2, %ebx +; AVX1-NEXT: leaq -1(%r11,%rbx), %r14 +; AVX1-NEXT: vpextrw $1, %xmm2, %r11d +; AVX1-NEXT: leaq -1(%r10,%r11), %rbx +; AVX1-NEXT: vpextrw $0, %xmm2, %r10d +; AVX1-NEXT: leaq -1(%r9,%r10), %r11 +; AVX1-NEXT: vpextrw $7, %xmm1, %r9d +; AVX1-NEXT: leaq -1(%rax,%r9), %r10 +; AVX1-NEXT: vpextrw $6, %xmm1, %eax +; AVX1-NEXT: leaq -1(%r8,%rax), %r9 +; AVX1-NEXT: vpextrw $5, %xmm1, %eax +; AVX1-NEXT: leaq -1(%rdx,%rax), %r8 +; AVX1-NEXT: vpextrw $4, %xmm1, %eax +; AVX1-NEXT: leaq -1(%rcx,%rax), %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $5, %xmm0, %eax +; AVX1-NEXT: vpextrw $3, %xmm0, %eax +; AVX1-NEXT: vpextrw $3, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rax,%rdx), %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $6, %xmm0, %ebx -; AVX1-NEXT: vpextrw $7, %xmm0, %esi -; AVX1-NEXT: vpextrw $0, %xmm3, %edi -; AVX1-NEXT: vpextrw $1, %xmm3, %r8d -; AVX1-NEXT: vpextrw $2, %xmm3, %r9d -; AVX1-NEXT: vpextrw $3, %xmm3, %r10d -; AVX1-NEXT: vpextrw $4, %xmm3, %r11d -; AVX1-NEXT: vpextrw $5, %xmm3, %r14d -; AVX1-NEXT: vpextrw $6, %xmm3, %r15d -; AVX1-NEXT: vpextrw $7, %xmm3, %edx -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: vpextrw $0, %xmm0, %r12d -; AVX1-NEXT: vpextrw $1, %xmm1, %ecx -; AVX1-NEXT: addq %rax, %rcx -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: addq %r12, %rax -; AVX1-NEXT: vpextrw $7, %xmm2, %r12d -; AVX1-NEXT: leaq -1(%rdx,%r12), %rdx -; AVX1-NEXT: vpextrw $6, %xmm2, %r12d -; AVX1-NEXT: leaq -1(%r15,%r12), %rbp -; AVX1-NEXT: vpextrw $5, %xmm2, %r15d -; AVX1-NEXT: leaq -1(%r14,%r15), %r13 -; AVX1-NEXT: vpextrw $4, %xmm2, %r14d -; AVX1-NEXT: leaq -1(%r11,%r14), %r12 -; AVX1-NEXT: vpextrw $3, %xmm2, %r11d -; AVX1-NEXT: leaq -1(%r10,%r11), %r15 -; AVX1-NEXT: vpextrw $2, %xmm2, %r10d -; AVX1-NEXT: leaq -1(%r9,%r10), %r14 -; AVX1-NEXT: vpextrw $1, %xmm2, %r9d -; AVX1-NEXT: leaq -1(%r8,%r9), %r11 -; AVX1-NEXT: vpextrw $0, %xmm2, %r8d -; AVX1-NEXT: leaq -1(%rdi,%r8), %r10 -; AVX1-NEXT: vpextrw $7, %xmm1, %edi -; AVX1-NEXT: leaq -1(%rsi,%rdi), %r9 -; AVX1-NEXT: vpextrw $6, %xmm1, %esi -; AVX1-NEXT: leaq -1(%rbx,%rsi), %r8 -; AVX1-NEXT: vpextrw $5, %xmm1, %esi -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rdi,%rsi), %rsi -; AVX1-NEXT: vpextrw $4, %xmm1, %edi -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX1-NEXT: leaq -1(%rbx,%rdi), %rdi -; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $3, %xmm0, %edi -; AVX1-NEXT: vpextrw $3, %xmm1, %ebx -; AVX1-NEXT: leaq -1(%rdi,%rbx), %rdi -; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $2, %xmm0, %edi 
-; AVX1-NEXT: vpextrw $2, %xmm1, %ebx -; AVX1-NEXT: leaq -1(%rdi,%rbx), %rdi -; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: addq $-1, %rcx -; AVX1-NEXT: movl $0, %ebx -; AVX1-NEXT: adcq $-1, %rbx -; AVX1-NEXT: addq $-1, %rax -; AVX1-NEXT: adcq $-1, %rdi -; AVX1-NEXT: shldq $63, %rax, %rdi -; AVX1-NEXT: shldq $63, %rcx, %rbx -; AVX1-NEXT: shrq %rdx -; AVX1-NEXT: vmovq %rdx, %xmm0 +; AVX1-NEXT: vpextrw $2, %xmm0, %eax +; AVX1-NEXT: vpextrw $2, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rax,%rdx), %rdx +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: addq $-1, %rdi +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: adcq $-1, %rax +; AVX1-NEXT: addq $-1, %rsi +; AVX1-NEXT: adcq $-1, %rcx +; AVX1-NEXT: shldq $63, %rsi, %rcx +; AVX1-NEXT: shldq $63, %rdi, %rax +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX1-NEXT: shrq %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 ; AVX1-NEXT: shrq %rbp ; AVX1-NEXT: vmovq %rbp, %xmm1 ; AVX1-NEXT: shrq %r13 @@ -1969,27 +1968,26 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX1-NEXT: vmovq %r15, %xmm4 ; AVX1-NEXT: shrq %r14 ; AVX1-NEXT: vmovq %r14, %xmm5 +; AVX1-NEXT: shrq %rbx +; AVX1-NEXT: vmovq %rbx, %xmm6 ; AVX1-NEXT: shrq %r11 -; AVX1-NEXT: vmovq %r11, %xmm6 +; AVX1-NEXT: vmovq %r11, %xmm7 ; AVX1-NEXT: shrq %r10 -; AVX1-NEXT: vmovq %r10, %xmm7 +; AVX1-NEXT: vmovq %r10, %xmm8 ; AVX1-NEXT: shrq %r9 -; AVX1-NEXT: vmovq %r9, %xmm8 +; AVX1-NEXT: vmovq %r9, %xmm9 ; AVX1-NEXT: shrq %r8 -; AVX1-NEXT: vmovq %r8, %xmm9 +; AVX1-NEXT: vmovq %r8, %xmm10 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX1-NEXT: shrq %rsi -; AVX1-NEXT: vmovq %rsi, %xmm10 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm11 -; AVX1-NEXT: vmovq %rbx, %xmm12 -; AVX1-NEXT: vmovq %rdi, %xmm13 +; AVX1-NEXT: vmovq %rsi, %xmm11 +; AVX1-NEXT: vmovq %rax, %xmm12 +; AVX1-NEXT: vmovq %rcx, %xmm13 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: vmovq %rax, %xmm14 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm15 +; AVX1-NEXT: shrq %rdx +; AVX1-NEXT: vmovq %rdx, %xmm15 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] @@ -2030,161 +2028,163 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vmovq %xmm7, %rsi +; AVX2-NEXT: vmovq %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-NEXT: vmovq %xmm8, %r8 -; AVX2-NEXT: vpextrq $1, %xmm8, %r13 -; AVX2-NEXT: vpextrq $1, %xmm2, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %r15 -; AVX2-NEXT: vpextrq $1, %xmm6, %r12 -; AVX2-NEXT: vpextrq $1, %xmm4, %rbx -; AVX2-NEXT: vpextrq $1, %xmm1, %rdi -; AVX2-NEXT: vpextrq $1, %xmm3, %rcx -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: vpextrq $1, %xmm0, %r11 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vmovq %xmm8, %rsi +; AVX2-NEXT: vpextrq $1, %xmm8, %r8 +; AVX2-NEXT: vpextrq $1, %xmm2, %r9 +; AVX2-NEXT: vpextrq $1, %xmm7, %r10 +; AVX2-NEXT: vpextrq $1, %xmm6, %r11 +; AVX2-NEXT: vpextrq $1, %xmm5, %rbx +; AVX2-NEXT: vpextrq $1, %xmm1, %r14 +; AVX2-NEXT: vpextrq $1, %xmm3, %rax +; AVX2-NEXT: vmovq %xmm3, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-NEXT: vpextrq $1, %xmm9, %r9 -; AVX2-NEXT: addq %r13, %r9 -; AVX2-NEXT: movq %r9, 
%r13 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX2-NEXT: vpextrq $1, %xmm9, %r15 +; AVX2-NEXT: addq %r8, %r15 +; AVX2-NEXT: movq %r15, %r8 +; AVX2-NEXT: vpextrq $1, %xmm4, %rdx +; AVX2-NEXT: addq %r9, %rdx ; AVX2-NEXT: vpextrq $1, %xmm8, %r9 -; AVX2-NEXT: addq %r14, %r9 -; AVX2-NEXT: movq %r9, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %r10 -; AVX2-NEXT: addq %r15, %r10 -; AVX2-NEXT: vpextrq $1, %xmm5, %r15 -; AVX2-NEXT: addq %r12, %r15 -; AVX2-NEXT: vpextrq $1, %xmm4, %r12 +; AVX2-NEXT: addq %r10, %r9 +; AVX2-NEXT: movq %r9, %r10 +; AVX2-NEXT: vpextrq $1, %xmm7, %r13 +; AVX2-NEXT: addq %r11, %r13 +; AVX2-NEXT: vpextrq $1, %xmm5, %r12 ; AVX2-NEXT: addq %rbx, %r12 -; AVX2-NEXT: vpextrq $1, %xmm3, %rbp -; AVX2-NEXT: addq %rdi, %rbp -; AVX2-NEXT: vpextrq $1, %xmm6, %r9 -; AVX2-NEXT: addq %rcx, %r9 -; AVX2-NEXT: vmovq %xmm6, %rdi -; AVX2-NEXT: addq %rax, %rdi -; AVX2-NEXT: vpextrq $1, %xmm2, %rcx -; AVX2-NEXT: addq %r11, %rcx -; AVX2-NEXT: vmovq %xmm9, %r11 -; AVX2-NEXT: leaq -1(%r8,%r11), %rax +; AVX2-NEXT: vpextrq $1, %xmm3, %r15 +; AVX2-NEXT: addq %r14, %r15 +; AVX2-NEXT: vpextrq $1, %xmm6, %rbp +; AVX2-NEXT: addq %rax, %rbp +; AVX2-NEXT: vmovq %xmm6, %rbx +; AVX2-NEXT: addq %rcx, %rbx +; AVX2-NEXT: vpextrq $1, %xmm2, %r11 +; AVX2-NEXT: addq %rdi, %r11 +; AVX2-NEXT: vmovq %xmm9, %rax +; AVX2-NEXT: leaq -1(%rsi,%rax), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm8, %r8 -; AVX2-NEXT: leaq -1(%rdx,%r8), %rax +; AVX2-NEXT: vmovq %xmm4, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: leaq -1(%rcx,%rax), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm7, %rdx -; AVX2-NEXT: leaq -1(%rsi,%rdx), %rax +; AVX2-NEXT: vmovq %xmm8, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: leaq -1(%rcx,%rax), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm5, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdx), %rax +; AVX2-NEXT: vmovq %xmm7, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: leaq -1(%rcx,%rax), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm4, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdx), %rax +; AVX2-NEXT: vmovq %xmm5, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: leaq -1(%rcx,%rax), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm3, %rcx +; AVX2-NEXT: leaq -1(%rax,%rcx), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm1, %rdx -; AVX2-NEXT: vmovq %xmm3, %rsi -; AVX2-NEXT: leaq -1(%rdx,%rsi), %rax +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovq %xmm2, %rcx +; AVX2-NEXT: leaq -1(%rax,%rcx), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; AVX2-NEXT: vmovq %xmm0, %rdx -; AVX2-NEXT: vmovq %xmm2, %rsi -; AVX2-NEXT: leaq -1(%rdx,%rsi), %rdx +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: addq $-1, %r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %r9d +; AVX2-NEXT: adcq $-1, %r9 +; AVX2-NEXT: addq $-1, %rdx ; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %r8d +; AVX2-NEXT: adcq $-1, %r8 +; AVX2-NEXT: addq $-1, %r10 +; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %edi +; AVX2-NEXT: adcq $-1, %rdi ; AVX2-NEXT: addq $-1, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %edx -; AVX2-NEXT: adcq $-1, %rdx -; AVX2-NEXT: addq $-1, %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %esi ; AVX2-NEXT: adcq $-1, %rsi -; AVX2-NEXT: addq $-1, %r10 -; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r8d -; AVX2-NEXT: adcq $-1, %r8 +; AVX2-NEXT: addq $-1, %r12 +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: adcq $-1, %rdx ; AVX2-NEXT: addq $-1, %r15 +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: adcq $-1, %rcx +; AVX2-NEXT: addq $-1, %rbp ; AVX2-NEXT: movl $0, %r10d ; AVX2-NEXT: adcq $-1, %r10 -; AVX2-NEXT: addq $-1, %r12 -; AVX2-NEXT: movl $0, %ebx -; AVX2-NEXT: adcq $-1, %rbx -; AVX2-NEXT: addq $-1, %rbp -; AVX2-NEXT: movl $0, %r14d -; AVX2-NEXT: adcq $-1, %r14 -; AVX2-NEXT: addq $-1, %r9 -; AVX2-NEXT: movl $0, %r13d -; AVX2-NEXT: adcq $-1, %r13 -; AVX2-NEXT: addq $-1, %rdi -; AVX2-NEXT: movl $0, %r11d -; AVX2-NEXT: adcq $-1, %r11 -; AVX2-NEXT: addq $-1, %rcx +; AVX2-NEXT: addq $-1, %rbx ; AVX2-NEXT: movl $0, %eax ; AVX2-NEXT: adcq $-1, %rax -; AVX2-NEXT: shldq $63, %rcx, %rax -; AVX2-NEXT: shldq $63, %rdi, %r11 -; AVX2-NEXT: shldq $63, %r9, %r13 -; AVX2-NEXT: shldq $63, %rbp, %r14 -; AVX2-NEXT: shldq $63, %r12, %rbx -; AVX2-NEXT: shldq $63, %r15, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vmovq %rsi, %xmm2 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm3 -; AVX2-NEXT: vmovq %r8, %xmm4 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm5 -; AVX2-NEXT: vmovq %r10, %xmm6 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm7 -; AVX2-NEXT: vmovq %rbx, %xmm8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm9 -; AVX2-NEXT: vmovq %r14, %xmm10 +; AVX2-NEXT: addq $-1, %r11 +; AVX2-NEXT: adcq $-1, %r14 +; AVX2-NEXT: shldq $63, %r11, %r14 +; AVX2-NEXT: shldq $63, %rbx, %rax +; AVX2-NEXT: shldq $63, %rbp, %r10 +; AVX2-NEXT: shldq $63, %r15, %rcx +; AVX2-NEXT: shldq $63, %r12, %rdx +; AVX2-NEXT: shldq $63, %r13, %rsi +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: shldq $63, %r11, %rdi +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: shldq $63, %r11, 
%r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: shldq $63, %r11, %r9 +; AVX2-NEXT: vmovq %r9, %xmm0 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: shrq %r9 +; AVX2-NEXT: vmovq %r9, %xmm1 +; AVX2-NEXT: vmovq %r8, %xmm2 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: shrq %r8 +; AVX2-NEXT: vmovq %r8, %xmm3 +; AVX2-NEXT: vmovq %rdi, %xmm4 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX2-NEXT: shrq %rdi +; AVX2-NEXT: vmovq %rdi, %xmm5 +; AVX2-NEXT: vmovq %rsi, %xmm6 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: shrq %rsi +; AVX2-NEXT: vmovq %rsi, %xmm7 +; AVX2-NEXT: vmovq %rdx, %xmm8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: shrq %rdx +; AVX2-NEXT: vmovq %rdx, %xmm9 +; AVX2-NEXT: vmovq %rcx, %xmm10 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm11 -; AVX2-NEXT: vmovq %r13, %xmm12 -; AVX2-NEXT: vmovq %r11, %xmm13 -; AVX2-NEXT: vmovq %rax, %xmm14 +; AVX2-NEXT: vmovq %r10, %xmm12 +; AVX2-NEXT: vmovq %rax, %xmm13 +; AVX2-NEXT: vmovq %r14, %xmm14 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: vmovq %rax, %xmm15 @@ -2229,7 +2229,7 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -2238,125 +2238,125 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX512-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX512-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vmovq %xmm3, %r13 +; AVX512-NEXT: vmovq %xmm3, %rdx ; AVX512-NEXT: vpextrq $1, %xmm3, %rsi ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vmovq %xmm3, %rdi -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-NEXT: vmovq %xmm5, %r8 +; AVX512-NEXT: vmovq %xmm3, %r9 +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-NEXT: vmovq %xmm4, %r10 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vmovq 
%xmm2, %r9 -; AVX512-NEXT: vpextrq $1, %xmm2, %r10 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512-NEXT: vmovq %xmm2, %r11 -; AVX512-NEXT: vpextrq $1, %xmm2, %rbx -; AVX512-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512-NEXT: vpextrq $1, %xmm3, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vpextrq $1, %xmm0, %r14 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512-NEXT: vpextrq $1, %xmm2, %r14 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, %r15 +; AVX512-NEXT: vpextrq $1, %xmm2, %r12 +; AVX512-NEXT: vpextrq $1, %xmm4, %rax +; AVX512-NEXT: vpextrq $1, %xmm3, %rbx +; AVX512-NEXT: vpextrq $1, %xmm1, %r13 +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm2 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm4 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm9 ; AVX512-NEXT: vpextrq $1, %xmm8, %rbp -; AVX512-NEXT: addq %rdx, %rbp -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: addq %rcx, %rdx -; AVX512-NEXT: vpextrq $1, %xmm3, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: addq %r14, %rax -; AVX512-NEXT: vpextrq $1, %xmm9, %r14 -; AVX512-NEXT: leaq -1(%rbx,%r14), %r12 -; AVX512-NEXT: vmovq %xmm9, %rbx -; AVX512-NEXT: leaq -1(%r11,%rbx), %r15 -; AVX512-NEXT: vpextrq $1, %xmm7, %r11 +; AVX512-NEXT: addq %rax, %rbp +; AVX512-NEXT: vpextrq $1, %xmm7, %r8 +; AVX512-NEXT: addq %rbx, %r8 +; AVX512-NEXT: vpextrq $1, %xmm3, %rbx +; AVX512-NEXT: addq %r13, %rbx +; AVX512-NEXT: vpextrq $1, %xmm2, %rdi +; AVX512-NEXT: addq %rcx, %rdi +; AVX512-NEXT: vpextrq $1, %xmm9, %r13 +; AVX512-NEXT: leaq -1(%r12,%r13), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm9, %r12 +; AVX512-NEXT: leaq -1(%r15,%r12), %r13 +; AVX512-NEXT: vpextrq $1, %xmm5, %r15 +; AVX512-NEXT: leaq -1(%r14,%r15), %r12 +; AVX512-NEXT: vmovq %xmm5, %r14 +; AVX512-NEXT: leaq -1(%r11,%r14), %r15 +; AVX512-NEXT: vmovq %xmm8, %r11 ; AVX512-NEXT: leaq -1(%r10,%r11), %r14 ; AVX512-NEXT: vmovq %xmm7, %r10 -; AVX512-NEXT: leaq -1(%r9,%r10), %rbx 
-; AVX512-NEXT: vmovq %xmm8, %r9 -; AVX512-NEXT: leaq -1(%r8,%r9), %r11 -; AVX512-NEXT: vmovq %xmm4, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %r10 -; AVX512-NEXT: vpextrq $1, %xmm6, %rdi -; AVX512-NEXT: leaq -1(%rsi,%rdi), %r9 +; AVX512-NEXT: leaq -1(%r9,%r10), %r11 +; AVX512-NEXT: vpextrq $1, %xmm6, %r9 +; AVX512-NEXT: leaq -1(%rsi,%r9), %r10 ; AVX512-NEXT: vmovq %xmm6, %rsi -; AVX512-NEXT: leaq -1(%r13,%rsi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: leaq -1(%rdi,%rsi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm5, %rsi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: leaq -1(%rdi,%rsi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: vmovq %xmm3, %rdi -; AVX512-NEXT: leaq -1(%rsi,%rdi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: vmovq %xmm2, %rdi -; AVX512-NEXT: leaq -1(%rsi,%rdi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: xorl %r8d, %r8d +; AVX512-NEXT: leaq -1(%rdx,%rsi), %r9 +; AVX512-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: leaq -1(%rax,%rdx), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm4, %rcx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: leaq -1(%rax,%rcx), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm1, %rcx +; AVX512-NEXT: vmovq %xmm3, %rdx +; AVX512-NEXT: leaq -1(%rcx,%rdx), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: vmovq %xmm2, %rdx +; AVX512-NEXT: leaq -1(%rcx,%rdx), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: xorl %esi, %esi ; AVX512-NEXT: addq $-1, %rbp -; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: adcq $-1, %rdx +; AVX512-NEXT: addq $-1, %r8 +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: addq $-1, %rbx +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: adcq $-1, %rcx +; AVX512-NEXT: addq $-1, %rdi ; AVX512-NEXT: adcq $-1, %rsi -; AVX512-NEXT: addq $-1, %rdx -; AVX512-NEXT: movl $0, %edi -; AVX512-NEXT: adcq $-1, %rdi -; AVX512-NEXT: addq $-1, %rcx -; AVX512-NEXT: movl $0, %r13d -; AVX512-NEXT: adcq $-1, %r13 -; AVX512-NEXT: addq $-1, %rax -; AVX512-NEXT: adcq $-1, %r8 -; AVX512-NEXT: shldq $63, %rax, %r8 -; AVX512-NEXT: shldq $63, %rcx, %r13 -; AVX512-NEXT: shldq $63, %rdx, %rdi -; AVX512-NEXT: shldq $63, %rbp, %rsi +; AVX512-NEXT: shldq $63, %rdi, %rsi +; AVX512-NEXT: shldq $63, %rbx, %rcx +; AVX512-NEXT: shldq $63, %r8, %rax +; AVX512-NEXT: shldq $63, %rbp, %rdx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX512-NEXT: shrq %rdi +; AVX512-NEXT: vmovq %rdi, %xmm0 +; AVX512-NEXT: shrq %r13 +; AVX512-NEXT: vmovq %r13, %xmm1 ; AVX512-NEXT: shrq %r12 -; AVX512-NEXT: vmovq %r12, %xmm0 +; AVX512-NEXT: vmovq %r12, %xmm2 ; AVX512-NEXT: shrq %r15 -; AVX512-NEXT: vmovq %r15, %xmm1 +; AVX512-NEXT: vmovq %r15, %xmm3 +; AVX512-NEXT: vmovq %rdx, %xmm4 ; AVX512-NEXT: shrq %r14 -; AVX512-NEXT: vmovq %r14, %xmm2 -; AVX512-NEXT: shrq %rbx -; 
AVX512-NEXT: vmovq %rbx, %xmm3 -; AVX512-NEXT: vmovq %rsi, %xmm4 +; AVX512-NEXT: vmovq %r14, %xmm5 +; AVX512-NEXT: vmovq %rax, %xmm6 ; AVX512-NEXT: shrq %r11 -; AVX512-NEXT: vmovq %r11, %xmm5 -; AVX512-NEXT: vmovq %rdi, %xmm6 +; AVX512-NEXT: vmovq %r11, %xmm7 ; AVX512-NEXT: shrq %r10 -; AVX512-NEXT: vmovq %r10, %xmm7 +; AVX512-NEXT: vmovq %r10, %xmm8 ; AVX512-NEXT: shrq %r9 -; AVX512-NEXT: vmovq %r9, %xmm8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm9 +; AVX512-NEXT: vmovq %r9, %xmm9 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: shrq %rax ; AVX512-NEXT: vmovq %rax, %xmm10 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: shrq %rax ; AVX512-NEXT: vmovq %rax, %xmm11 -; AVX512-NEXT: vmovq %r13, %xmm12 +; AVX512-NEXT: vmovq %rcx, %xmm12 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: shrq %rax ; AVX512-NEXT: vmovq %rax, %xmm13 -; AVX512-NEXT: vmovq %r8, %xmm14 +; AVX512-NEXT: vmovq %rsi, %xmm14 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: shrq %rax ; AVX512-NEXT: vmovq %rax, %xmm15 diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index b39b089faa2a5e..9d28aa5410a1b4 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -910,84 +910,84 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: kmovw %r10d, %k2 ; KNL-NEXT: kandw %k1, %k2, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftrw $3, %k0, %k1 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: kmovw %k1, %r12d ; KNL-NEXT: kshiftrw $4, %k0, %k1 -; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftrw $5, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftrw $6, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftrw $7, %k0, %k1 ; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftrw $8, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftrw $9, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r14d +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftrw $10, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftrw $11, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftrw $12, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftrw $13, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftrw $14, %k0, %k1 -; KNL-NEXT: andl $1, %edx -; KNL-NEXT: movb %dl, 2(%rax) -; KNL-NEXT: kmovw %k0, %edx -; KNL-NEXT: andl $1, %edx -; KNL-NEXT: andl $1, %r9d -; KNL-NEXT: leal (%rdx,%r9,2), %r9d -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: andl $1, %r13d +; KNL-NEXT: movb %r13b, 2(%rax) +; KNL-NEXT: kmovw %k0, %r13d +; KNL-NEXT: andl $1, %r13d +; KNL-NEXT: andl $1, %r11d +; KNL-NEXT: leal (%r13,%r11,2), %r13d +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: andl $1, %r8d -; KNL-NEXT: leal (%r9,%r8,4), %r9d -; KNL-NEXT: kmovw %k0, %r8d -; KNL-NEXT: andl $1, %esi -; KNL-NEXT: leal (%r9,%rsi,8), %esi -; KNL-NEXT: andl $1, 
%edi -; KNL-NEXT: shll $4, %edi -; KNL-NEXT: orl %esi, %edi -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: shll $5, %ecx -; KNL-NEXT: orl %edi, %ecx -; KNL-NEXT: andl $1, %r10d -; KNL-NEXT: shll $6, %r10d -; KNL-NEXT: andl $1, %ebx -; KNL-NEXT: shll $7, %ebx -; KNL-NEXT: orl %r10d, %ebx -; KNL-NEXT: andl $1, %ebp -; KNL-NEXT: shll $8, %ebp -; KNL-NEXT: orl %ebx, %ebp ; KNL-NEXT: andl $1, %r14d -; KNL-NEXT: shll $9, %r14d -; KNL-NEXT: orl %ebp, %r14d -; KNL-NEXT: andl $1, %r11d -; KNL-NEXT: shll $10, %r11d -; KNL-NEXT: orl %r14d, %r11d -; KNL-NEXT: orl %ecx, %r11d -; KNL-NEXT: andl $1, %r15d -; KNL-NEXT: shll $11, %r15d +; KNL-NEXT: leal (%r13,%r14,4), %r13d +; KNL-NEXT: kmovw %k0, %r14d ; KNL-NEXT: andl $1, %r12d -; KNL-NEXT: shll $12, %r12d -; KNL-NEXT: orl %r15d, %r12d -; KNL-NEXT: andl $1, %r13d -; KNL-NEXT: shll $13, %r13d -; KNL-NEXT: orl %r12d, %r13d +; KNL-NEXT: leal (%r13,%r12,8), %r12d +; KNL-NEXT: andl $1, %r15d +; KNL-NEXT: shll $4, %r15d +; KNL-NEXT: orl %r12d, %r15d ; KNL-NEXT: andl $1, %edx -; KNL-NEXT: shll $14, %edx -; KNL-NEXT: orl %r13d, %edx +; KNL-NEXT: shll $5, %edx +; KNL-NEXT: orl %r15d, %edx +; KNL-NEXT: andl $1, %ebp +; KNL-NEXT: shll $6, %ebp +; KNL-NEXT: andl $1, %ebx +; KNL-NEXT: shll $7, %ebx +; KNL-NEXT: orl %ebp, %ebx +; KNL-NEXT: andl $1, %r10d +; KNL-NEXT: shll $8, %r10d +; KNL-NEXT: orl %ebx, %r10d +; KNL-NEXT: andl $1, %r9d +; KNL-NEXT: shll $9, %r9d +; KNL-NEXT: orl %r10d, %r9d +; KNL-NEXT: andl $1, %ecx +; KNL-NEXT: shll $10, %ecx +; KNL-NEXT: orl %r9d, %ecx +; KNL-NEXT: orl %edx, %ecx ; KNL-NEXT: andl $1, %r8d -; KNL-NEXT: shll $15, %r8d -; KNL-NEXT: orl %edx, %r8d -; KNL-NEXT: orl %r11d, %r8d -; KNL-NEXT: movw %r8w, (%rax) +; KNL-NEXT: shll $11, %r8d +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: shll $12, %edi +; KNL-NEXT: orl %r8d, %edi +; KNL-NEXT: andl $1, %esi +; KNL-NEXT: shll $13, %esi +; KNL-NEXT: orl %edi, %esi +; KNL-NEXT: andl $1, %r11d +; KNL-NEXT: shll $14, %r11d +; KNL-NEXT: orl %esi, %r11d +; KNL-NEXT: andl $1, %r14d +; KNL-NEXT: shll $15, %r14d +; KNL-NEXT: orl %r11d, %r14d +; KNL-NEXT: orl %ecx, %r14d +; KNL-NEXT: movw %r14w, (%rax) ; KNL-NEXT: popq %rbx ; KNL-NEXT: popq %r12 ; KNL-NEXT: popq %r13 @@ -1085,16 +1085,16 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kshiftrd $21, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: movl $-2049, %edi ## imm = 0xF7FF -; SKX-NEXT: kmovd %edi, %k6 -; SKX-NEXT: kandd %k6, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftld $31, %k1, %k1 ; SKX-NEXT: kshiftrd $20, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: movl $-4097, %edi ## imm = 0xEFFF -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SKX-NEXT: kandd %k1, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k6 +; SKX-NEXT: kandd %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftld $31, %k1, %k1 ; SKX-NEXT: kshiftrd $19, %k1, %k1 @@ -1190,14 +1190,14 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kshiftrd $21, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kandd %k6, %k0, %k0 +; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload +; SKX-NEXT: kandd %k7, %k0, %k0 ; SKX-NEXT: kshiftld $31, %k1, %k1 ; SKX-NEXT: kshiftrd $20, %k1, %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; SKX-NEXT: kord %k1, 
%k0, %k0 -; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload -; SKX-NEXT: kandd %k1, %k0, %k0 -; SKX-NEXT: kshiftld $31, %k6, %k1 +; SKX-NEXT: kandd %k6, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k7, %k1 ; SKX-NEXT: kshiftrd $19, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kandd %k5, %k0, %k0 @@ -1223,84 +1223,84 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload ; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k1 -; SKX-NEXT: kmovd %k1, %edx +; SKX-NEXT: kmovd %k1, %r13d ; SKX-NEXT: kshiftrd $1, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: kmovd %k1, %r11d ; SKX-NEXT: kshiftrd $2, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r8d +; SKX-NEXT: kmovd %k1, %r14d ; SKX-NEXT: kshiftrd $3, %k0, %k1 -; SKX-NEXT: kmovd %k1, %esi +; SKX-NEXT: kmovd %k1, %r12d ; SKX-NEXT: kshiftrd $4, %k0, %k1 -; SKX-NEXT: kmovd %k1, %edi +; SKX-NEXT: kmovd %k1, %r15d ; SKX-NEXT: kshiftrd $5, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ecx +; SKX-NEXT: kmovd %k1, %edx ; SKX-NEXT: kshiftrd $6, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r10d +; SKX-NEXT: kmovd %k1, %ebp ; SKX-NEXT: kshiftrd $7, %k0, %k1 ; SKX-NEXT: kmovd %k1, %ebx ; SKX-NEXT: kshiftrd $8, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ebp +; SKX-NEXT: kmovd %k1, %r10d ; SKX-NEXT: kshiftrd $9, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r14d +; SKX-NEXT: kmovd %k1, %r9d ; SKX-NEXT: kshiftrd $10, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r11d +; SKX-NEXT: kmovd %k1, %ecx ; SKX-NEXT: kshiftrd $11, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r15d +; SKX-NEXT: kmovd %k1, %r8d ; SKX-NEXT: kshiftrd $12, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r12d +; SKX-NEXT: kmovd %k1, %edi ; SKX-NEXT: kshiftrd $13, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r13d +; SKX-NEXT: kmovd %k1, %esi ; SKX-NEXT: kshiftrd $14, %k0, %k1 -; SKX-NEXT: andl $1, %edx -; SKX-NEXT: movb %dl, 2(%rax) -; SKX-NEXT: kmovd %k0, %edx -; SKX-NEXT: andl $1, %edx -; SKX-NEXT: andl $1, %r9d -; SKX-NEXT: leal (%rdx,%r9,2), %r9d -; SKX-NEXT: kmovd %k1, %edx +; SKX-NEXT: andl $1, %r13d +; SKX-NEXT: movb %r13b, 2(%rax) +; SKX-NEXT: kmovd %k0, %r13d +; SKX-NEXT: andl $1, %r13d +; SKX-NEXT: andl $1, %r11d +; SKX-NEXT: leal (%r13,%r11,2), %r13d +; SKX-NEXT: kmovd %k1, %r11d ; SKX-NEXT: kshiftrd $15, %k0, %k0 -; SKX-NEXT: andl $1, %r8d -; SKX-NEXT: leal (%r9,%r8,4), %r9d -; SKX-NEXT: kmovd %k0, %r8d -; SKX-NEXT: andl $1, %esi -; SKX-NEXT: leal (%r9,%rsi,8), %esi -; SKX-NEXT: andl $1, %edi -; SKX-NEXT: shll $4, %edi -; SKX-NEXT: orl %esi, %edi -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: shll $5, %ecx -; SKX-NEXT: orl %edi, %ecx -; SKX-NEXT: andl $1, %r10d -; SKX-NEXT: shll $6, %r10d -; SKX-NEXT: andl $1, %ebx -; SKX-NEXT: shll $7, %ebx -; SKX-NEXT: orl %r10d, %ebx -; SKX-NEXT: andl $1, %ebp -; SKX-NEXT: shll $8, %ebp -; SKX-NEXT: orl %ebx, %ebp ; SKX-NEXT: andl $1, %r14d -; SKX-NEXT: shll $9, %r14d -; SKX-NEXT: orl %ebp, %r14d -; SKX-NEXT: andl $1, %r11d -; SKX-NEXT: shll $10, %r11d -; SKX-NEXT: orl %r14d, %r11d -; SKX-NEXT: orl %ecx, %r11d -; SKX-NEXT: andl $1, %r15d -; SKX-NEXT: shll $11, %r15d +; SKX-NEXT: leal (%r13,%r14,4), %r13d +; SKX-NEXT: kmovd %k0, %r14d ; SKX-NEXT: andl $1, %r12d -; SKX-NEXT: shll $12, %r12d -; SKX-NEXT: orl %r15d, %r12d -; SKX-NEXT: andl $1, %r13d -; SKX-NEXT: shll $13, %r13d -; SKX-NEXT: orl %r12d, %r13d +; SKX-NEXT: leal (%r13,%r12,8), %r12d +; SKX-NEXT: andl $1, %r15d +; SKX-NEXT: shll $4, %r15d +; SKX-NEXT: orl %r12d, %r15d ; SKX-NEXT: andl $1, %edx -; SKX-NEXT: shll $14, %edx -; SKX-NEXT: orl %r13d, %edx +; SKX-NEXT: shll $5, %edx +; SKX-NEXT: 
orl %r15d, %edx +; SKX-NEXT: andl $1, %ebp +; SKX-NEXT: shll $6, %ebp +; SKX-NEXT: andl $1, %ebx +; SKX-NEXT: shll $7, %ebx +; SKX-NEXT: orl %ebp, %ebx +; SKX-NEXT: andl $1, %r10d +; SKX-NEXT: shll $8, %r10d +; SKX-NEXT: orl %ebx, %r10d +; SKX-NEXT: andl $1, %r9d +; SKX-NEXT: shll $9, %r9d +; SKX-NEXT: orl %r10d, %r9d +; SKX-NEXT: andl $1, %ecx +; SKX-NEXT: shll $10, %ecx +; SKX-NEXT: orl %r9d, %ecx +; SKX-NEXT: orl %edx, %ecx ; SKX-NEXT: andl $1, %r8d -; SKX-NEXT: shll $15, %r8d -; SKX-NEXT: orl %edx, %r8d -; SKX-NEXT: orl %r11d, %r8d -; SKX-NEXT: movw %r8w, (%rax) +; SKX-NEXT: shll $11, %r8d +; SKX-NEXT: andl $1, %edi +; SKX-NEXT: shll $12, %edi +; SKX-NEXT: orl %r8d, %edi +; SKX-NEXT: andl $1, %esi +; SKX-NEXT: shll $13, %esi +; SKX-NEXT: orl %edi, %esi +; SKX-NEXT: andl $1, %r11d +; SKX-NEXT: shll $14, %r11d +; SKX-NEXT: orl %esi, %r11d +; SKX-NEXT: andl $1, %r14d +; SKX-NEXT: shll $15, %r14d +; SKX-NEXT: orl %r11d, %r14d +; SKX-NEXT: orl %ecx, %r14d +; SKX-NEXT: movw %r14w, (%rax) ; SKX-NEXT: popq %rbx ; SKX-NEXT: popq %r12 ; SKX-NEXT: popq %r13 @@ -1556,9 +1556,9 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: kshiftrw $1, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %ebp ; KNL_X32-NEXT: kshiftrw $2, %k0, %k1 -; KNL_X32-NEXT: kmovw %k1, %esi -; KNL_X32-NEXT: kshiftrw $3, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %edi +; KNL_X32-NEXT: kshiftrw $3, %k0, %k1 +; KNL_X32-NEXT: kmovw %k1, %esi ; KNL_X32-NEXT: kshiftrw $4, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %edx ; KNL_X32-NEXT: kshiftrw $5, %k0, %k1 @@ -1572,18 +1572,18 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: leal (%ebx,%ebp,2), %ebx ; KNL_X32-NEXT: kmovw %k1, %ebp ; KNL_X32-NEXT: kshiftrw $7, %k0, %k1 +; KNL_X32-NEXT: andl $1, %edi +; KNL_X32-NEXT: leal (%ebx,%edi,4), %edi +; KNL_X32-NEXT: kmovw %k1, %ebx +; KNL_X32-NEXT: kshiftrw $8, %k0, %k1 ; KNL_X32-NEXT: andl $1, %esi -; KNL_X32-NEXT: leal (%ebx,%esi,4), %ebx +; KNL_X32-NEXT: leal (%edi,%esi,8), %edi ; KNL_X32-NEXT: kmovw %k1, %esi -; KNL_X32-NEXT: kshiftrw $8, %k0, %k1 -; KNL_X32-NEXT: andl $1, %edi -; KNL_X32-NEXT: leal (%ebx,%edi,8), %ebx -; KNL_X32-NEXT: kmovw %k1, %edi ; KNL_X32-NEXT: kshiftrw $9, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edx ; KNL_X32-NEXT: shll $4, %edx -; KNL_X32-NEXT: orl %ebx, %edx -; KNL_X32-NEXT: kmovw %k1, %ebx +; KNL_X32-NEXT: orl %edi, %edx +; KNL_X32-NEXT: kmovw %k1, %edi ; KNL_X32-NEXT: kshiftrw $10, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ecx ; KNL_X32-NEXT: shll $5, %ecx @@ -1592,42 +1592,42 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: kshiftrw $11, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ebp ; KNL_X32-NEXT: shll $6, %ebp -; KNL_X32-NEXT: andl $1, %esi -; KNL_X32-NEXT: shll $7, %esi -; KNL_X32-NEXT: orl %ebp, %esi +; KNL_X32-NEXT: andl $1, %ebx +; KNL_X32-NEXT: shll $7, %ebx +; KNL_X32-NEXT: orl %ebp, %ebx ; KNL_X32-NEXT: kmovw %k1, %ebp ; KNL_X32-NEXT: kshiftrw $12, %k0, %k1 +; KNL_X32-NEXT: andl $1, %esi +; KNL_X32-NEXT: shll $8, %esi +; KNL_X32-NEXT: orl %ebx, %esi +; KNL_X32-NEXT: kmovw %k1, %ebx +; KNL_X32-NEXT: kshiftrw $13, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edi -; KNL_X32-NEXT: shll $8, %edi +; KNL_X32-NEXT: shll $9, %edi ; KNL_X32-NEXT: orl %esi, %edi ; KNL_X32-NEXT: kmovw %k1, %esi -; KNL_X32-NEXT: kshiftrw $13, %k0, %k1 -; KNL_X32-NEXT: andl $1, %ebx -; KNL_X32-NEXT: shll $9, %ebx -; KNL_X32-NEXT: orl %edi, %ebx -; KNL_X32-NEXT: kmovw %k1, %edi ; KNL_X32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edx ; KNL_X32-NEXT: shll $10, %edx -; 
KNL_X32-NEXT: orl %ebx, %edx -; KNL_X32-NEXT: kmovw %k1, %ebx +; KNL_X32-NEXT: orl %edi, %edx +; KNL_X32-NEXT: kmovw %k1, %edi ; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 ; KNL_X32-NEXT: orl %ecx, %edx ; KNL_X32-NEXT: kmovw %k0, %ecx ; KNL_X32-NEXT: andl $1, %ebp ; KNL_X32-NEXT: shll $11, %ebp +; KNL_X32-NEXT: andl $1, %ebx +; KNL_X32-NEXT: shll $12, %ebx +; KNL_X32-NEXT: orl %ebp, %ebx ; KNL_X32-NEXT: andl $1, %esi -; KNL_X32-NEXT: shll $12, %esi -; KNL_X32-NEXT: orl %ebp, %esi +; KNL_X32-NEXT: shll $13, %esi +; KNL_X32-NEXT: orl %ebx, %esi ; KNL_X32-NEXT: andl $1, %edi -; KNL_X32-NEXT: shll $13, %edi +; KNL_X32-NEXT: shll $14, %edi ; KNL_X32-NEXT: orl %esi, %edi -; KNL_X32-NEXT: andl $1, %ebx -; KNL_X32-NEXT: shll $14, %ebx -; KNL_X32-NEXT: orl %edi, %ebx ; KNL_X32-NEXT: andl $1, %ecx ; KNL_X32-NEXT: shll $15, %ecx -; KNL_X32-NEXT: orl %ebx, %ecx +; KNL_X32-NEXT: orl %edi, %ecx ; KNL_X32-NEXT: orl %edx, %ecx ; KNL_X32-NEXT: movw %cx, (%eax) ; KNL_X32-NEXT: addl $16, %esp @@ -1726,16 +1726,16 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kshiftrd $21, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: movl $-2049, %edi ## imm = 0xF7FF -; FASTISEL-NEXT: kmovd %edi, %k6 -; FASTISEL-NEXT: kandd %k6, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k1 +; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; FASTISEL-NEXT: kandd %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftld $31, %k1, %k1 ; FASTISEL-NEXT: kshiftrd $20, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: movl $-4097, %edi ## imm = 0xEFFF -; FASTISEL-NEXT: kmovd %edi, %k1 -; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; FASTISEL-NEXT: kandd %k1, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k6 +; FASTISEL-NEXT: kandd %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftld $31, %k1, %k1 ; FASTISEL-NEXT: kshiftrd $19, %k1, %k1 @@ -1831,14 +1831,14 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kshiftrd $21, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kandd %k6, %k0, %k0 +; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload +; FASTISEL-NEXT: kandd %k7, %k0, %k0 ; FASTISEL-NEXT: kshiftld $31, %k1, %k1 ; FASTISEL-NEXT: kshiftrd $20, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: kord %k1, %k0, %k0 -; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload -; FASTISEL-NEXT: kandd %k1, %k0, %k0 -; FASTISEL-NEXT: kshiftld $31, %k6, %k1 +; FASTISEL-NEXT: kandd %k6, %k0, %k0 +; FASTISEL-NEXT: kshiftld $31, %k7, %k1 ; FASTISEL-NEXT: kshiftrd $19, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: kandd %k5, %k0, %k0 @@ -1864,84 +1864,84 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload ; FASTISEL-NEXT: kandd %k1, %k0, %k0 ; FASTISEL-NEXT: kshiftrd $16, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %edx +; FASTISEL-NEXT: kmovd %k1, %r13d ; FASTISEL-NEXT: kshiftrd $1, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r9d +; FASTISEL-NEXT: kmovd %k1, %r11d ; FASTISEL-NEXT: kshiftrd $2, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r8d +; FASTISEL-NEXT: kmovd %k1, %r14d ; FASTISEL-NEXT: kshiftrd $3, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %esi +; FASTISEL-NEXT: kmovd %k1, %r12d ; FASTISEL-NEXT: kshiftrd $4, %k0, %k1 -; FASTISEL-NEXT: 
kmovd %k1, %edi +; FASTISEL-NEXT: kmovd %k1, %r15d ; FASTISEL-NEXT: kshiftrd $5, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %ecx +; FASTISEL-NEXT: kmovd %k1, %edx ; FASTISEL-NEXT: kshiftrd $6, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r10d +; FASTISEL-NEXT: kmovd %k1, %ebp ; FASTISEL-NEXT: kshiftrd $7, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %ebx ; FASTISEL-NEXT: kshiftrd $8, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %ebp +; FASTISEL-NEXT: kmovd %k1, %r10d ; FASTISEL-NEXT: kshiftrd $9, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r14d +; FASTISEL-NEXT: kmovd %k1, %r9d ; FASTISEL-NEXT: kshiftrd $10, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r11d +; FASTISEL-NEXT: kmovd %k1, %ecx ; FASTISEL-NEXT: kshiftrd $11, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r15d +; FASTISEL-NEXT: kmovd %k1, %r8d ; FASTISEL-NEXT: kshiftrd $12, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r12d +; FASTISEL-NEXT: kmovd %k1, %edi ; FASTISEL-NEXT: kshiftrd $13, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r13d +; FASTISEL-NEXT: kmovd %k1, %esi ; FASTISEL-NEXT: kshiftrd $14, %k0, %k1 -; FASTISEL-NEXT: andl $1, %edx -; FASTISEL-NEXT: movb %dl, 2(%rax) -; FASTISEL-NEXT: kmovd %k0, %edx -; FASTISEL-NEXT: andl $1, %edx -; FASTISEL-NEXT: andl $1, %r9d -; FASTISEL-NEXT: leal (%rdx,%r9,2), %r9d -; FASTISEL-NEXT: kmovd %k1, %edx +; FASTISEL-NEXT: andl $1, %r13d +; FASTISEL-NEXT: movb %r13b, 2(%rax) +; FASTISEL-NEXT: kmovd %k0, %r13d +; FASTISEL-NEXT: andl $1, %r13d +; FASTISEL-NEXT: andl $1, %r11d +; FASTISEL-NEXT: leal (%r13,%r11,2), %r13d +; FASTISEL-NEXT: kmovd %k1, %r11d ; FASTISEL-NEXT: kshiftrd $15, %k0, %k0 -; FASTISEL-NEXT: andl $1, %r8d -; FASTISEL-NEXT: leal (%r9,%r8,4), %r9d -; FASTISEL-NEXT: kmovd %k0, %r8d -; FASTISEL-NEXT: andl $1, %esi -; FASTISEL-NEXT: leal (%r9,%rsi,8), %esi -; FASTISEL-NEXT: andl $1, %edi -; FASTISEL-NEXT: shll $4, %edi -; FASTISEL-NEXT: orl %esi, %edi -; FASTISEL-NEXT: andl $1, %ecx -; FASTISEL-NEXT: shll $5, %ecx -; FASTISEL-NEXT: orl %edi, %ecx -; FASTISEL-NEXT: andl $1, %r10d -; FASTISEL-NEXT: shll $6, %r10d -; FASTISEL-NEXT: andl $1, %ebx -; FASTISEL-NEXT: shll $7, %ebx -; FASTISEL-NEXT: orl %r10d, %ebx -; FASTISEL-NEXT: andl $1, %ebp -; FASTISEL-NEXT: shll $8, %ebp -; FASTISEL-NEXT: orl %ebx, %ebp ; FASTISEL-NEXT: andl $1, %r14d -; FASTISEL-NEXT: shll $9, %r14d -; FASTISEL-NEXT: orl %ebp, %r14d -; FASTISEL-NEXT: andl $1, %r11d -; FASTISEL-NEXT: shll $10, %r11d -; FASTISEL-NEXT: orl %r14d, %r11d -; FASTISEL-NEXT: orl %ecx, %r11d -; FASTISEL-NEXT: andl $1, %r15d -; FASTISEL-NEXT: shll $11, %r15d +; FASTISEL-NEXT: leal (%r13,%r14,4), %r13d +; FASTISEL-NEXT: kmovd %k0, %r14d ; FASTISEL-NEXT: andl $1, %r12d -; FASTISEL-NEXT: shll $12, %r12d -; FASTISEL-NEXT: orl %r15d, %r12d -; FASTISEL-NEXT: andl $1, %r13d -; FASTISEL-NEXT: shll $13, %r13d -; FASTISEL-NEXT: orl %r12d, %r13d +; FASTISEL-NEXT: leal (%r13,%r12,8), %r12d +; FASTISEL-NEXT: andl $1, %r15d +; FASTISEL-NEXT: shll $4, %r15d +; FASTISEL-NEXT: orl %r12d, %r15d ; FASTISEL-NEXT: andl $1, %edx -; FASTISEL-NEXT: shll $14, %edx -; FASTISEL-NEXT: orl %r13d, %edx +; FASTISEL-NEXT: shll $5, %edx +; FASTISEL-NEXT: orl %r15d, %edx +; FASTISEL-NEXT: andl $1, %ebp +; FASTISEL-NEXT: shll $6, %ebp +; FASTISEL-NEXT: andl $1, %ebx +; FASTISEL-NEXT: shll $7, %ebx +; FASTISEL-NEXT: orl %ebp, %ebx +; FASTISEL-NEXT: andl $1, %r10d +; FASTISEL-NEXT: shll $8, %r10d +; FASTISEL-NEXT: orl %ebx, %r10d +; FASTISEL-NEXT: andl $1, %r9d +; FASTISEL-NEXT: shll $9, %r9d +; FASTISEL-NEXT: orl %r10d, %r9d +; FASTISEL-NEXT: andl $1, %ecx +; FASTISEL-NEXT: shll $10, %ecx +; FASTISEL-NEXT: orl %r9d, %ecx +; 
FASTISEL-NEXT: orl %edx, %ecx ; FASTISEL-NEXT: andl $1, %r8d -; FASTISEL-NEXT: shll $15, %r8d -; FASTISEL-NEXT: orl %edx, %r8d -; FASTISEL-NEXT: orl %r11d, %r8d -; FASTISEL-NEXT: movw %r8w, (%rax) +; FASTISEL-NEXT: shll $11, %r8d +; FASTISEL-NEXT: andl $1, %edi +; FASTISEL-NEXT: shll $12, %edi +; FASTISEL-NEXT: orl %r8d, %edi +; FASTISEL-NEXT: andl $1, %esi +; FASTISEL-NEXT: shll $13, %esi +; FASTISEL-NEXT: orl %edi, %esi +; FASTISEL-NEXT: andl $1, %r11d +; FASTISEL-NEXT: shll $14, %r11d +; FASTISEL-NEXT: orl %esi, %r11d +; FASTISEL-NEXT: andl $1, %r14d +; FASTISEL-NEXT: shll $15, %r14d +; FASTISEL-NEXT: orl %r11d, %r14d +; FASTISEL-NEXT: orl %ecx, %r14d +; FASTISEL-NEXT: movw %r14w, (%rax) ; FASTISEL-NEXT: popq %rbx ; FASTISEL-NEXT: popq %r12 ; FASTISEL-NEXT: popq %r13 @@ -2380,19 +2380,21 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: movb $-9, %dil -; SKX-NEXT: kmovd %edi, %k6 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SKX-NEXT: kandb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k2 ; SKX-NEXT: korb %k2, %k0, %k0 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: movb $-17, %dil -; SKX-NEXT: kmovd %edi, %k3 -; SKX-NEXT: kandb %k3, %k0, %k0 -; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $3, %k2, %k2 -; SKX-NEXT: korb %k2, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kmovq %k1, %k6 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k3, %k0, %k0 ; SKX-NEXT: movb $-33, %dil ; SKX-NEXT: kmovd %edi, %k1 ; SKX-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill @@ -2404,8 +2406,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: movb $-65, %dil ; SKX-NEXT: kmovd %edi, %k1 ; SKX-NEXT: kandb %k1, %k0, %k0 -; SKX-NEXT: kmovq %k1, %k2 -; SKX-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SKX-NEXT: kmovq %k1, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 @@ -2417,19 +2418,20 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k7, %k7 ; SKX-NEXT: kshiftrb $7, %k7, %k7 ; SKX-NEXT: korb %k0, %k7, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; SKX-NEXT: kandb %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; SKX-NEXT: kshiftlb $7, %k7, %k7 ; SKX-NEXT: kshiftrb $5, %k7, %k7 ; SKX-NEXT: korb %k7, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kandb %k6, %k0, %k1 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; SKX-NEXT: kandb %k4, %k0, %k1 ; SKX-NEXT: kshiftlb $7, %k7, %k7 ; SKX-NEXT: kshiftrb $4, %k7, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: korb %k7, %k1, %k1 -; SKX-NEXT: kandb %k3, %k1, %k1 +; SKX-NEXT: kandb %k6, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $3, %k0, %k0 ; SKX-NEXT: korb %k0, %k1, %k0 @@ -2439,7 +2441,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k2, 
%k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 @@ -2453,32 +2455,30 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kmovq %k4, %k2 -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kandb %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovq %k6, %k7 -; SKX-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kmovq %k4, %k7 +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; SKX-NEXT: kandb %k1, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k5, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kmovq %k3, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 @@ -2497,62 +2497,68 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftrb $5, %k5, %k5 ; SKX-NEXT: korb %k5, %k1, %k1 ; SKX-NEXT: kandb %k7, %k1, %k1 +; SKX-NEXT: kmovq %k7, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $4, %k5, %k5 ; SKX-NEXT: korb %k5, %k1, %k1 -; SKX-NEXT: kandb %k3, %k1, %k1 +; SKX-NEXT: kandb %k6, %k1, %k1 +; SKX-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 ; SKX-NEXT: korb %k5, %k1, %k1 -; SKX-NEXT: kandb %k4, %k1, %k1 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; SKX-NEXT: kandb %k0, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 ; SKX-NEXT: korb %k5, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kandb %k6, %k1, %k1 +; SKX-NEXT: kandb %k4, %k1, %k1 +; SKX-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; SKX-NEXT: korb %k5, %k1, %k0 +; SKX-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SKX-NEXT: kshiftlb $7, %k7, %k1 +; SKX-NEXT: kshiftrb $6, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; SKX-NEXT: kshiftlb $7, %k7, %k7 +; SKX-NEXT: kshiftrb $7, %k7, %k7 +; SKX-NEXT: korb %k1, %k7, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; SKX-NEXT: kandb %k2, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k7, %k7 +; SKX-NEXT: kshiftrb $5, %k7, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; SKX-NEXT: korb %k5, %k1, %k7 +; SKX-NEXT: korb %k7, %k1, %k1 +; SKX-NEXT: kandb %k3, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k0, %k0 -; SKX-NEXT: 
kshiftrb $6, %k0, %k0 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: kshiftrb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k0, %k0 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kandb %k2, %k0, %k0 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload -; SKX-NEXT: kandb %k1, %k0, %k0 -; SKX-NEXT: kshiftlb $7, %k5, %k1 -; SKX-NEXT: kshiftrb $4, %k1, %k1 -; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k7, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; SKX-NEXT: kandb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; SKX-NEXT: kshiftlb $7, %k1, %k0 ; SKX-NEXT: kshiftrb $6, %k0, %k0 @@ -2566,22 +2572,25 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kmovq %k3, %k6 ; SKX-NEXT: kshiftlb $7, %k7, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kmovq %k5, %k7 +; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 @@ -2594,28 +2603,30 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 +; SKX-NEXT: kmovq %k2, %k3 ; SKX-NEXT: kandb %k2, %k0, %k0 -; SKX-NEXT: kmovq %k2, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; 
SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kandb %k3, %k0, %k2 +; SKX-NEXT: kandb %k4, %k0, %k2 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: korb %k1, %k2, %k1 -; SKX-NEXT: kandb %k4, %k1, %k1 +; SKX-NEXT: kmovq %k7, %k2 +; SKX-NEXT: kandb %k7, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $2, %k0, %k0 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kmovq %k5, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 @@ -2630,28 +2641,30 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kandb %k7, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kmovq %k3, %k5 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kmovq %k2, %k3 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k7, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k2, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 @@ -2662,27 +2675,27 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $7, %k2, %k2 ; SKX-NEXT: korb %k1, %k2, %k1 -; SKX-NEXT: kandb %k7, %k1, %k1 +; SKX-NEXT: kandb %k5, %k1, %k1 ; SKX-NEXT: kmovd %ecx, %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $5, %k2, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k5, %k1, %k1 +; SKX-NEXT: kandb %k6, %k1, %k1 ; SKX-NEXT: kmovd %r8d, %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k3, %k1, %k1 +; SKX-NEXT: kandb %k4, %k1, %k1 ; SKX-NEXT: kmovd %r9d, %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k4, %k1, %k1 -; SKX-NEXT: kshiftlb $7, %k3, %k2 +; SKX-NEXT: kandb %k3, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k6, %k2 ; SKX-NEXT: kshiftrb $2, %k2, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k6, %k1, %k1 +; SKX-NEXT: kandb %k7, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $1, %k2, %k2 @@ -3156,8 +3169,9 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 ; FASTISEL-NEXT: korb %k0, %k1, %k0 ; FASTISEL-NEXT: movb $-5, %dil -; FASTISEL-NEXT: kmovd %edi, %k3 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; 
FASTISEL-NEXT: kmovd %edi, %k1 +; FASTISEL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kandb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 @@ -3170,177 +3184,177 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $4, %k2, %k2 ; FASTISEL-NEXT: korb %k2, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; FASTISEL-NEXT: movb $-17, %dil -; FASTISEL-NEXT: kmovd %edi, %k1 -; FASTISEL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kandb %k1, %k0, %k0 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $3, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k2 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k3, %k3 +; FASTISEL-NEXT: kshiftrb $3, %k3, %k3 +; FASTISEL-NEXT: korb %k3, %k0, %k0 ; FASTISEL-NEXT: movb $-33, %dil -; FASTISEL-NEXT: kmovd %edi, %k6 -; FASTISEL-NEXT: kandb %k6, %k0, %k0 -; FASTISEL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kmovd %edi, %k5 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; FASTISEL-NEXT: kshiftlb $7, %k4, %k4 ; FASTISEL-NEXT: kshiftrb $2, %k4, %k4 ; FASTISEL-NEXT: korb %k4, %k0, %k0 ; FASTISEL-NEXT: movb $-65, %dil -; FASTISEL-NEXT: kmovd %edi, %k2 -; FASTISEL-NEXT: kandb %k2, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 -; FASTISEL-NEXT: kshiftrb $1, %k5, %k5 -; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k1 +; FASTISEL-NEXT: kandb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovq %k1, %k4 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 +; FASTISEL-NEXT: kshiftrb $1, %k6, %k6 +; FASTISEL-NEXT: korb %k6, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 -; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 +; FASTISEL-NEXT: kshiftrb $6, %k6, %k6 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $7, %k7, %k7 -; FASTISEL-NEXT: korb %k0, %k7, %k0 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 -; FASTISEL-NEXT: kmovq %k3, %k5 +; FASTISEL-NEXT: korb %k6, %k7, %k6 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k3, %k6, %k6 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $5, %k7, %k7 -; FASTISEL-NEXT: korb %k7, %k0, %k0 +; FASTISEL-NEXT: korb %k7, %k6, %k6 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k4, %k0, %k1 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k1, %k6, %k6 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $4, %k7, %k7 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: korb %k7, %k1, %k1 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k3, %k1, %k1 +; FASTISEL-NEXT: korb %k7, %k6, %k6 +; FASTISEL-NEXT: 
kandb %k2, %k6, %k6 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $3, %k0, %k0 -; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kandb %k6, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k2, %k0, %k0 -; FASTISEL-NEXT: kmovq %k2, %k6 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k1, %k0, %k0 +; FASTISEL-NEXT: korb %k0, %k6, %k0 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 +; FASTISEL-NEXT: kshiftrb $2, %k6, %k6 +; FASTISEL-NEXT: korb %k6, %k0, %k0 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 +; FASTISEL-NEXT: kshiftrb $1, %k6, %k6 +; FASTISEL-NEXT: korb %k6, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 -; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kmovq %k5, %k2 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovq %k4, %k7 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $7, %k5, %k5 +; FASTISEL-NEXT: korb %k0, %k5, %k0 +; FASTISEL-NEXT: kmovq %k3, %k7 ; FASTISEL-NEXT: kandb %k3, %k0, %k0 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $5, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kandb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $4, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $3, %k5, %k5 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k6, %k5 +; FASTISEL-NEXT: kshiftrb $2, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovq %k4, %k6 ; FASTISEL-NEXT: kandb %k4, %k0, %k0 -; FASTISEL-NEXT: kshiftlb $7, %k5, %k1 -; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; 
FASTISEL-NEXT: kandb %k6, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $1, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 +; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 +; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 ; FASTISEL-NEXT: kshiftrb $7, %k5, %k5 -; FASTISEL-NEXT: korb %k1, %k5, %k1 -; FASTISEL-NEXT: kandb %k2, %k1, %k1 -; FASTISEL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: korb %k0, %k5, %k0 +; FASTISEL-NEXT: kandb %k7, %k0, %k0 +; FASTISEL-NEXT: kmovq %k7, %k4 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 ; FASTISEL-NEXT: kshiftrb $5, %k5, %k5 -; FASTISEL-NEXT: korb %k5, %k1, %k1 -; FASTISEL-NEXT: kandb %k7, %k1, %k1 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kandb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 ; FASTISEL-NEXT: kshiftrb $4, %k5, %k5 -; FASTISEL-NEXT: korb %k5, %k1, %k1 -; FASTISEL-NEXT: kandb %k3, %k1, %k1 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 ; FASTISEL-NEXT: kshiftrb $3, %k5, %k5 -; FASTISEL-NEXT: korb %k5, %k1, %k1 -; FASTISEL-NEXT: kandb %k4, %k1, %k1 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 ; FASTISEL-NEXT: kshiftrb $2, %k5, %k5 -; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: korb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; FASTISEL-NEXT: kandb %k6, %k1, %k1 -; FASTISEL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 ; FASTISEL-NEXT: kshiftrb $1, %k5, %k5 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: korb %k5, %k1, %k5 -; FASTISEL-NEXT: kshiftlb $7, %k7, %k1 -; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kshiftlb $7, %k7, %k5 +; FASTISEL-NEXT: kshiftrb $6, %k5, %k5 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $7, %k7, %k7 -; FASTISEL-NEXT: korb %k1, %k7, %k1 +; FASTISEL-NEXT: korb %k5, %k7, %k5 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: kandb %k2, %k1, %k1 +; FASTISEL-NEXT: kandb %k4, %k5, %k5 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $5, %k7, %k7 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: korb %k7, %k1, %k1 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k3, %k1, %k1 +; FASTISEL-NEXT: korb %k7, %k5, %k5 +; FASTISEL-NEXT: kmovq %k1, %k7 +; FASTISEL-NEXT: kandb %k1, %k5, %k5 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $4, %k0, %k0 -; FASTISEL-NEXT: 
korb %k0, %k1, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; FASTISEL-NEXT: korb %k0, %k5, %k0 ; FASTISEL-NEXT: kandb %k2, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $3, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $2, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovq %k6, %k4 ; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $1, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k2, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kshiftlb $7, %k1, %k0 ; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 @@ -3353,64 +3367,64 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 -; FASTISEL-NEXT: kshiftlb $7, %k7, %k1 +; FASTISEL-NEXT: kandb %k7, %k0, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k1 ; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload ; FASTISEL-NEXT: kandb %k2, %k0, %k0 -; FASTISEL-NEXT: kmovq %k2, %k7 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 -; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; 
FASTISEL-NEXT: kshiftrb $7, %k1, %k1 -; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kandb %k6, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kandb %k2, %k0, %k2 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: korb %k1, %k2, %k1 +; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $7, %k5, %k5 +; FASTISEL-NEXT: korb %k1, %k5, %k1 +; FASTISEL-NEXT: kandb %k6, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $5, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k7, %k1, %k1 +; FASTISEL-NEXT: kmovq %k7, %k3 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $4, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kandb %k2, %k1, %k1 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $3, %k5, %k5 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k2, %k1, %k1 +; FASTISEL-NEXT: kshiftlb $7, %k7, %k5 +; FASTISEL-NEXT: kshiftrb $2, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 ; FASTISEL-NEXT: kandb %k4, %k1, %k1 -; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 -; FASTISEL-NEXT: kshiftrb $2, %k0, %k0 -; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovq %k4, %k7 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $1, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k0, %k1, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 @@ -3429,49 +3443,51 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k7, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: 
kandb %k5, %k0, %k0 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k1 +; FASTISEL-NEXT: kandb %k7, %k0, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovd %edx, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 -; FASTISEL-NEXT: kmovd %esi, %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $7, %k2, %k2 -; FASTISEL-NEXT: korb %k1, %k2, %k1 +; FASTISEL-NEXT: kmovd %esi, %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $7, %k5, %k5 +; FASTISEL-NEXT: korb %k1, %k5, %k1 ; FASTISEL-NEXT: kandb %k6, %k1, %k1 -; FASTISEL-NEXT: kmovd %ecx, %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $5, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 +; FASTISEL-NEXT: kmovd %ecx, %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $5, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 ; FASTISEL-NEXT: kandb %k3, %k1, %k1 -; FASTISEL-NEXT: kmovd %r8d, %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $4, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k7, %k1, %k1 +; FASTISEL-NEXT: kmovd %r8d, %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $4, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k4, %k1, %k1 ; FASTISEL-NEXT: kmovd %r9d, %k2 ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $3, %k2, %k2 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k4, %k1, %k1 -; FASTISEL-NEXT: kshiftlb $7, %k3, %k2 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k2, %k1, %k1 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k2 ; FASTISEL-NEXT: kshiftrb $2, %k2, %k2 ; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k7, %k1, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $1, %k2, %k2 diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index 87332531f750b7..52e796004b1d09 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1924,9 +1924,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: movw $-33, %ax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kandw %k1, %k0, %k0 -; KNL-NEXT: kmovw %k1, %k2 ; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -1934,8 +1933,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: movw $-65, %ax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kmovw %k1, %k2 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -1943,9 +1943,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: movw $-129, %ax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kandw %k1, %k0, %k0 -; KNL-NEXT: kmovw %k1, %k3 ; KNL-NEXT: kmovw %k1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -1953,18 +1952,17 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: movw $-257, %ax # imm = 0xFEFF ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kmovw %k1, %k4 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $7, %k1, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: movw $-513, %ax # imm = 0xFDFF -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kandw %k1, %k0, %k0 -; KNL-NEXT: kmovw %k1, %k4 -; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -1980,8 +1978,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: kshiftrw $5, %k1, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: movw $-2049, %ax # imm = 0xF7FF -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kandw %k5, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -2001,218 +2000,218 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $2, %k1, %k1 -; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $2, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k3 ; KNL-NEXT: movw $-16385, %ax # imm = 0xBFFF ; KNL-NEXT: kmovw %eax, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; KNL-NEXT: kandw %k0, %k1, %k1 +; KNL-NEXT: kandw %k0, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $14, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kshiftlw $1, %k3, %k3 +; KNL-NEXT: kshiftrw $1, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k1 ; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: andl $1, %edi -; KNL-NEXT: kmovw %esi, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 +; KNL-NEXT: kmovw %esi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 ; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: korw %k1, %k6, %k1 -; KNL-NEXT: kandw %k7, %k1, %k1 +; KNL-NEXT: korw %k3, %k6, %k3 +; KNL-NEXT: kandw %k7, %k3, %k3 ; KNL-NEXT: kmovw %edx, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $13, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; KNL-NEXT: kandw %k7, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 
2-byte Reload +; KNL-NEXT: kandw %k0, %k3, %k3 ; KNL-NEXT: kmovw %ecx, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; KNL-NEXT: kandw %k0, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: kandw %k1, %k3, %k3 ; KNL-NEXT: kmovw %r8d, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kandw %k2, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: kandw %k7, %k3, %k3 ; KNL-NEXT: kmovw %r9d, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; KNL-NEXT: kandw %k2, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kandw %k2, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kandw %k3, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: kandw %k2, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $8, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; KNL-NEXT: kandw %k3, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kandw %k4, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $7, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kandw %k4, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kandw %k5, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $6, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; KNL-NEXT: kandw %k4, %k1, %k1 +; KNL-NEXT: kandw %k4, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $5, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; KNL-NEXT: kandw %k5, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; KNL-NEXT: kandw %k6, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $4, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; KNL-NEXT: kandw %k6, %k1, %k1 +; KNL-NEXT: kandw %k6, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $3, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; KNL-NEXT: kandw %k6, %k1, %k1 +; KNL-NEXT: kandw %k6, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $2, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 ; KNL-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; KNL-NEXT: kandw %k6, %k1, %k1 +; KNL-NEXT: kandw %k6, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $14, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kshiftlw $1, %k3, %k3 +; KNL-NEXT: kshiftrw $1, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; KNL-NEXT: kmovw %ecx, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 ; KNL-NEXT: kmovw %eax, %k6 -; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k3, %k6, %k3 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; KNL-NEXT: kandw %k6, %k1, %k1 +; KNL-NEXT: kandw %k6, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $13, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kandw %k7, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kandw %k0, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kandw %k0, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kandw %k1, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; KNL-NEXT: kandw %k0, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kandw %k7, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kandw %k2, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; KNL-NEXT: kandw %k0, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; KNL-NEXT: kandw %k0, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kandw %k2, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $8, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kandw %k3, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: kandw %k2, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $7, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; KNL-NEXT: kandw %k3, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kandw %k5, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw 
$15, %k6, %k6 ; KNL-NEXT: kshiftrw $6, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kandw %k4, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kandw %k4, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $5, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kandw %k5, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; KNL-NEXT: kandw %k0, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $4, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; KNL-NEXT: kandw %k0, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: kandw %k1, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $3, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; KNL-NEXT: kandw %k2, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; KNL-NEXT: kandw %k6, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $2, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; KNL-NEXT: kandw %k5, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; KNL-NEXT: kandw %k6, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $14, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 +; KNL-NEXT: kshiftlw $1, %k3, %k3 +; KNL-NEXT: kshiftrw $1, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: korw %k6, %k1, %k1 +; KNL-NEXT: korw %k6, %k3, %k3 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx @@ -2221,56 +2220,55 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: kshiftrw $14, %k6, %k6 ; KNL-NEXT: kmovw %eax, %k7 ; KNL-NEXT: korw %k6, %k7, %k6 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; KNL-NEXT: kandw %k5, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: kandw %k7, %k6, %k6 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $13, %k7, %k7 ; KNL-NEXT: korw %k7, %k6, %k6 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; KNL-NEXT: kandw %k5, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: kandw %k7, %k6, %k6 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $12, %k7, %k7 ; KNL-NEXT: korw %k7, %k6, %k6 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; KNL-NEXT: kandw %k5, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: kandw %k7, %k6, %k6 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $11, 
%k7, %k7 ; KNL-NEXT: korw %k7, %k6, %k6 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; KNL-NEXT: kandw %k5, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: kandw %k7, %k6, %k6 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $10, %k7, %k7 ; KNL-NEXT: korw %k7, %k6, %k6 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; KNL-NEXT: kandw %k5, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: kandw %k7, %k6, %k6 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $9, %k7, %k7 ; KNL-NEXT: korw %k7, %k6, %k6 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; KNL-NEXT: kandw %k5, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: kandw %k7, %k6, %k6 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $8, %k7, %k7 ; KNL-NEXT: korw %k7, %k6, %k6 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; KNL-NEXT: kandw %k5, %k6, %k6 +; KNL-NEXT: kandw %k2, %k6, %k6 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $7, %k7, %k7 ; KNL-NEXT: korw %k7, %k6, %k6 -; KNL-NEXT: kandw %k3, %k6, %k6 +; KNL-NEXT: kandw %k5, %k6, %k6 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 @@ -2282,39 +2280,39 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $5, %k6, %k6 ; KNL-NEXT: korw %k6, %k5, %k5 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; KNL-NEXT: kandw %k3, %k5, %k4 +; KNL-NEXT: kandw %k0, %k5, %k4 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 ; KNL-NEXT: kshiftrw $4, %k5, %k5 ; KNL-NEXT: korw %k5, %k4, %k4 -; KNL-NEXT: kandw %k0, %k4, %k3 +; KNL-NEXT: kandw %k1, %k4, %k2 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: korw %k4, %k3, %k3 -; KNL-NEXT: kandw %k2, %k3, %k2 -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k3 -; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: korw %k4, %k2, %k2 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; KNL-NEXT: kandw %k0, %k2, %k0 +; KNL-NEXT: kandw %k0, %k2, %k1 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; KNL-NEXT: kandw %k0, %k1, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k0, %k2 -; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} -; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; KNL-NEXT: kmovw %eax, 
%k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -2378,9 +2376,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movw $-33, %ax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw %k1, %k2 ; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 @@ -2388,8 +2385,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movw $-65, %ax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k1, %k2 +; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 @@ -2397,9 +2395,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movw $-129, %ax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw %k1, %k3 ; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 @@ -2407,18 +2404,17 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movw $-257, %ax # imm = 0xFEFF ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw %k1, %k4 -; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kandw %k5, %k0, %k0 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 @@ -2434,8 +2430,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kandw %k5, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kmovw %k1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 @@ -2455,218 +2452,218 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k3 +; AVX512DQNOBW-NEXT: korw %k3, %k0, %k3 ; AVX512DQNOBW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: kandw %k0, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k0 ; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: andl $1, %edi -; AVX512DQNOBW-NEXT: kmovw %esi, %k0 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %esi, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw %edi, %k6 -; AVX512DQNOBW-NEXT: korw %k0, %k6, %k0 -; AVX512DQNOBW-NEXT: kandw %k7, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k3, %k6, %k3 +; AVX512DQNOBW-NEXT: kandw %k7, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw %edx, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $13, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k7, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k0, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw %ecx, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $12, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kandw %k1, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw %r8d, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kandw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k7, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw %r9d, %k6 
; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $10, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kandw %k2, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $9, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kandw %k3, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k2, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k3, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kandw %k4, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $7, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kandw %k4, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kandw %k5, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $6, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k4, %k0, %k0 +; AVX512DQNOBW-NEXT: kandw %k4, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQNOBW-NEXT: kandw %k5, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $4, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: kandw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $3, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: kandw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte 
Reload -; AVX512DQNOBW-NEXT: kandw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: kandw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: andl $1, %eax ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX512DQNOBW-NEXT: kmovw %ecx, %k0 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %ecx, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 -; AVX512DQNOBW-NEXT: korw %k0, %k6, %k0 +; AVX512DQNOBW-NEXT: korw %k3, %k6, %k3 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: kandw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $13, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kandw %k7, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kandw %k0, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $12, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kandw %k1, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kandw %k7, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $10, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kandw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k0, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $9, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kandw %k2, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: 
kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kandw %k3, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k2, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $7, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k3, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kandw %k5, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $6, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kandw %k4, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kandw %k4, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kandw %k5, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k0, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $4, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kandw %k1, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $3, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k5, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512DQNOBW-NEXT: korw %k6, %k0, %k0 +; AVX512DQNOBW-NEXT: korw %k6, %k3, %k3 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: andl $1, 
%eax ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx @@ -2675,56 +2672,55 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: kshiftrw $14, %k6, %k6 ; AVX512DQNOBW-NEXT: kmovw %eax, %k7 ; AVX512DQNOBW-NEXT: korw %k6, %k7, %k6 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k5, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k7, %k6, %k6 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $13, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k6, %k6 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k5, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k7, %k6, %k6 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $12, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k6, %k6 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k5, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k7, %k6, %k6 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $11, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k6, %k6 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k5, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k7, %k6, %k6 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $10, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k6, %k6 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k5, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k7, %k6, %k6 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $9, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k6, %k6 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k5, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k7, %k6, %k6 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $8, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k6, %k6 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k5, %k6, %k6 +; AVX512DQNOBW-NEXT: kandw %k2, %k6, %k6 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $7, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k6, %k6 -; AVX512DQNOBW-NEXT: kandw %k3, %k6, %k6 +; AVX512DQNOBW-NEXT: kandw %k5, %k6, %k6 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 
@@ -2736,39 +2732,39 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512DQNOBW-NEXT: kshiftrw $5, %k6, %k6 ; AVX512DQNOBW-NEXT: korw %k6, %k5, %k5 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k3, %k5, %k4 +; AVX512DQNOBW-NEXT: kandw %k0, %k5, %k4 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k5 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512DQNOBW-NEXT: kshiftrw $4, %k5, %k5 ; AVX512DQNOBW-NEXT: korw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kandw %k1, %k4, %k3 +; AVX512DQNOBW-NEXT: kandw %k1, %k4, %k2 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k4 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQNOBW-NEXT: korw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kandw %k2, %k3, %k2 -; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k3 -; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQNOBW-NEXT: kandw %k1, %k2, %k1 -; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k4, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k0, %k2, %k1 ; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 ; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm2 -; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: kandw %k0, %k1, %k0 +; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQNOBW-NEXT: vpmovm2d %k3, %zmm3 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm4 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll index bafa33ff9a1c8a..6d515abfc39318 100644 --- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll @@ -157,27 +157,27 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> ; ; X86NOBW-LABEL: test_vgf2p8affineinvqb_512: ; X86NOBW: # %bb.0: -; X86NOBW-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x04] -; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x06] +; X86NOBW-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a] ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08] -; X86NOBW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x03] -; X86NOBW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xe1,0x04] -; X86NOBW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xd9,0x05] +; X86NOBW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03] +; X86NOBW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x04] +; X86NOBW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] -; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} # encoding: [0x62,0xf3,0x55,0xc9,0x25,0xed,0xff] ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] -; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff] +; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} # encoding: [0x62,0xf3,0x4d,0xca,0x25,0xf6,0xff] ; X86NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X86NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X86NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc] -; X86NOBW-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8] +; X86NOBW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd0,0xb8] +; X86NOBW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8affineinvqb_512: @@ -374,27 +374,27 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s ; ; X86NOBW-LABEL: test_vgf2p8affineqb_512: ; X86NOBW: # %bb.0: -; X86NOBW-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x04] -; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x06] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a] ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k4 # encoding: 
[0xc5,0xf8,0x90,0x64,0x24,0x08] -; X86NOBW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x03] -; X86NOBW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xe1,0x04] -; X86NOBW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xd9,0x05] +; X86NOBW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03] +; X86NOBW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x04] +; X86NOBW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] -; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} # encoding: [0x62,0xf3,0x55,0xc9,0x25,0xed,0xff] ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] -; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff] +; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} # encoding: [0x62,0xf3,0x4d,0xca,0x25,0xf6,0xff] ; X86NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X86NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X86NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc] -; X86NOBW-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8] +; X86NOBW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd0,0xb8] +; X86NOBW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8affineqb_512: diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll index 7a534721bae056..f1b7a66ce8723f 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -941,43 +941,42 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; X32-NEXT: pushl %ebx ; X32-NEXT: subl $12, %esp ; X32-NEXT: movl %esi, (%esp) # 4-byte Spill +; X32-NEXT: movl %ecx, %esi ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: leal (%edx,%edi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %eax -; X32-NEXT: subl %edi, %eax -; X32-NEXT: movl %ebp, %edx -; X32-NEXT: subl %ecx, %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: imull %edx, %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl %esi, %edx +; X32-NEXT: leal (%edx,%edi), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: 
movl %edx, %ecx +; X32-NEXT: subl %edi, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: subl %esi, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: subl {{[0-9]+}}(%esp), %edx -; X32-NEXT: imull %eax, %edx -; X32-NEXT: addl %ebx, %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: subl %ebx, %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: subl {{[0-9]+}}(%esp), %eax -; X32-NEXT: imull %edi, %eax -; X32-NEXT: addl %edx, %eax -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: imull %edi, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, %edi +; X32-NEXT: subl {{[0-9]+}}(%esp), %edi +; X32-NEXT: imull %ecx, %edi +; X32-NEXT: addl %edx, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: subl %ebp, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: addl {{[0-9]+}}(%esp), %edx -; X32-NEXT: imull %edx, %ebp +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: addl {{[0-9]+}}(%esp), %edi +; X32-NEXT: imull %edi, %eax ; X32-NEXT: addl {{[0-9]+}}(%esp), %esi ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: addl %eax, %ebp -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl {{[0-9]+}}(%esp), %edx +; X32-NEXT: imull %ebp, %edx +; X32-NEXT: addl %edx, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: addl $12, %esp ; X32-NEXT: popl %ebx ; X32-NEXT: popl %ebp @@ -985,7 +984,6 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: -; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi @@ -998,36 +996,35 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi ; WIN64-NEXT: leal (%rdx,%rdi), %ebx -; WIN64-NEXT: movl %edx, %ebp -; WIN64-NEXT: subl %edi, %ebp -; WIN64-NEXT: leal (%rsi,%r8), %edx +; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx +; WIN64-NEXT: subl %edi, %edx +; WIN64-NEXT: leal (%rsi,%r8), %edi ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi -; WIN64-NEXT: leal (%r9,%r10), %edi -; WIN64-NEXT: movl %r9d, %r8d -; WIN64-NEXT: subl %r10d, %r8d -; WIN64-NEXT: movl %eax, %r9d -; WIN64-NEXT: subl %ecx, %r9d -; WIN64-NEXT: imull %r9d, %r8d -; WIN64-NEXT: leal (%r11,%r12), %r9d -; WIN64-NEXT: movl %r11d, %r10d -; WIN64-NEXT: subl %r12d, %r10d -; WIN64-NEXT: imull %ebp, %r10d -; WIN64-NEXT: addl %r8d, %r10d -; WIN64-NEXT: leal (%r14,%r15), %r8d -; WIN64-NEXT: movl %r14d, %r11d -; WIN64-NEXT: subl %r15d, %r11d -; WIN64-NEXT: imull %esi, %r11d -; WIN64-NEXT: addl %r10d, %r11d +; WIN64-NEXT: leal (%r9,%r10), %r8d +; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 +; WIN64-NEXT: subl %r10d, %r9d +; WIN64-NEXT: movl %eax, %r10d +; WIN64-NEXT: subl %ecx, %r10d 
+; WIN64-NEXT: imull %r10d, %r9d +; WIN64-NEXT: leal (%r11,%r12), %r10d +; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 +; WIN64-NEXT: subl %r12d, %r11d +; WIN64-NEXT: imull %edx, %r11d +; WIN64-NEXT: addl %r9d, %r11d +; WIN64-NEXT: leal (%r14,%r15), %edx +; WIN64-NEXT: movl %r14d, %r9d +; WIN64-NEXT: subl %r15d, %r9d +; WIN64-NEXT: imull %esi, %r9d +; WIN64-NEXT: addl %r11d, %r9d ; WIN64-NEXT: addl %ecx, %eax -; WIN64-NEXT: imull %edi, %eax -; WIN64-NEXT: imull %ebx, %r9d +; WIN64-NEXT: imull %r8d, %eax +; WIN64-NEXT: imull %ebx, %r10d +; WIN64-NEXT: addl %r10d, %eax +; WIN64-NEXT: imull %edi, %edx +; WIN64-NEXT: addl %edx, %eax ; WIN64-NEXT: addl %r9d, %eax -; WIN64-NEXT: imull %edx, %r8d -; WIN64-NEXT: addl %r8d, %eax -; WIN64-NEXT: addl %r11d, %eax ; WIN64-NEXT: popq %rbx -; WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; ; LINUXOSX64-LABEL: testi32_inp: @@ -1041,35 +1038,35 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX64-NEXT: movl %edx, %r11d -; LINUXOSX64-NEXT: subl %edi, %r11d -; LINUXOSX64-NEXT: leal (%rsi,%r8), %edx +; LINUXOSX64-NEXT: # kill: def $edx killed $edx killed $rdx +; LINUXOSX64-NEXT: subl %edi, %edx +; LINUXOSX64-NEXT: leal (%rsi,%r8), %edi ; LINUXOSX64-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX64-NEXT: subl %r8d, %esi -; LINUXOSX64-NEXT: leal (%r9,%r12), %edi -; LINUXOSX64-NEXT: movl %r9d, %r8d -; LINUXOSX64-NEXT: subl %r12d, %r8d -; LINUXOSX64-NEXT: movl %eax, %r9d -; LINUXOSX64-NEXT: subl %ecx, %r9d -; LINUXOSX64-NEXT: imull %r9d, %r8d -; LINUXOSX64-NEXT: leal (%r13,%r14), %r9d +; LINUXOSX64-NEXT: leal (%r9,%r12), %r8d +; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d killed $r9 +; LINUXOSX64-NEXT: subl %r12d, %r9d +; LINUXOSX64-NEXT: movl %eax, %r11d +; LINUXOSX64-NEXT: subl %ecx, %r11d +; LINUXOSX64-NEXT: imull %r11d, %r9d +; LINUXOSX64-NEXT: leal (%r13,%r14), %r11d ; LINUXOSX64-NEXT: movl %r13d, %r12d ; LINUXOSX64-NEXT: subl %r14d, %r12d -; LINUXOSX64-NEXT: imull %r11d, %r12d -; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; LINUXOSX64-NEXT: addl %r8d, %r12d -; LINUXOSX64-NEXT: movl %r15d, %r8d -; LINUXOSX64-NEXT: subl %r11d, %r8d -; LINUXOSX64-NEXT: imull %esi, %r8d -; LINUXOSX64-NEXT: addl %r12d, %r8d +; LINUXOSX64-NEXT: imull %edx, %r12d +; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %edx +; LINUXOSX64-NEXT: addl %r9d, %r12d +; LINUXOSX64-NEXT: movl %r15d, %r9d +; LINUXOSX64-NEXT: subl %edx, %r9d +; LINUXOSX64-NEXT: imull %esi, %r9d +; LINUXOSX64-NEXT: addl %r12d, %r9d ; LINUXOSX64-NEXT: addl %ecx, %eax -; LINUXOSX64-NEXT: imull %edi, %eax -; LINUXOSX64-NEXT: imull %r10d, %r9d -; LINUXOSX64-NEXT: addl %r9d, %eax -; LINUXOSX64-NEXT: addl %r15d, %r11d -; LINUXOSX64-NEXT: imull %edx, %r11d +; LINUXOSX64-NEXT: imull %r8d, %eax +; LINUXOSX64-NEXT: imull %r10d, %r11d ; LINUXOSX64-NEXT: addl %r11d, %eax -; LINUXOSX64-NEXT: addl %r8d, %eax +; LINUXOSX64-NEXT: addl %r15d, %edx +; LINUXOSX64-NEXT: imull %edi, %edx +; LINUXOSX64-NEXT: addl %edx, %eax +; LINUXOSX64-NEXT: addl %r9d, %eax ; LINUXOSX64-NEXT: retq %x1 = sub i32 %a1, %a2 %x2 = sub i32 %a3, %a4 diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index 536c667c7ec902..08e3b641a46945 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -556,22 +556,22 @@ define void @vselect_v1i1(ptr %w, ptr %x, ptr %y) 
nounwind { ; X86-AVX512F-LABEL: vselect_v1i1: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: pushl %esi +; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: movzbl (%eax), %esi +; X86-AVX512F-NEXT: movzbl (%edx), %esi ; X86-AVX512F-NEXT: kmovw %esi, %k0 -; X86-AVX512F-NEXT: movzbl (%edx), %edx -; X86-AVX512F-NEXT: kmovw %edx, %k1 ; X86-AVX512F-NEXT: movzbl (%ecx), %ecx -; X86-AVX512F-NEXT: kmovw %ecx, %k2 +; X86-AVX512F-NEXT: kmovw %ecx, %k1 +; X86-AVX512F-NEXT: movzbl (%eax), %eax +; X86-AVX512F-NEXT: kmovw %eax, %k2 ; X86-AVX512F-NEXT: kandnw %k1, %k2, %k1 ; X86-AVX512F-NEXT: kandw %k2, %k0, %k0 ; X86-AVX512F-NEXT: korw %k1, %k0, %k0 ; X86-AVX512F-NEXT: kshiftlw $15, %k0, %k0 ; X86-AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; X86-AVX512F-NEXT: kmovw %k0, %ecx -; X86-AVX512F-NEXT: movb %cl, (%eax) +; X86-AVX512F-NEXT: kmovw %k0, %eax +; X86-AVX512F-NEXT: movb %al, (%edx) ; X86-AVX512F-NEXT: popl %esi ; X86-AVX512F-NEXT: retl ; @@ -595,22 +595,22 @@ define void @vselect_v1i1(ptr %w, ptr %x, ptr %y) nounwind { ; X86-AVX512BW-LABEL: vselect_v1i1: ; X86-AVX512BW: # %bb.0: ; X86-AVX512BW-NEXT: pushl %esi +; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512BW-NEXT: movzbl (%eax), %esi +; X86-AVX512BW-NEXT: movzbl (%edx), %esi ; X86-AVX512BW-NEXT: kmovd %esi, %k0 -; X86-AVX512BW-NEXT: movzbl (%edx), %edx -; X86-AVX512BW-NEXT: kmovd %edx, %k1 ; X86-AVX512BW-NEXT: movzbl (%ecx), %ecx -; X86-AVX512BW-NEXT: kmovd %ecx, %k2 +; X86-AVX512BW-NEXT: kmovd %ecx, %k1 +; X86-AVX512BW-NEXT: movzbl (%eax), %eax +; X86-AVX512BW-NEXT: kmovd %eax, %k2 ; X86-AVX512BW-NEXT: kandnw %k1, %k2, %k1 ; X86-AVX512BW-NEXT: kandw %k2, %k0, %k0 ; X86-AVX512BW-NEXT: korw %k1, %k0, %k0 ; X86-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; X86-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 -; X86-AVX512BW-NEXT: kmovd %k0, %ecx -; X86-AVX512BW-NEXT: movb %cl, (%eax) +; X86-AVX512BW-NEXT: kmovd %k0, %eax +; X86-AVX512BW-NEXT: movb %al, (%edx) ; X86-AVX512BW-NEXT: popl %esi ; X86-AVX512BW-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 51ffeca52a6652..48192a333a986c 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1860,32 +1860,32 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; X86-NEXT: pushl %esi # encoding: [0x56] ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax 
# encoding: [0x11,0xc8] +; X86-NEXT: addl %ecx, %esi # encoding: [0x01,0xce] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: addl %esi, %ecx # encoding: [0x01,0xf1] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] +; X86-NEXT: addl %ecx, %esi # encoding: [0x01,0xce] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] ; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] @@ -1947,37 +1947,37 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: addl %ecx, %esi # encoding: [0x01,0xce] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: addl %esi, %ecx # encoding: [0x01,0xf1] +; 
X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2] ; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] -; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] +; X86-NEXT: addl %edx, %edi # encoding: [0x01,0xd7] +; X86-NEXT: adcl %ecx, %esi # encoding: [0x11,0xce] ; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] +; X86-NEXT: adcl %esi, %edx # encoding: [0x11,0xf2] ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x0c] ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x10] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2035,32 +2035,32 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; X86-NEXT: pushl %esi # encoding: [0x56] ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] +; X86-NEXT: addl %ecx, %esi # encoding: [0x01,0xce] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; 
X86-NEXT: addl %esi, %ecx # encoding: [0x01,0xf1] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] +; X86-NEXT: addl %ecx, %esi # encoding: [0x01,0xce] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] ; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] @@ -2122,37 +2122,37 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: addl %ecx, %esi # encoding: [0x01,0xce] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: addl %esi, %ecx # encoding: [0x01,0xf1] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; 
X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2] ; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] -; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] +; X86-NEXT: addl %edx, %edi # encoding: [0x01,0xd7] +; X86-NEXT: adcl %ecx, %esi # encoding: [0x11,0xce] ; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] +; X86-NEXT: adcl %esi, %edx # encoding: [0x11,0xf2] ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x0c] ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x10] ; X86-NEXT: popl %esi # encoding: [0x5e] diff --git a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll index fd7212623c79de..51904598bbc891 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll @@ -4,62 +4,62 @@ define void @test_mscatter_v16f16(ptr %base, <16 x i32> %index, <16 x half> %val) ; CHECK-LABEL: test_mscatter_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastq %rdi, %zmm3 -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpmovsxdq %ymm2, %zmm2 -; CHECK-NEXT: vpaddq %zmm2, %zmm2, %zmm2 -; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; CHECK-NEXT: vpbroadcastq %rdi, %zmm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vpmovsxdq %ymm3, %zmm3 +; CHECK-NEXT: vpaddq %zmm3, %zmm3, %zmm3 +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm3 ; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm3 +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm2 ; CHECK-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-NEXT: vmovsh %xmm3, (%rax) -; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-NEXT: vmovsh %xmm2, (%rax) +; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm4 ; CHECK-NEXT: vmovq %xmm4, %rax -; CHECK-NEXT: vmovsh %xmm3, (%rax) -; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm3 +; CHECK-NEXT: vmovsh %xmm2, (%rax) +; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm2 ; CHECK-NEXT: vpextrq $1, %xmm4, %rax -; CHECK-NEXT: vmovsh %xmm3, (%rax) -; CHECK-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; CHECK-NEXT: vmovsh %xmm2, (%rax) +; CHECK-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; CHECK-NEXT: vmovq %xmm4, %rax -; CHECK-NEXT: vmovsh %xmm3, (%rax) -; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vmovsh %xmm2, (%rax) +; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpextrq $1, %xmm4, %rax -; CHECK-NEXT: vmovsh %xmm3, (%rax) -; CHECK-NEXT: vshufps {{.*#+}} xmm3 = 
xmm1[3,3,3,3] +; CHECK-NEXT: vmovsh %xmm2, (%rax) +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; CHECK-NEXT: vmovq %xmm0, %rax -; CHECK-NEXT: vmovsh %xmm3, (%rax) -; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vmovsh %xmm2, (%rax) +; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-NEXT: vmovsh %xmm3, (%rax) +; CHECK-NEXT: vmovsh %xmm2, (%rax) ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0 -; CHECK-NEXT: vmovq %xmm2, %rax +; CHECK-NEXT: vmovq %xmm3, %rax ; CHECK-NEXT: vmovsh %xmm0, (%rax) ; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 -; CHECK-NEXT: vpextrq $1, %xmm2, %rax +; CHECK-NEXT: vpextrq $1, %xmm3, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) ; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vmovq %xmm3, %rax +; CHECK-NEXT: vextracti128 $1, %ymm3, %xmm2 +; CHECK-NEXT: vmovq %xmm2, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) ; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm1 -; CHECK-NEXT: vpextrq $1, %xmm3, %rax +; CHECK-NEXT: vpextrq $1, %xmm2, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) ; CHECK-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; CHECK-NEXT: vextracti32x4 $2, %zmm2, %xmm3 -; CHECK-NEXT: vmovq %xmm3, %rax +; CHECK-NEXT: vextracti32x4 $2, %zmm3, %xmm2 +; CHECK-NEXT: vmovq %xmm2, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) ; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrq $1, %xmm3, %rax +; CHECK-NEXT: vpextrq $1, %xmm2, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; CHECK-NEXT: vextracti32x4 $3, %zmm2, %xmm2 +; CHECK-NEXT: vextracti32x4 $3, %zmm3, %xmm2 ; CHECK-NEXT: vmovq %xmm2, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero diff --git a/llvm/test/CodeGen/X86/avx512vnni-combine.ll b/llvm/test/CodeGen/X86/avx512vnni-combine.ll index 7a0527be054194..3ee6c2972a2643 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-combine.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-combine.ll @@ -42,18 +42,18 @@ define <8 x i64> @foo_512(i32 %0, <8 x i64> %1, <8 x i64> %2, ptr %3) { ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jle .LBB1_6 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: andl $3, %eax ; CHECK-NEXT: cmpl $4, %edi ; CHECK-NEXT: jae .LBB1_7 ; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: jmp .LBB1_3 ; CHECK-NEXT: .LBB1_7: -; CHECK-NEXT: andl $-4, %edx +; CHECK-NEXT: andl $-4, %ecx ; CHECK-NEXT: leaq 192(%rsi), %rdi -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vpdpwssd -192(%rdi), %zmm1, %zmm0 @@ -63,16 +63,16 @@ define <8 x i64> @foo_512(i32 %0, <8 x i64> %1, <8 x i64> %2, ptr %3) { ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vpmaddwd (%rdi), %zmm1, %zmm2 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; CHECK-NEXT: addq $4, %rcx +; CHECK-NEXT: addq $4, %rdx ; CHECK-NEXT: addq $256, %rdi # imm = 0x100 -; CHECK-NEXT: cmpq %rcx, %rdx +; CHECK-NEXT: cmpq %rdx, %rcx ; CHECK-NEXT: jne 
.LBB1_8 ; CHECK-NEXT: .LBB1_3: ; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: je .LBB1_6 ; CHECK-NEXT: # %bb.4: # %.preheader -; CHECK-NEXT: shlq $6, %rcx -; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: shlq $6, %rdx +; CHECK-NEXT: addq %rdx, %rsi ; CHECK-NEXT: shlq $6, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: .p2align 4, 0x90 diff --git a/llvm/test/CodeGen/X86/avxvnni-combine.ll b/llvm/test/CodeGen/X86/avxvnni-combine.ll index d8e73a5cf37d8d..5e3c69103e722d 100644 --- a/llvm/test/CodeGen/X86/avxvnni-combine.ll +++ b/llvm/test/CodeGen/X86/avxvnni-combine.ll @@ -47,18 +47,18 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { ; AVX-NEXT: testl %edi, %edi ; AVX-NEXT: jle .LBB1_6 ; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %edx -; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: movl %edi, %ecx +; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: andl $3, %eax ; AVX-NEXT: cmpl $4, %edi ; AVX-NEXT: jae .LBB1_7 ; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: xorl %edx, %edx ; AVX-NEXT: jmp .LBB1_3 ; AVX-NEXT: .LBB1_7: -; AVX-NEXT: andl $-4, %edx +; AVX-NEXT: andl $-4, %ecx ; AVX-NEXT: leaq 48(%rsi), %rdi -; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: xorl %edx, %edx ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0 @@ -68,16 +68,16 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: addq $4, %rcx +; AVX-NEXT: addq $4, %rdx ; AVX-NEXT: addq $64, %rdi -; AVX-NEXT: cmpq %rcx, %rdx +; AVX-NEXT: cmpq %rdx, %rcx ; AVX-NEXT: jne .LBB1_8 ; AVX-NEXT: .LBB1_3: ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: je .LBB1_6 ; AVX-NEXT: # %bb.4: # %.preheader -; AVX-NEXT: shlq $4, %rcx -; AVX-NEXT: addq %rcx, %rsi +; AVX-NEXT: shlq $4, %rdx +; AVX-NEXT: addq %rdx, %rsi ; AVX-NEXT: shlq $4, %rax ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4, 0x90 @@ -94,18 +94,18 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { ; AVX512-NEXT: testl %edi, %edi ; AVX512-NEXT: jle .LBB1_6 ; AVX512-NEXT: # %bb.1: -; AVX512-NEXT: movl %edi, %edx -; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: movl %edi, %ecx +; AVX512-NEXT: movl %ecx, %eax ; AVX512-NEXT: andl $3, %eax ; AVX512-NEXT: cmpl $4, %edi ; AVX512-NEXT: jae .LBB1_7 ; AVX512-NEXT: # %bb.2: -; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: xorl %edx, %edx ; AVX512-NEXT: jmp .LBB1_3 ; AVX512-NEXT: .LBB1_7: -; AVX512-NEXT: andl $-4, %edx +; AVX512-NEXT: andl $-4, %ecx ; AVX512-NEXT: leaq 48(%rsi), %rdi -; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: xorl %edx, %edx ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd -48(%rdi), %xmm1, %xmm0 @@ -115,16 +115,16 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { ; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2 ; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: addq $4, %rcx +; AVX512-NEXT: addq $4, %rdx ; AVX512-NEXT: addq $64, %rdi -; AVX512-NEXT: cmpq %rcx, %rdx +; AVX512-NEXT: cmpq %rdx, %rcx ; AVX512-NEXT: jne .LBB1_8 ; AVX512-NEXT: .LBB1_3: ; AVX512-NEXT: testq %rax, %rax ; AVX512-NEXT: je .LBB1_6 ; AVX512-NEXT: # %bb.4: # %.preheader -; AVX512-NEXT: shlq $4, %rcx -; AVX512-NEXT: addq %rcx, %rsi +; AVX512-NEXT: shlq $4, %rdx +; AVX512-NEXT: addq %rdx, %rsi ; AVX512-NEXT: shlq $4, %rax ; AVX512-NEXT: xorl %ecx, %ecx ; 
AVX512-NEXT: .p2align 4, 0x90 @@ -394,18 +394,18 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { ; AVX-NEXT: testl %edi, %edi ; AVX-NEXT: jle .LBB4_6 ; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %edx -; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: movl %edi, %ecx +; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: andl $3, %eax ; AVX-NEXT: cmpl $4, %edi ; AVX-NEXT: jae .LBB4_7 ; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: xorl %edx, %edx ; AVX-NEXT: jmp .LBB4_3 ; AVX-NEXT: .LBB4_7: -; AVX-NEXT: andl $-4, %edx +; AVX-NEXT: andl $-4, %ecx ; AVX-NEXT: leaq 96(%rsi), %rdi -; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: xorl %edx, %edx ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0 @@ -415,16 +415,16 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { ; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2 ; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: addq $4, %rcx +; AVX-NEXT: addq $4, %rdx ; AVX-NEXT: subq $-128, %rdi -; AVX-NEXT: cmpq %rcx, %rdx +; AVX-NEXT: cmpq %rdx, %rcx ; AVX-NEXT: jne .LBB4_8 ; AVX-NEXT: .LBB4_3: ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: je .LBB4_6 ; AVX-NEXT: # %bb.4: # %.preheader -; AVX-NEXT: shlq $5, %rcx -; AVX-NEXT: addq %rcx, %rsi +; AVX-NEXT: shlq $5, %rdx +; AVX-NEXT: addq %rdx, %rsi ; AVX-NEXT: shlq $5, %rax ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4, 0x90 @@ -441,18 +441,18 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { ; AVX512-NEXT: testl %edi, %edi ; AVX512-NEXT: jle .LBB4_6 ; AVX512-NEXT: # %bb.1: -; AVX512-NEXT: movl %edi, %edx -; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: movl %edi, %ecx +; AVX512-NEXT: movl %ecx, %eax ; AVX512-NEXT: andl $3, %eax ; AVX512-NEXT: cmpl $4, %edi ; AVX512-NEXT: jae .LBB4_7 ; AVX512-NEXT: # %bb.2: -; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: xorl %edx, %edx ; AVX512-NEXT: jmp .LBB4_3 ; AVX512-NEXT: .LBB4_7: -; AVX512-NEXT: andl $-4, %edx +; AVX512-NEXT: andl $-4, %ecx ; AVX512-NEXT: leaq 96(%rsi), %rdi -; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: xorl %edx, %edx ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd -96(%rdi), %ymm1, %ymm0 @@ -462,16 +462,16 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { ; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2 ; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: addq $4, %rcx +; AVX512-NEXT: addq $4, %rdx ; AVX512-NEXT: subq $-128, %rdi -; AVX512-NEXT: cmpq %rcx, %rdx +; AVX512-NEXT: cmpq %rdx, %rcx ; AVX512-NEXT: jne .LBB4_8 ; AVX512-NEXT: .LBB4_3: ; AVX512-NEXT: testq %rax, %rax ; AVX512-NEXT: je .LBB4_6 ; AVX512-NEXT: # %bb.4: # %.preheader -; AVX512-NEXT: shlq $5, %rcx -; AVX512-NEXT: addq %rcx, %rsi +; AVX512-NEXT: shlq $5, %rdx +; AVX512-NEXT: addq %rdx, %rsi ; AVX512-NEXT: shlq $5, %rax ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4, 0x90 diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 7a82515ad24b72..6d418ba6f6eec0 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -320,47 +320,47 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; SSE2-NEXT: shrq $48, %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: movq %xmm0, %r12 -; SSE2-NEXT: movq %r12, %rax +; 
SSE2-NEXT: movq %xmm0, %rbx +; SSE2-NEXT: movq %rbx, %rax ; SSE2-NEXT: shrq $32, %rax ; SSE2-NEXT: movq %rax, (%rsp) # 8-byte Spill ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1] ; SSE2-NEXT: movq %xmm1, %r14 -; SSE2-NEXT: movq %r14, %rbp -; SSE2-NEXT: shrq $32, %rbp -; SSE2-NEXT: movq %r12, %r15 -; SSE2-NEXT: shrq $48, %r15 ; SSE2-NEXT: movq %r14, %r13 -; SSE2-NEXT: shrq $48, %r13 +; SSE2-NEXT: shrq $32, %r13 +; SSE2-NEXT: movq %rbx, %r15 +; SSE2-NEXT: shrq $48, %r15 +; SSE2-NEXT: movq %r14, %rbp +; SSE2-NEXT: shrq $48, %rbp ; SSE2-NEXT: movl %r14d, %eax ; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: movl %ebx, %eax ; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx -; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd %xmm0, %r12d +; SSE2-NEXT: shll $16, %r12d ; SSE2-NEXT: shll $16, %r14d ; SSE2-NEXT: movd %r14d, %xmm1 -; SSE2-NEXT: shll $16, %r12d -; SSE2-NEXT: movd %r12d, %xmm0 +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd %ebx, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movzwl %ax, %r12d -; SSE2-NEXT: orl %ebx, %r12d -; SSE2-NEXT: shll $16, %r13d -; SSE2-NEXT: movd %r13d, %xmm1 +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %r12d, %r14d +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movd %ebp, %xmm1 ; SSE2-NEXT: shll $16, %r15d ; SSE2-NEXT: movd %r15d, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %r14d -; SSE2-NEXT: shll $16, %r14d +; SSE2-NEXT: movd %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp -; SSE2-NEXT: movd %ebp, %xmm1 +; SSE2-NEXT: shll $16, %r13d +; SSE2-NEXT: movd %r13d, %xmm1 ; SSE2-NEXT: movq (%rsp), %rax # 8-byte Reload ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm0 @@ -368,9 +368,9 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %ebx -; SSE2-NEXT: orl %r14d, %ebx +; SSE2-NEXT: orl %ebp, %ebx ; SSE2-NEXT: shlq $32, %rbx -; SSE2-NEXT: orq %r12, %rbx +; SSE2-NEXT: orq %r14, %rbx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; SSE2-NEXT: movl %r15d, %eax ; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 @@ -552,8 +552,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; FP16-NEXT: pushq %r12 ; FP16-NEXT: pushq %rbx ; FP16-NEXT: subq $40, %rsp -; FP16-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; FP16-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; FP16-NEXT: vmovw %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm2 @@ -563,77 +563,77 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; FP16-NEXT: vaddss %xmm2, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT ; FP16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; FP16-NEXT: vpextrw $7, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; FP16-NEXT: vpextrw $7, %xmm1, %eax ; 
FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT ; FP16-NEXT: vmovd %xmm0, %ebp -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; FP16-NEXT: vpextrw $6, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; FP16-NEXT: vpextrw $6, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT ; FP16-NEXT: vmovd %xmm0, %r14d -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; FP16-NEXT: vpextrw $5, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; FP16-NEXT: vpextrw $5, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT ; FP16-NEXT: vmovd %xmm0, %r15d -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; FP16-NEXT: vpextrw $4, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; FP16-NEXT: vpextrw $4, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT ; FP16-NEXT: vmovd %xmm0, %r12d -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; FP16-NEXT: vpextrw $3, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; FP16-NEXT: vpextrw $3, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT ; FP16-NEXT: vmovd %xmm0, %r13d -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; FP16-NEXT: vpextrw $2, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; FP16-NEXT: vpextrw $2, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT ; FP16-NEXT: vmovd %xmm0, %ebx -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; FP16-NEXT: vpextrw $1, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; FP16-NEXT: vpextrw $1, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 @@ -1149,13 +1149,13 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded 
Spill ; SSE2-NEXT: movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: jne .LBB12_65 ; SSE2-NEXT: # %bb.66: # %cond.load91 ; SSE2-NEXT: movzwl (%rax), %eax @@ -1711,39 +1711,39 @@ define <4 x float> @pr64460_1(<4 x bfloat> %a) { define <8 x float> @pr64460_2(<8 x bfloat> %a) { ; SSE2-LABEL: pr64460_2: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: shrq $32, %rax -; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: shrq $32, %rdx +; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: shrq $32, %rsi -; SSE2-NEXT: movl %edx, %edi +; SSE2-NEXT: movl %eax, %edi ; SSE2-NEXT: andl $-65536, %edi # imm = 0xFFFF0000 ; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: movl %edx, %edi +; SSE2-NEXT: movl %eax, %edi ; SSE2-NEXT: shll $16, %edi ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: shrq $48, %rdx -; SSE2-NEXT: shll $16, %edx -; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: shrq $48, %rax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: shll $16, %esi ; SSE2-NEXT: movd %esi, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: andl $-65536, %edx # imm = 0xFFFF0000 -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shll $16, %edx -; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: shrq $48, %rcx ; SSE2-NEXT: shll $16, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: shll $16, %edx +; SSE2-NEXT: movd %edx, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE2-NEXT: retq @@ -1765,41 +1765,41 @@ define <16 x float> @pr64460_3(<16 x bfloat> %a) { ; SSE2-NEXT: movq %xmm1, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: shrq $32, %rax -; SSE2-NEXT: movq %xmm0, %r9 +; SSE2-NEXT: movq %xmm0, %r8 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: movq %xmm0, %rsi ; SSE2-NEXT: movq %rsi, %rdx ; SSE2-NEXT: shrq $32, %rdx -; SSE2-NEXT: movq %rdi, %r8 -; SSE2-NEXT: shrq $32, %r8 -; SSE2-NEXT: movq %r9, %r10 +; SSE2-NEXT: movq %rdi, %r9 +; SSE2-NEXT: shrq $32, %r9 +; SSE2-NEXT: movq %r8, %r10 ; SSE2-NEXT: shrq $32, %r10 -; SSE2-NEXT: movl %r9d, %r11d +; SSE2-NEXT: movl %r8d, %r11d ; SSE2-NEXT: andl $-65536, %r11d # imm = 0xFFFF0000 ; SSE2-NEXT: movd %r11d, %xmm1 -; SSE2-NEXT: movl %r9d, %r11d +; SSE2-NEXT: movl %r8d, 
%r11d ; SSE2-NEXT: shll $16, %r11d ; SSE2-NEXT: movd %r11d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: shrq $48, %r9 -; SSE2-NEXT: shll $16, %r9d -; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: shrq $48, %r8 +; SSE2-NEXT: shll $16, %r8d +; SSE2-NEXT: movd %r8d, %xmm1 ; SSE2-NEXT: shll $16, %r10d ; SSE2-NEXT: movd %r10d, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: movl %edi, %r9d -; SSE2-NEXT: andl $-65536, %r9d # imm = 0xFFFF0000 -; SSE2-NEXT: movd %r9d, %xmm1 -; SSE2-NEXT: movl %edi, %r9d -; SSE2-NEXT: shll $16, %r9d -; SSE2-NEXT: movd %r9d, %xmm2 +; SSE2-NEXT: movl %edi, %r8d +; SSE2-NEXT: andl $-65536, %r8d # imm = 0xFFFF0000 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movl %edi, %r8d +; SSE2-NEXT: shll $16, %r8d +; SSE2-NEXT: movd %r8d, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: shrq $48, %rdi ; SSE2-NEXT: shll $16, %edi ; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: shll $16, %r8d -; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: shll $16, %r9d +; SSE2-NEXT: movd %r9d, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: movl %esi, %edi @@ -1854,23 +1854,23 @@ define <16 x float> @pr64460_3(<16 x bfloat> %a) { define <8 x double> @pr64460_4(<8 x bfloat> %a) { ; SSE2-LABEL: pr64460_4: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: movq %rdx, %rax ; SSE2-NEXT: shrq $32, %rax -; SSE2-NEXT: movq %rdx, %rcx -; SSE2-NEXT: shrq $48, %rcx -; SSE2-NEXT: movq %rsi, %rdi +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: shrq $48, %rsi +; SSE2-NEXT: movq %rcx, %rdi ; SSE2-NEXT: shrq $32, %rdi -; SSE2-NEXT: movq %rsi, %r8 +; SSE2-NEXT: movq %rcx, %r8 ; SSE2-NEXT: shrq $48, %r8 -; SSE2-NEXT: movl %esi, %r9d +; SSE2-NEXT: movl %ecx, %r9d ; SSE2-NEXT: andl $-65536, %r9d # imm = 0xFFFF0000 ; SSE2-NEXT: movd %r9d, %xmm0 ; SSE2-NEXT: cvtss2sd %xmm0, %xmm1 -; SSE2-NEXT: shll $16, %esi -; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: shll $16, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: cvtss2sd %xmm0, %xmm0 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shll $16, %r8d @@ -1880,16 +1880,16 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) { ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: cvtss2sd %xmm1, %xmm1 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: movl %edx, %esi -; SSE2-NEXT: andl $-65536, %esi # imm = 0xFFFF0000 -; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: cvtss2sd %xmm2, %xmm3 ; SSE2-NEXT: shll $16, %edx ; SSE2-NEXT: movd %edx, %xmm2 ; SSE2-NEXT: cvtss2sd %xmm2, %xmm2 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: shll $16, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: shll $16, %esi +; SSE2-NEXT: movd %esi, %xmm3 ; SSE2-NEXT: cvtss2sd %xmm3, %xmm4 ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm3 diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll index 34ef23db345755..a87b129951de50 100644 --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -9,29 +9,29 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> 
%c, <4 x i64> %d) { ; SSE2-SSSE3-LABEL: v4i64: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm3 -; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm11 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm10[0,2] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm9[0,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE2-SSSE3-NEXT: andps %xmm11, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm10[1,3] -; SSE2-SSSE3-NEXT: orps %xmm0, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,3],xmm9[1,3] +; SSE2-SSSE3-NEXT: orps %xmm0, %xmm10 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm4 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm4 ; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm1 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 @@ -42,7 +42,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; SSE2-SSSE3-NEXT: andps %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] ; SSE2-SSSE3-NEXT: orps %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: andps %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: andps %xmm10, %xmm1 ; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq @@ -734,8 +734,8 @@ define i8 @v8f32_xor(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> define i8 @v8f32_xor_and(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, <8 x float> %e, <8 x float> %f) { ; SSE2-SSSE3-LABEL: v8f32_xor_and: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE2-SSSE3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 +; SSE2-SSSE3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE2-SSSE3-NEXT: cmpnleps %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: cmpnleps %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: movaps %xmm5, %xmm2 @@ -748,13 +748,13 @@ define i8 @v8f32_xor_and(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x fl ; SSE2-SSSE3-NEXT: cmpunordps %xmm6, %xmm4 ; SSE2-SSSE3-NEXT: orps %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: xorps %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8 -; SSE2-SSSE3-NEXT: andps %xmm4, %xmm8 ; SSE2-SSSE3-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9 -; SSE2-SSSE3-NEXT: andps %xmm5, %xmm9 -; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm8 -; SSE2-SSSE3-NEXT: packsswb %xmm8, %xmm8 -; SSE2-SSSE3-NEXT: pmovmskb %xmm8, %eax +; SSE2-SSSE3-NEXT: andps %xmm4, %xmm9 +; SSE2-SSSE3-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8 +; 
SSE2-SSSE3-NEXT: andps %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: packssdw %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: packsswb %xmm9, %xmm9 +; SSE2-SSSE3-NEXT: pmovmskb %xmm9, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 501e73c46af9cf..58b7fa4fded795 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -1133,29 +1133,29 @@ define i64 @bitcast_v128i8_to_v2i64(<128 x i8> %a0) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovmskb %xmm2, %eax ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpmovmskb %xmm2, %edx -; AVX1-NEXT: shll $16, %edx -; AVX1-NEXT: orl %eax, %edx -; AVX1-NEXT: vpmovmskb %xmm3, %eax -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 ; AVX1-NEXT: vpmovmskb %xmm2, %ecx ; AVX1-NEXT: shll $16, %ecx ; AVX1-NEXT: orl %eax, %ecx -; AVX1-NEXT: shlq $32, %rcx -; AVX1-NEXT: orq %rdx, %rcx -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %edx +; AVX1-NEXT: vpmovmskb %xmm3, %eax +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpmovmskb %xmm2, %edx ; AVX1-NEXT: shll $16, %edx ; AVX1-NEXT: orl %eax, %edx +; AVX1-NEXT: shlq $32, %rdx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: orl %eax, %ecx ; AVX1-NEXT: vpmovmskb %xmm1, %esi ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %esi, %eax ; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: addq %rdx, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1200,29 +1200,29 @@ define i1 @trunc_v128i8_cmp(<128 x i8> %a0) nounwind { ; SSE2-SSSE3-NEXT: shll $16, %ecx ; SSE2-SSSE3-NEXT: orl %eax, %ecx ; SSE2-SSSE3-NEXT: psllw $7, %xmm2 -; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %edx +; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax ; SSE2-SSSE3-NEXT: psllw $7, %xmm3 -; SSE2-SSSE3-NEXT: pmovmskb %xmm3, %eax -; SSE2-SSSE3-NEXT: shll $16, %eax -; SSE2-SSSE3-NEXT: orl %edx, %eax -; SSE2-SSSE3-NEXT: shlq $32, %rax -; SSE2-SSSE3-NEXT: orq %rcx, %rax +; SSE2-SSSE3-NEXT: pmovmskb %xmm3, %edx +; SSE2-SSSE3-NEXT: shll $16, %edx +; SSE2-SSSE3-NEXT: orl %eax, %edx +; SSE2-SSSE3-NEXT: shlq $32, %rdx +; SSE2-SSSE3-NEXT: orq %rcx, %rdx ; SSE2-SSSE3-NEXT: psllw $7, %xmm4 -; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %ecx +; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax ; SSE2-SSSE3-NEXT: psllw $7, %xmm5 -; SSE2-SSSE3-NEXT: pmovmskb %xmm5, %edx -; SSE2-SSSE3-NEXT: shll $16, %edx -; SSE2-SSSE3-NEXT: orl %ecx, %edx +; SSE2-SSSE3-NEXT: pmovmskb %xmm5, %ecx +; SSE2-SSSE3-NEXT: shll $16, %ecx +; SSE2-SSSE3-NEXT: orl %eax, %ecx ; SSE2-SSSE3-NEXT: psllw $7, %xmm6 -; SSE2-SSSE3-NEXT: pmovmskb %xmm6, %ecx +; SSE2-SSSE3-NEXT: pmovmskb %xmm6, %eax ; SSE2-SSSE3-NEXT: psllw $7, %xmm7 ; SSE2-SSSE3-NEXT: pmovmskb %xmm7, %esi ; SSE2-SSSE3-NEXT: shll $16, %esi -; SSE2-SSSE3-NEXT: orl %ecx, %esi +; SSE2-SSSE3-NEXT: orl %eax, %esi ; SSE2-SSSE3-NEXT: shlq $32, %rsi -; SSE2-SSSE3-NEXT: orq %rdx, %rsi +; SSE2-SSSE3-NEXT: orq %rcx, %rsi ; SSE2-SSSE3-NEXT: movq %rsi, %xmm0 -; SSE2-SSSE3-NEXT: movq %rax, %xmm1 +; SSE2-SSSE3-NEXT: movq %rdx, %xmm1 ; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqb 
%xmm1, %xmm0 @@ -1240,29 +1240,29 @@ define i1 @trunc_v128i8_cmp(<128 x i8> %a0) nounwind { ; SSE41-NEXT: shll $16, %ecx ; SSE41-NEXT: orl %eax, %ecx ; SSE41-NEXT: psllw $7, %xmm2 -; SSE41-NEXT: pmovmskb %xmm2, %edx +; SSE41-NEXT: pmovmskb %xmm2, %eax ; SSE41-NEXT: psllw $7, %xmm3 -; SSE41-NEXT: pmovmskb %xmm3, %eax -; SSE41-NEXT: shll $16, %eax -; SSE41-NEXT: orl %edx, %eax -; SSE41-NEXT: shlq $32, %rax -; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: pmovmskb %xmm3, %edx +; SSE41-NEXT: shll $16, %edx +; SSE41-NEXT: orl %eax, %edx +; SSE41-NEXT: shlq $32, %rdx +; SSE41-NEXT: orq %rcx, %rdx ; SSE41-NEXT: psllw $7, %xmm4 -; SSE41-NEXT: pmovmskb %xmm4, %ecx +; SSE41-NEXT: pmovmskb %xmm4, %eax ; SSE41-NEXT: psllw $7, %xmm5 -; SSE41-NEXT: pmovmskb %xmm5, %edx -; SSE41-NEXT: shll $16, %edx -; SSE41-NEXT: orl %ecx, %edx +; SSE41-NEXT: pmovmskb %xmm5, %ecx +; SSE41-NEXT: shll $16, %ecx +; SSE41-NEXT: orl %eax, %ecx ; SSE41-NEXT: psllw $7, %xmm6 -; SSE41-NEXT: pmovmskb %xmm6, %ecx +; SSE41-NEXT: pmovmskb %xmm6, %eax ; SSE41-NEXT: psllw $7, %xmm7 ; SSE41-NEXT: pmovmskb %xmm7, %esi ; SSE41-NEXT: shll $16, %esi -; SSE41-NEXT: orl %ecx, %esi +; SSE41-NEXT: orl %eax, %esi ; SSE41-NEXT: shlq $32, %rsi -; SSE41-NEXT: orq %rdx, %rsi +; SSE41-NEXT: orq %rcx, %rsi ; SSE41-NEXT: movq %rsi, %xmm0 -; SSE41-NEXT: movq %rax, %xmm1 +; SSE41-NEXT: movq %rdx, %xmm1 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE41-NEXT: ptest %xmm0, %xmm1 @@ -1279,32 +1279,32 @@ define i1 @trunc_v128i8_cmp(<128 x i8> %a0) nounwind { ; AVX1-NEXT: shll $16, %ecx ; AVX1-NEXT: orl %eax, %ecx ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %edx +; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: orl %edx, %eax -; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vpmovmskb %xmm0, %edx +; AVX1-NEXT: shll $16, %edx +; AVX1-NEXT: orl %eax, %edx +; AVX1-NEXT: shlq $32, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vpsllw $7, %xmm2, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %edx -; AVX1-NEXT: shll $16, %edx -; AVX1-NEXT: orl %ecx, %edx -; AVX1-NEXT: vpsllw $7, %xmm3, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: orl %eax, %ecx +; AVX1-NEXT: vpsllw $7, %xmm3, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %esi ; AVX1-NEXT: shll $16, %esi -; AVX1-NEXT: orl %ecx, %esi +; AVX1-NEXT: orl %eax, %esi ; AVX1-NEXT: shlq $32, %rsi -; AVX1-NEXT: orq %rdx, %rsi +; AVX1-NEXT: orq %rcx, %rsi ; AVX1-NEXT: vmovq %rsi, %xmm0 -; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vmovq %rdx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vptest %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index 9daac1df1d9750..40b76f0c3bbe26 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -710,7 +710,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $60, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -733,7 +733,8 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431633920, %ebp # imm = 0x55550000 ; X86-NEXT: shrl %ebx ; X86-NEXT: andl $1431633920, %ebx # imm = 0x55550000 -; X86-NEXT: leal (%ebx,%ebp,2), %ebp +; X86-NEXT: leal (%ebx,%ebp,2), %ebx +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NEXT: bswapl %edi ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F @@ -768,7 +769,8 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555 ; X86-NEXT: shrl %esi ; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 -; X86-NEXT: leal (%esi,%edi,2), %ebx +; X86-NEXT: leal (%esi,%edi,2), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bswapl %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F @@ -897,7 +899,8 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%ecx,2), %edi +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -915,8 +918,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%ecx,2), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal (%eax,%ecx,2), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -935,7 +937,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -991,8 +993,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%ecx,2), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal (%eax,%ecx,2), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -1010,8 +1011,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%ecx,2), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal (%eax,%ecx,2), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -1030,11 +1030,14 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %edx -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shrdl $16, %ecx, %esi -; X86-NEXT: 
movl %ebx, %eax -; X86-NEXT: shrdl $16, %ebx, %ecx +; X86-NEXT: shrdl $16, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shrdl $16, %ecx, %eax @@ -1054,32 +1057,28 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shrdl $16, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shrdl $16, %edi, %eax +; X86-NEXT: shrdl $16, %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $16, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: shrdl $16, %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrdl $16, %eax, %ebp +; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shrdl $16, %ebp, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shrdl $16, %ebx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shrdl $16, %edi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shrdl $16, %ecx, %edi -; X86-NEXT: shrdl $16, %edx, %ecx +; X86-NEXT: shrdl $16, %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %ebp +; X86-NEXT: shrdl $16, %ebx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrdl $16, %esi, %ebx +; X86-NEXT: shrdl $16, %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 60(%eax) -; X86-NEXT: movl %edi, 56(%eax) -; X86-NEXT: movl %ebx, 52(%eax) +; X86-NEXT: movl %esi, 60(%eax) +; X86-NEXT: movl %ebx, 56(%eax) +; X86-NEXT: movl %ecx, 52(%eax) ; X86-NEXT: movl %ebp, 48(%eax) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 44(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 44(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 40(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 36(%eax) @@ -1099,10 +1098,10 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: shrl $16, %edx ; X86-NEXT: movw %dx, 64(%eax) -; X86-NEXT: addl $60, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1359,25 +1358,25 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86XOP-NEXT: vmovd %xmm1, %ebp ; X86XOP-NEXT: shrdl $16, %ebp, %eax ; X86XOP-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86XOP-NEXT: vpextrd $1, %xmm1, %ebx -; X86XOP-NEXT: shrdl $16, %ebx, %ebp +; X86XOP-NEXT: vpextrd $1, %xmm1, %edi +; X86XOP-NEXT: shrdl $16, %edi, %ebp ; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 ; X86XOP-NEXT: vmovd %xmm1, %esi -; X86XOP-NEXT: shrdl $16, %esi, 
%ebx +; X86XOP-NEXT: shrdl $16, %esi, %edi ; X86XOP-NEXT: vpextrd $1, %xmm1, %edx ; X86XOP-NEXT: shrdl $16, %edx, %esi ; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm0 ; X86XOP-NEXT: vmovd %xmm0, %ecx ; X86XOP-NEXT: shrdl $16, %ecx, %edx -; X86XOP-NEXT: vpextrd $1, %xmm0, %edi -; X86XOP-NEXT: shrdl $16, %edi, %ecx +; X86XOP-NEXT: vpextrd $1, %xmm0, %ebx +; X86XOP-NEXT: shrdl $16, %ebx, %ecx ; X86XOP-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86XOP-NEXT: movl %ecx, 60(%eax) ; X86XOP-NEXT: movl %edx, 56(%eax) ; X86XOP-NEXT: movl %esi, 52(%eax) -; X86XOP-NEXT: movl %ebx, 48(%eax) +; X86XOP-NEXT: movl %edi, 48(%eax) ; X86XOP-NEXT: movl %ebp, 44(%eax) ; X86XOP-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86XOP-NEXT: movl %ecx, 40(%eax) @@ -1401,8 +1400,8 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86XOP-NEXT: movl %ecx, 4(%eax) ; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86XOP-NEXT: movl %ecx, (%eax) -; X86XOP-NEXT: shrl $16, %edi -; X86XOP-NEXT: movw %di, 64(%eax) +; X86XOP-NEXT: shrl $16, %ebx +; X86XOP-NEXT: movw %bx, 64(%eax) ; X86XOP-NEXT: addl $44, %esp ; X86XOP-NEXT: popl %esi ; X86XOP-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/bmi-out-of-order.ll b/llvm/test/CodeGen/X86/bmi-out-of-order.ll index 3e23a5f348bde4..49d9764f613796 100644 --- a/llvm/test/CodeGen/X86/bmi-out-of-order.ll +++ b/llvm/test/CodeGen/X86/bmi-out-of-order.ll @@ -80,11 +80,11 @@ define i64 @blsmask_through3(i64 %a, i64 %b, i64 %c, i64 %d) nounwind { ; X86-LABEL: blsmask_through3: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $-1, %eax -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: movl %esi, %edx ; X86-NEXT: adcl $-1, %edx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax @@ -92,8 +92,8 @@ define i64 @blsmask_through3(i64 %a, i64 %b, i64 %c, i64 %d) nounwind { ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %esi, %eax -; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -154,19 +154,19 @@ define i64 @blsmask_through1_used2(i64 %a, i64 %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: addl $-1, %edi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: adcl $-1, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %ebp, %ebx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: xorl %ebx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %edi, %eax -; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebp, %esi ; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: imull %eax, %ebp +; X86-NEXT: imull %eax, %ebx ; X86-NEXT: mull %edi -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: imull %edi, %ebx ; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %edi, %ebp +; X86-NEXT: addl %ebp, %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: popl %esi @@ -531,19 +531,19 @@ define i64 @blsr_through1_used2(i64 %a, i64 %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: addl $-1, %edi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: adcl $-1, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: andl %ebp, %ebx +; X86-NEXT: 
movl %esi, %ebx +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: andl %ebx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl %edi, %eax -; X86-NEXT: andl %ebx, %esi +; X86-NEXT: andl %ebp, %esi ; X86-NEXT: andl %eax, %ecx -; X86-NEXT: imull %eax, %ebp +; X86-NEXT: imull %eax, %ebx ; X86-NEXT: mull %edi -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: imull %edi, %ebx ; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %edi, %ebp +; X86-NEXT: addl %ebp, %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/bool-vector.ll b/llvm/test/CodeGen/X86/bool-vector.ll index e4deb878aa461d..b0fe0b92df2995 100644 --- a/llvm/test/CodeGen/X86/bool-vector.ll +++ b/llvm/test/CodeGen/X86/bool-vector.ll @@ -10,18 +10,18 @@ define i32 @PR15215_bad(<4 x i32> %input) { ; X86-LABEL: PR15215_bad: ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: shlb $3, %ah -; X86-NEXT: andb $1, %cl -; X86-NEXT: shlb $2, %cl -; X86-NEXT: orb %ah, %cl -; X86-NEXT: addb %dl, %dl +; X86-NEXT: andb $1, %dl +; X86-NEXT: shlb $2, %dl +; X86-NEXT: orb %ah, %dl +; X86-NEXT: addb %cl, %cl ; X86-NEXT: andb $1, %al -; X86-NEXT: orb %dl, %al -; X86-NEXT: andb $3, %al ; X86-NEXT: orb %cl, %al +; X86-NEXT: andb $3, %al +; X86-NEXT: orb %dl, %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: andl $15, %eax ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll index 17fd612b812ebc..4e86d87bc10af3 100644 --- a/llvm/test/CodeGen/X86/bswap.ll +++ b/llvm/test/CodeGen/X86/bswap.ll @@ -300,51 +300,52 @@ define i528 @large_promotion(i528 %A) nounwind { ; CHECK-NEXT: bswapl %ebp ; CHECK-NEXT: shrdl $16, %ebp, %ecx ; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: bswapl %ebx -; CHECK-NEXT: shrdl $16, %ebx, %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: bswapl %edi +; CHECK-NEXT: shrdl $16, %edi, %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: bswapl %esi -; CHECK-NEXT: shrdl $16, %esi, %ebx +; CHECK-NEXT: shrdl $16, %esi, %edi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: bswapl %edx ; CHECK-NEXT: shrdl $16, %edx, %esi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: bswapl %ecx -; CHECK-NEXT: shrdl $16, %ecx, %edx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: bswapl %edi -; CHECK-NEXT: shrdl $16, %edi, %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %ecx, 60(%eax) -; CHECK-NEXT: movl %edx, 56(%eax) -; CHECK-NEXT: movl %esi, 52(%eax) -; CHECK-NEXT: movl %ebx, 48(%eax) -; CHECK-NEXT: movl %ebp, 44(%eax) -; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 40(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 36(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 32(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 28(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 24(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 20(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 16(%eax) -; CHECK-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 12(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 8(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 4(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: shrl $16, %edi -; CHECK-NEXT: movw %di, 64(%eax) +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrdl $16, %eax, %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: bswapl %ebx +; CHECK-NEXT: shrdl $16, %ebx, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %eax, 60(%ecx) +; CHECK-NEXT: movl %edx, 56(%ecx) +; CHECK-NEXT: movl %esi, 52(%ecx) +; CHECK-NEXT: movl %edi, 48(%ecx) +; CHECK-NEXT: movl %ebp, 44(%ecx) +; CHECK-NEXT: movl (%esp), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 40(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 36(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 32(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 28(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 24(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 20(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 16(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 12(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 8(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 4(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, (%ecx) +; CHECK-NEXT: shrl $16, %ebx +; CHECK-NEXT: movw %bx, 64(%ecx) +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi @@ -356,19 +357,19 @@ define i528 @large_promotion(i528 %A) nounwind { ; CHECK64: # %bb.0: ; CHECK64-NEXT: pushq %rbx ; CHECK64-NEXT: movq %rdi, %rax -; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK64-NEXT: bswapq %rdi -; CHECK64-NEXT: bswapq %r10 -; CHECK64-NEXT: shrdq $48, %r10, %rdi -; CHECK64-NEXT: bswapq %r11 -; CHECK64-NEXT: shrdq $48, %r11, %r10 +; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; CHECK64-NEXT: bswapq %rbx -; CHECK64-NEXT: shrdq $48, %rbx, %r11 +; CHECK64-NEXT: bswapq %r11 +; CHECK64-NEXT: shrdq $48, %r11, %rbx +; CHECK64-NEXT: bswapq %r10 +; CHECK64-NEXT: shrdq $48, %r10, %r11 +; CHECK64-NEXT: bswapq %rdi +; CHECK64-NEXT: shrdq $48, %rdi, %r10 ; CHECK64-NEXT: bswapq %r9 -; CHECK64-NEXT: shrdq $48, %r9, %rbx +; CHECK64-NEXT: shrdq $48, %r9, %rdi ; CHECK64-NEXT: bswapq %r8 ; CHECK64-NEXT: shrdq $48, %r8, %r9 ; CHECK64-NEXT: bswapq %rcx @@ -382,10 +383,10 @@ define i528 @large_promotion(i528 %A) nounwind { ; CHECK64-NEXT: movq %rcx, 48(%rax) ; CHECK64-NEXT: movq %r8, 40(%rax) ; CHECK64-NEXT: movq %r9, 32(%rax) -; CHECK64-NEXT: movq %rbx, 24(%rax) -; CHECK64-NEXT: movq %r11, 16(%rax) -; CHECK64-NEXT: movq %r10, 8(%rax) -; CHECK64-NEXT: movq %rdi, (%rax) +; CHECK64-NEXT: movq 
%rdi, 24(%rax) +; CHECK64-NEXT: movq %r10, 16(%rax) +; CHECK64-NEXT: movq %r11, 8(%rax) +; CHECK64-NEXT: movq %rbx, (%rax) ; CHECK64-NEXT: movw %si, 64(%rax) ; CHECK64-NEXT: popq %rbx ; CHECK64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll index 89aee965a2c1f6..90ee5e7850a626 100644 --- a/llvm/test/CodeGen/X86/combine-sbb.ll +++ b/llvm/test/CodeGen/X86/combine-sbb.ll @@ -60,22 +60,22 @@ define void @PR25858_i64(ptr sret(%WideUInt64), ptr, ptr) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl (%edi), %edx -; X86-NEXT: movl 4(%edi), %esi -; X86-NEXT: movl 12(%edi), %ecx -; X86-NEXT: movl 8(%edi), %edi -; X86-NEXT: subl 8(%ebx), %edi -; X86-NEXT: sbbl 12(%ebx), %ecx -; X86-NEXT: subl (%ebx), %edx -; X86-NEXT: sbbl 4(%ebx), %esi -; X86-NEXT: sbbl $0, %edi -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl 4(%edx), %edi +; X86-NEXT: movl 12(%edx), %ebx +; X86-NEXT: movl 8(%edx), %edx +; X86-NEXT: subl 8(%ecx), %edx +; X86-NEXT: sbbl 12(%ecx), %ebx +; X86-NEXT: subl (%ecx), %esi +; X86-NEXT: sbbl 4(%ecx), %edi +; X86-NEXT: sbbl $0, %edx +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 549fe726269730..adb6f6a7a1412d 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -352,30 +352,30 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2] -; SSE41-NEXT: pmullw %xmm0, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: pmullw %xmm0, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: paddb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE41-NEXT: pmullw %xmm0, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: packuswb %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: 
punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE41-NEXT: psraw $8, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128] -; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [256,64,128,16,32,8,4,128] +; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: psraw $8, %xmm2 -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: psraw $8, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: packuswb %xmm0, %xmm3 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -579,59 +579,60 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psraw $15, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = -; SSE2-NEXT: pmulhuw %xmm7, %xmm0 -; SSE2-NEXT: paddw %xmm3, %xmm0 +; SSE2-NEXT: pmulhuw %xmm7, %xmm2 +; SSE2-NEXT: paddw %xmm0, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: psraw $4, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pandn %xmm0, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535] -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: psraw $2, %xmm6 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,0] -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: psraw $1, %xmm3 ; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: pandn %xmm2, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pandn %xmm3, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pand %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psraw $15, %xmm3 -; SSE2-NEXT: pmulhuw %xmm7, %xmm3 -; SSE2-NEXT: paddw %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psraw $15, %xmm0 +; SSE2-NEXT: pmulhuw %xmm7, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: 
pand %xmm4, %xmm7 -; SSE2-NEXT: psraw $4, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm4 ; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: psraw $2, %xmm4 ; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: psraw $1, %xmm5 ; SSE2-NEXT: pandn %xmm5, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm2, %xmm6 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; @@ -739,155 +740,155 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = -; SSE2-NEXT: pmulhuw %xmm9, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,0,0,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = +; SSE2-NEXT: pmulhuw %xmm10, %xmm0 +; SSE2-NEXT: paddw %xmm5, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,0,0,65535] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: pandn %xmm0, %xmm8 -; SSE2-NEXT: por %xmm4, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535] -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: psraw $2, %xmm8 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: pandn %xmm0, %xmm6 +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,0,65535] +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: psraw $2, %xmm6 +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,0,0,65535,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,0,0,65535,0] ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: psraw $1, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm10 -; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm4, %xmm10 +; SSE2-NEXT: pand %xmm4, %xmm6 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: pmulhuw %xmm9, %xmm1 -; SSE2-NEXT: paddw %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: psraw $4, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm11 -; SSE2-NEXT: pandn %xmm1, %xmm11 -; SSE2-NEXT: por %xmm10, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: psraw $2, 
%xmm11 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pandn %xmm11, %xmm10 -; SSE2-NEXT: por %xmm1, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: psraw $1, %xmm10 -; SSE2-NEXT: movdqa %xmm8, %xmm11 -; SSE2-NEXT: pandn %xmm10, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 -; SSE2-NEXT: pand %xmm4, %xmm11 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm5, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psraw $15, %xmm5 -; SSE2-NEXT: pmulhuw %xmm9, %xmm5 -; SSE2-NEXT: paddw %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 +; SSE2-NEXT: pmulhuw %xmm10, %xmm5 +; SSE2-NEXT: paddw %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm6 ; SSE2-NEXT: psraw $4, %xmm5 -; SSE2-NEXT: movdqa %xmm6, %xmm11 +; SSE2-NEXT: movdqa %xmm7, %xmm11 ; SSE2-NEXT: pandn %xmm5, %xmm11 -; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: por %xmm6, %xmm11 ; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: pand %xmm7, %xmm5 -; SSE2-NEXT: psraw $2, %xmm11 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pandn %xmm11, %xmm10 -; SSE2-NEXT: por %xmm5, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: psraw $1, %xmm10 -; SSE2-NEXT: movdqa %xmm8, %xmm11 -; SSE2-NEXT: pandn %xmm10, %xmm11 +; SSE2-NEXT: psraw $2, %xmm11 +; SSE2-NEXT: movdqa %xmm8, %xmm6 +; SSE2-NEXT: pandn %xmm11, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: psraw $1, %xmm6 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm11 ; SSE2-NEXT: por %xmm5, %xmm11 ; SSE2-NEXT: pand %xmm4, %xmm11 ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm5 ; SSE2-NEXT: por %xmm11, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: pmulhuw %xmm9, %xmm2 -; SSE2-NEXT: paddw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm6, %xmm9 -; SSE2-NEXT: psraw $4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: pmulhuw %xmm10, %xmm1 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: psraw $4, %xmm1 +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm6, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: psraw $2, %xmm11 +; SSE2-NEXT: movdqa %xmm8, %xmm6 +; SSE2-NEXT: pandn %xmm11, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: psraw $1, %xmm6 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm11 +; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm4, %xmm11 +; SSE2-NEXT: movdqa %xmm4, %xmm6 ; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: por %xmm9, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: por %xmm11, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: pmulhuw %xmm10, %xmm1 +; SSE2-NEXT: paddw %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: psraw $2, %xmm6 -; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: psraw $4, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm7 ; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: psraw $1, %xmm7 +; SSE2-NEXT: movdqa %xmm7, 
%xmm1 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: psraw $2, %xmm7 ; SSE2-NEXT: pandn %xmm7, %xmm8 -; SSE2-NEXT: por %xmm2, %xmm8 -; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: por %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: psraw $1, %xmm8 +; SSE2-NEXT: pandn %xmm8, %xmm9 +; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: por %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psraw $15, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psraw $15, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = -; SSE41-NEXT: pmulhuw %xmm5, %xmm6 -; SSE41-NEXT: paddw %xmm0, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pmulhw %xmm4, %xmm7 -; SSE41-NEXT: psraw $1, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: psraw $15, %xmm6 -; SSE41-NEXT: pmulhuw %xmm5, %xmm6 -; SSE41-NEXT: paddw %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pmulhw %xmm4, %xmm7 -; SSE41-NEXT: psraw $1, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: psraw $15, %xmm6 -; SSE41-NEXT: pmulhuw %xmm5, %xmm6 -; SSE41-NEXT: paddw %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pmulhw %xmm4, %xmm7 -; SSE41-NEXT: psraw $1, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: psraw $15, %xmm6 -; SSE41-NEXT: pmulhuw %xmm5, %xmm6 -; SSE41-NEXT: paddw %xmm3, %xmm6 -; SSE41-NEXT: pmulhw %xmm6, %xmm4 -; SSE41-NEXT: psraw $1, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: pmulhuw %xmm5, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pmulhw %xmm6, %xmm7 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4,5,6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psraw $15, %xmm4 +; SSE41-NEXT: pmulhuw %xmm5, %xmm4 +; SSE41-NEXT: paddw %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pmulhw %xmm6, %xmm7 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4,5,6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $15, %xmm4 +; SSE41-NEXT: pmulhuw %xmm5, %xmm4 +; SSE41-NEXT: paddw %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pmulhw %xmm6, %xmm7 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4,5,6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $15, %xmm4 +; SSE41-NEXT: 
pmulhuw %xmm5, %xmm4 +; SSE41-NEXT: paddw %xmm3, %xmm4 +; SSE41-NEXT: pmulhw %xmm4, %xmm6 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4,5,6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16: @@ -1858,45 +1859,45 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlq $60, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlq $61, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $4, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $3, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 ; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5 ; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5 ; AVX1-NEXT: vpsrad $2, %xmm5, %xmm6 ; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm5 ; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm6 ; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 -; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $4, %xmm2, %xmm5 +; AVX1-NEXT: vpsrlq $3, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq $62, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpsrad $2, %xmm3, %xmm4 +; AVX1-NEXT: vpsrlq $2, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; 
AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-NEXT: retq ; @@ -2190,34 +2191,34 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: psllw $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: psllw $1, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7] +; SSE41-NEXT: psrlw $8, %xmm4 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: paddb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE41-NEXT: packuswb %xmm2, %xmm4 +; SSE41-NEXT: paddb %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE41-NEXT: psraw $8, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $1, %xmm2 ; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6],xmm2[7] ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: psraw $8, %xmm2 -; SSE41-NEXT: psllw $7, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: psraw $8, %xmm4 +; SSE41-NEXT: psllw $7, %xmm4 +; SSE41-NEXT: psrlw $8, %xmm4 +; SSE41-NEXT: packuswb %xmm0, %xmm4 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] ; SSE41-NEXT: pxor %xmm0, %xmm1 ; SSE41-NEXT: psubb %xmm0, 
%xmm1 diff --git a/llvm/test/CodeGen/X86/copy-eflags.ll b/llvm/test/CodeGen/X86/copy-eflags.ll index 6af80860401afd..b17aa4b9a002f9 100644 --- a/llvm/test/CodeGen/X86/copy-eflags.ll +++ b/llvm/test/CodeGen/X86/copy-eflags.ll @@ -297,22 +297,22 @@ define dso_local void @PR37431(ptr %arg1, ptr %arg2, ptr %arg3, i32 %arg4, i64 % ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: movl (%edi), %edi -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: sarl $31, %ebp -; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: movl %edi, %ebx +; X32-NEXT: sarl $31, %ebx +; X32-NEXT: xorl %ecx, %ecx ; X32-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X32-NEXT: sbbl %ebp, %esi -; X32-NEXT: sbbl %ebx, %ebx -; X32-NEXT: movb %bl, (%edx) +; X32-NEXT: sbbl %ebx, %esi +; X32-NEXT: sbbl %ecx, %ecx +; X32-NEXT: movb %cl, (%edx) ; X32-NEXT: cltd -; X32-NEXT: idivl %ebx -; X32-NEXT: movb %dl, (%ecx) +; X32-NEXT: idivl %ecx +; X32-NEXT: movb %dl, (%ebp) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll index adb7319fe80b11..54a2cc79d6eafe 100644 --- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll +++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -24,38 +24,38 @@ define void @_Z1nv() local_unnamed_addr { ; CHECK-LABEL: _Z1nv: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq k@GOTPCREL(%rip), %rax -; CHECK-NEXT: movl 4(%rax), %edx -; CHECK-NEXT: movq c@GOTPCREL(%rip), %rax -; CHECK-NEXT: movswl (%rax), %ecx -; CHECK-NEXT: movq b@GOTPCREL(%rip), %rax -; CHECK-NEXT: movswl (%rax), %edi -; CHECK-NEXT: movq a@GOTPCREL(%rip), %rsi -; CHECK-NEXT: movl (%rsi), %esi +; CHECK-NEXT: movl 4(%rax), %eax +; CHECK-NEXT: movq c@GOTPCREL(%rip), %rcx +; CHECK-NEXT: movswl (%rcx), %ecx +; CHECK-NEXT: movq b@GOTPCREL(%rip), %rdx +; CHECK-NEXT: movswl (%rdx), %esi +; CHECK-NEXT: movq a@GOTPCREL(%rip), %rdi +; CHECK-NEXT: movl (%rdi), %edi ; CHECK-NEXT: movq l@GOTPCREL(%rip), %r8 ; CHECK-NEXT: movl (%r8), %r8d ; CHECK-NEXT: movl %r8d, %r9d ; CHECK-NEXT: shll $7, %r9d ; CHECK-NEXT: sarl $7, %r9d ; CHECK-NEXT: negl %r9d -; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: cmovel %esi, %r9d -; CHECK-NEXT: movzwl %dx, %r10d +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovel %edi, %r9d +; CHECK-NEXT: movzwl %ax, %r10d ; CHECK-NEXT: leal (%rcx,%r10,2), %ecx -; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: addl %esi, %ecx ; CHECK-NEXT: cmpl %r9d, %ecx -; CHECK-NEXT: sete %dil +; CHECK-NEXT: sete %sil ; CHECK-NEXT: testl $33554431, %r8d # imm = 0x1FFFFFF ; CHECK-NEXT: sete %r8b -; CHECK-NEXT: orb %dil, %r8b -; CHECK-NEXT: movzbl %r8b, %edi +; CHECK-NEXT: orb %sil, %r8b +; CHECK-NEXT: movzbl %r8b, %esi ; CHECK-NEXT: movq e@GOTPCREL(%rip), %r8 -; CHECK-NEXT: movw %di, (%r8) +; CHECK-NEXT: movw %si, (%r8) ; CHECK-NEXT: notl %ecx ; CHECK-NEXT: shrl $31, %ecx -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: sarl %cl, %esi -; CHECK-NEXT: movw %si, (%rax) +; CHECK-NEXT: sarl %cl, %edi +; CHECK-NEXT: movw %di, (%rdx) ; CHECK-NEXT: retq entry: %bf.load = load i32, ptr getelementptr inbounds (%struct.m, ptr @k, i64 0, i32 0, i32 1), align 4 @@ -117,20 +117,20 @@ define void @_Z2x6v() local_unnamed_addr { ; 
CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax ; CHECK-NEXT: movl (%rax), %ebx ; CHECK-NEXT: andl $511, %ebx # imm = 0x1FF -; CHECK-NEXT: leaq 1(%rbx), %rax -; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx -; CHECK-NEXT: movl %eax, (%rcx) -; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx -; CHECK-NEXT: movl (%rcx), %ecx +; CHECK-NEXT: leaq 1(%rbx), %rdx +; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rax +; CHECK-NEXT: movl %edx, (%rax) +; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rax +; CHECK-NEXT: movl (%rax), %ecx ; CHECK-NEXT: testl %ecx, %ecx ; CHECK-NEXT: je .LBB1_18 ; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph -; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdx -; CHECK-NEXT: movq (%rdx), %rsi -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: notl %edx -; CHECK-NEXT: leaq 8(,%rdx,8), %rdi -; CHECK-NEXT: imulq %rax, %rdi +; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rax +; CHECK-NEXT: movq (%rax), %rsi +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: notl %eax +; CHECK-NEXT: leaq 8(,%rax,8), %rdi +; CHECK-NEXT: imulq %rdx, %rdi ; CHECK-NEXT: addq %rsi, %rdi ; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r8 ; CHECK-NEXT: movl (%r8), %edx diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll index 3efd536adc4d18..1bc5851384db12 100644 --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -51,43 +51,43 @@ define i96 @square_high(i96 %x) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: setb %al -; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: movzbl %al, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: movl %esi, %eax ; X86-NEXT: adcl $0, %eax ; X86-NEXT: setb %dl -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl %ebp, %edi ; X86-NEXT: movzbl %dl, %ecx ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: adcl $0, %ecx diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index d26f4b7044cf3c..4f12e71d4a1b3c 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -177,103 +177,105 
@@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $152, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: subl $148, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: sarl $31, %ebx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: xorl %ebp, %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: subl %edx, %ebp -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: xorl %eax, %edx +; X86-NEXT: sbbl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi +; X86-NEXT: subl %ebx, %edi +; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al -; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: movb %al, (%esp) # 1-byte Spill +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl %ebx, %ecx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl 
%edi, %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %esi, %edx +; X86-NEXT: bsrl %ebp, %edx ; X86-NEXT: xorl $31, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: bsrl %edi, %edi +; X86-NEXT: xorl $31, %edi +; X86-NEXT: addl $32, %edi ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: bsrl %ebp, %ebp -; X86-NEXT: xorl $31, %ebp -; X86-NEXT: addl $32, %ebp -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %esi, %esi -; X86-NEXT: cmovnel %edx, %ebp -; X86-NEXT: addl $64, %ebp +; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: cmovnel %edx, %edi +; X86-NEXT: addl $64, %edi +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: cmovnel %ecx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %edx +; X86-NEXT: cmovnel %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: bsrl %ebp, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: bsrl %eax, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: bsrl %ebx, %esi -; X86-NEXT: xorl $31, %esi -; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: addl $32, %edx -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: addl $64, %edx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl %edx, %ebp +; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: xorl $31, %esi +; X86-NEXT: addl $32, %esi +; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmovnel %edx, %esi +; X86-NEXT: addl $64, %esi +; X86-NEXT: orl %ebp, %eax +; X86-NEXT: cmovnel %ecx, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: subl %esi, %ebx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: movl $0, %edx @@ -281,9 +283,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi ; X86-NEXT: movl $127, %ecx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebp, %ecx -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ecx @@ -291,130 +292,123 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: setb %cl -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: cmovnel %ebx, %edi -; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: cmovnel 
%ebx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovnel %ebx, %eax -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: jne .LBB4_1 -; X86-NEXT: # %bb.8: # %_udiv-special-cases -; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: cmovnel %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: cmovnel %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: xorl $127, %ebp -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %ebx, %ecx -; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: cmovnel %edi, %ebp +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: jne .LBB4_8 +; X86-NEXT: # %bb.1: # %_udiv-special-cases ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: je .LBB4_9 -; X86-NEXT: # %bb.5: # %udiv-bb1 +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: xorl $127, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: je .LBB4_8 +; X86-NEXT: # %bb.2: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: xorb $127, %al ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 144(%esp,%edi), %edx -; X86-NEXT: movl 148(%esp,%edi), %esi +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 140(%esp,%eax), %edx +; X86-NEXT: movl 144(%esp,%eax), %esi ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 140(%esp,%edi), %eax -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl 136(%esp,%eax), %ebp +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: shrl %esi ; X86-NEXT: shrl %cl, %esi ; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl 136(%esp,%edi), %esi +; X86-NEXT: movl 132(%esp,%eax), %eax ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl $1, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %ebp +; 
X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: addl $1, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $0, %esi -; X86-NEXT: jae .LBB4_2 +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: jae .LBB4_3 ; X86-NEXT: # %bb.6: -; X86-NEXT: xorl %ebp, %ebp +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: jmp .LBB4_7 -; X86-NEXT: .LBB4_1: -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: jmp .LBB4_9 -; X86-NEXT: .LBB4_2: # %udiv-preheader -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: .LBB4_3: # %udiv-preheader +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %bl, %ch ; X86-NEXT: andb $7, %ch -; X86-NEXT: movb %bl, %cl -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %cl, %ebx -; X86-NEXT: movl 100(%esp,%ebx), %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%ebx), %edi +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $15, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%eax), %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %ebp +; X86-NEXT: movl 92(%esp,%eax), %edx +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shrdl %cl, %esi, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl 88(%esp,%ebx), %esi -; X86-NEXT: movl 92(%esp,%ebx), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shrl %cl, %eax +; X86-NEXT: movl 84(%esp,%eax), %ebx +; X86-NEXT: movl 88(%esp,%eax), %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: shrl %cl, %edi ; X86-NEXT: notb %cl -; X86-NEXT: 
addl %edi, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NEXT: shrdl %cl, %ebx, %esi +; X86-NEXT: shrl %cl, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrdl %cl, %eax, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -428,173 +422,170 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB4_3: # %udiv-do-while +; X86-NEXT: .LBB4_4: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $1, %ebp, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %ebp +; X86-NEXT: shldl $1, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ebp +; X86-NEXT: shldl $1, %ebx, %eax ; X86-NEXT: shldl $1, %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shldl $1, %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NEXT: shldl $1, %ecx, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $1, %edx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: subl %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill +; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl $-1, %edi -; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl $-1, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %edi, %ecx ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: jne .LBB4_3 -; X86-NEXT: # %bb.4: +; X86-NEXT: jne .LBB4_4 +; X86-NEXT: # %bb.5: ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %edx, %edi ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: addl %esi, %esi -; X86-NEXT: orl %ebp, %esi -; X86-NEXT: .LBB4_9: # %udiv-end -; X86-NEXT: xorl 
%ebx, %edi -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %eax -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: subl %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %edi, 12(%ecx) -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi +; X86-NEXT: shldl $1, %ebp, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: shldl $1, %eax, %ebp +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: addl %eax, %eax +; X86-NEXT: orl %edi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: .LBB4_8: # %udiv-end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, (%edi) +; X86-NEXT: movl %ebp, 4(%edi) +; X86-NEXT: movl %esi, 8(%edi) +; X86-NEXT: movl %edx, 12(%edi) +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: imull %eax, %ecx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: imull %eax, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull %edx, %ebp -; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: mull %ecx ; X86-NEXT: addl %edx, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %ebp, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: addl $152, %esp +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: addl $148, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -608,24 +599,24 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %r15 -; X64-NEXT: movq %rcx, %r12 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rsi, %r12 +; X64-NEXT: movq %rdi, %r13 ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdx, 8(%r15) -; X64-NEXT: movq %rax, (%r15) -; X64-NEXT: imulq %rax, %r12 -; X64-NEXT: mulq %r13 -; X64-NEXT: addq %r12, %rdx -; X64-NEXT: imulq %r13, %rcx +; X64-NEXT: movq %rdx, 8(%rbx) +; X64-NEXT: movq %rax, (%rbx) +; X64-NEXT: imulq %rax, %r14 +; X64-NEXT: mulq %r15 +; X64-NEXT: addq %r14, %rdx +; X64-NEXT: imulq %r15, %rcx ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: subq %rax, %r14 -; X64-NEXT: sbbq %rcx, %rbx -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %rbx, %rdx +; X64-NEXT: subq %rax, %r13 +; X64-NEXT: sbbq %rcx, %r12 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r12, %rdx ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -714,37 +705,37 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: movsbl (%esp), %eax ; X86-NEXT: idivb {{[0-9]+}}(%esp) ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-NEXT: movd %edx, %xmm7 +; X86-NEXT: movd %edx, %xmm4 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm4 
+; X86-NEXT: movd %esi, %xmm2 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X86-NEXT: movd %edi, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; X86-NEXT: movd %ebx, %xmm5 +; X86-NEXT: movd %edi, %xmm5 +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; X86-NEXT: movd %ebx, %xmm4 ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: movd %ecx, %xmm6 ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X86-NEXT: movdqa %xmm2, %xmm4 -; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; X86-NEXT: movdqa %xmm4, (%ecx) +; X86-NEXT: movd %eax, %xmm5 +; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; X86-NEXT: movdqa %xmm5, %xmm2 +; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X86-NEXT: movdqa %xmm2, (%ecx) ; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: movdqa %xmm1, %xmm4 -; X86-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X86-NEXT: pmullw %xmm3, %xmm4 +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-NEXT: pmullw %xmm3, %xmm2 ; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X86-NEXT: pand %xmm3, %xmm4 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: pand %xmm3, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: pmullw %xmm2, %xmm1 +; X86-NEXT: pmullw %xmm5, %xmm1 ; X86-NEXT: pand %xmm3, %xmm1 -; X86-NEXT: packuswb %xmm4, %xmm1 +; X86-NEXT: packuswb %xmm2, %xmm1 ; X86-NEXT: psubb %xmm1, %xmm0 ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi @@ -817,47 +808,47 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X64-NEXT: movd %r8d, %xmm5 ; X64-NEXT: movd %r9d, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r10d, %xmm7 +; X64-NEXT: movd %r10d, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; X64-NEXT: movd %r11d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %ebx, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-NEXT: movd %ebx, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X64-NEXT: movd %ebp, %xmm6 +; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; X64-NEXT: movd %r14d, %xmm4 -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; X64-NEXT: movd %r15d, %xmm6 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: movd %r15d, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; X64-NEXT: movd %r12d, %xmm5 -; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; X64-NEXT: movd %r13d, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; X64-NEXT: movd %edx, %xmm6 +; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; X64-NEXT: movd %r13d, %xmm6 +; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; X64-NEXT: movd %edx, %xmm2 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: movd %eax, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; X64-NEXT: movdqa %xmm3, %xmm4 -; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; X64-NEXT: movd %eax, %xmm6 +; 
X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; X64-NEXT: movdqa %xmm6, %xmm2 +; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movdqa %xmm4, (%rax) -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X64-NEXT: pmullw %xmm2, %xmm4 -; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm2, %xmm4 +; X64-NEXT: movdqa %xmm2, (%rax) ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: pmullw %xmm3, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; X64-NEXT: pand %xmm3, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: pmullw %xmm3, %xmm1 -; X64-NEXT: pand %xmm2, %xmm1 -; X64-NEXT: packuswb %xmm4, %xmm1 +; X64-NEXT: pmullw %xmm6, %xmm1 +; X64-NEXT: pand %xmm3, %xmm1 +; X64-NEXT: packuswb %xmm2, %xmm1 ; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 @@ -1147,25 +1138,25 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __divdi3 -; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; X86-NEXT: movdqa %xmm3, (%esi) +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: movdqa %xmm0, %xmm1 -; X86-NEXT: psrlq $32, %xmm1 -; X86-NEXT: pmuludq %xmm3, %xmm1 -; X86-NEXT: movdqa %xmm3, %xmm2 +; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X86-NEXT: movdqa %xmm1, (%esi) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload +; X86-NEXT: movdqa %xmm3, %xmm0 +; X86-NEXT: psrlq $32, %xmm0 +; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa %xmm1, %xmm2 ; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 -; X86-NEXT: paddq %xmm1, %xmm2 +; X86-NEXT: pmuludq %xmm3, %xmm2 +; X86-NEXT: paddq %xmm0, %xmm2 ; X86-NEXT: psllq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm3 -; X86-NEXT: paddq %xmm2, %xmm3 +; X86-NEXT: pmuludq %xmm3, %xmm1 +; X86-NEXT: paddq %xmm2, %xmm1 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: psubq %xmm3, %xmm0 +; X86-NEXT: psubq %xmm1, %xmm0 ; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index ebb95f16a723c4..d818243c0ceab3 100644 --- 
a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -177,113 +177,110 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $132, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: subl $128, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sete %bl +; X86-NEXT: sete %cl ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al -; X86-NEXT: orb %bl, %al -; X86-NEXT: movb %al, (%esp) # 1-byte Spill -; X86-NEXT: bsrl %esi, %edx +; X86-NEXT: orb %cl, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %edi, %ecx +; X86-NEXT: bsrl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: bsrl %eax, %edx +; X86-NEXT: bsrl %ebp, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %ebp, %ebp -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl $31, %ebp -; X86-NEXT: addl $32, %ebp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: cmovnel %edx, %ebp -; X86-NEXT: addl $64, %ebp -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: cmovnel %ecx, %ebp -; X86-NEXT: bsrl %esi, %edx +; X86-NEXT: bsrl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: addl $32, %eax +; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: addl $64, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: bsrl %ebp, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: bsrl %ebx, %ecx +; X86-NEXT: bsrl %esi, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: bsrl %edi, %esi -; X86-NEXT: xorl $31, %esi -; X86-NEXT: bsrl {{[0-9]+}}(%esp), %edx +; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: addl $32, %edx +; X86-NEXT: bsrl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: addl $32, %esi ; X86-NEXT: testl %edi, %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: addl $64, %edx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl %edx, %ebp +; X86-NEXT: cmovnel %edx, %esi +; X86-NEXT: addl $64, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: orl %ebp, %edx +; X86-NEXT: cmovnel %ecx, %esi +; X86-NEXT: subl %esi, %eax ; X86-NEXT: movl $0, %esi 
; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movl $0, %edi -; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: movl $0, %ebx ; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: movl $127, %edx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebp, %edx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %eax, %edx ; X86-NEXT: movl $0, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: movl $0, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: movl $0, %edx ; X86-NEXT: sbbl %ebx, %edx ; X86-NEXT: setb %dl -; X86-NEXT: orb (%esp), %dl # 1-byte Folded Reload +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovnel %ecx, %esi -; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: cmovnel %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: cmovnel %ecx, %ebp +; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: jne .LBB4_8 ; X86-NEXT: # %bb.1: # %_udiv-special-cases -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: xorl $127, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ebx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %edi, %ebp +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: je .LBB4_8 ; X86-NEXT: # %bb.2: # %udiv-bb1 +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -292,7 +289,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: xorb $127, %al ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch @@ -300,253 +297,250 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: andb $15, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 124(%esp,%eax), %edx -; X86-NEXT: movl 128(%esp,%eax), %esi +; X86-NEXT: movl 120(%esp,%eax), %edx +; X86-NEXT: movl 124(%esp,%eax), %esi ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 120(%esp,%eax), %ebp -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl 116(%esp,%eax), %edi +; X86-NEXT: movl %edi, %esi ; X86-NEXT: shrl %esi ; X86-NEXT: shrl %cl, %esi ; 
X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: movl 116(%esp,%eax), %ebp +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl 112(%esp,%eax), %eax ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %ebp, %edx -; X86-NEXT: shll %cl, %ebp -; X86-NEXT: addl $1, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl $1, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: jae .LBB4_3 ; X86-NEXT: # %bb.6: -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: xorl %esi, %esi ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_3: # %udiv-preheader -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 80(%esp,%eax), %ebp -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%esp,%eax), %edi +; X86-NEXT: movl 76(%esp,%eax), %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %ebp, %ebx -; X86-NEXT: movl 68(%esp,%eax), %esi ; X86-NEXT: movl 72(%esp,%eax), %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: notb %cl -; X86-NEXT: addl %edi, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shrdl %cl, %ebp, %edi +; X86-NEXT: movl 64(%esp,%eax), %ebx +; X86-NEXT: movl 68(%esp,%eax), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: notb %cl +; X86-NEXT: addl %edx, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: orl %esi, %edx ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: shrdl %cl, 
%edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrdl %cl, %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl $-1, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_4: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: shldl $1, %ebx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ebx -; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: shldl $1, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: shldl $1, %ebx, %edx +; X86-NEXT: shldl $1, %ebp, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl $1, %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: orl %edi, %esi +; X86-NEXT: shldl $1, %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shldl $1, %ecx, %eax -; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shldl $1, %eax, %ecx -; X86-NEXT: orl %edi, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: andl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $-1, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl $-1, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %eax, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: jne .LBB4_4 ; X86-NEXT: # %bb.5: -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: shldl $1, %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: shldl $1, %ebp, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: addl %ebp, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: shldl $1, %edi, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: shldl $1, %ecx, %edi +; X86-NEXT: orl 
%esi, %edi +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: .LBB4_8: # %udiv-end -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, (%esi) +; X86-NEXT: movl %edi, 4(%esi) +; X86-NEXT: movl %edx, 8(%esi) +; X86-NEXT: movl %eax, 12(%esi) +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull %ecx, %esi -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: imull %edi, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull %ebp, %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: imull %esi, %ecx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebp -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %ebp, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl (%esp), %eax # 
4-byte Folded Reload -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: subl (%esp), %ebx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: addl $132, %esp +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: addl $128, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -560,24 +554,24 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %r15 -; X64-NEXT: movq %rcx, %r12 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rsi, %r12 +; X64-NEXT: movq %rdi, %r13 ; X64-NEXT: callq __udivti3@PLT ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdx, 8(%r15) -; X64-NEXT: movq %rax, (%r15) -; X64-NEXT: imulq %rax, %r12 -; X64-NEXT: mulq %r13 -; X64-NEXT: addq %r12, %rdx -; X64-NEXT: imulq %r13, %rcx +; X64-NEXT: movq %rdx, 8(%rbx) +; X64-NEXT: movq %rax, (%rbx) +; X64-NEXT: imulq %rax, %r14 +; X64-NEXT: mulq %r15 +; X64-NEXT: addq %r14, %rdx +; X64-NEXT: imulq %r15, %rcx ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: subq %rax, %r14 -; X64-NEXT: sbbq %rcx, %rbx -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %rbx, %rdx +; X64-NEXT: subq %rax, %r13 +; X64-NEXT: sbbq %rcx, %r12 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r12, %rdx ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -666,37 +660,37 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: movzbl (%esp), %eax ; X86-NEXT: divb {{[0-9]+}}(%esp) ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-NEXT: movd %edx, %xmm7 +; X86-NEXT: movd %edx, %xmm4 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm4 +; X86-NEXT: movd %esi, %xmm2 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X86-NEXT: movd %edi, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; X86-NEXT: movd %ebx, %xmm5 +; X86-NEXT: movd %edi, %xmm5 +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; X86-NEXT: movd %ebx, %xmm4 ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: movd %ecx, %xmm6 ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X86-NEXT: movdqa %xmm2, %xmm4 -; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; X86-NEXT: movdqa %xmm4, (%ecx) +; X86-NEXT: movd %eax, %xmm5 +; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; X86-NEXT: movdqa %xmm5, %xmm2 +; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X86-NEXT: movdqa %xmm2, (%ecx) ; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: movdqa %xmm1, %xmm4 -; X86-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X86-NEXT: pmullw %xmm3, %xmm4 +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-NEXT: pmullw %xmm3, %xmm2 ; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X86-NEXT: pand %xmm3, %xmm4 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: pand %xmm3, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: pmullw %xmm2, %xmm1 +; X86-NEXT: pmullw %xmm5, %xmm1 ; X86-NEXT: pand %xmm3, %xmm1 -; X86-NEXT: packuswb %xmm4, %xmm1 +; X86-NEXT: packuswb %xmm2, %xmm1 ; X86-NEXT: psubb %xmm1, %xmm0 ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi @@ -769,47 +763,47 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X64-NEXT: movd %r8d, %xmm5 ; X64-NEXT: movd %r9d, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r10d, %xmm7 +; X64-NEXT: movd %r10d, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; X64-NEXT: movd %r11d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %ebx, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-NEXT: movd %ebx, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X64-NEXT: movd %ebp, %xmm6 +; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; X64-NEXT: movd %r14d, %xmm4 -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; X64-NEXT: movd %r15d, %xmm6 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: movd %r15d, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; X64-NEXT: movd %r12d, %xmm5 -; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; X64-NEXT: movd %r13d, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; X64-NEXT: movd %edx, %xmm6 +; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; X64-NEXT: movd %r13d, %xmm6 +; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; X64-NEXT: movd %edx, %xmm2 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: movd %eax, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; X64-NEXT: movdqa %xmm3, %xmm4 -; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; X64-NEXT: movd %eax, %xmm6 +; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; X64-NEXT: movdqa %xmm6, %xmm2 +; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movdqa %xmm4, (%rax) -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X64-NEXT: pmullw %xmm2, %xmm4 
-; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm2, %xmm4 +; X64-NEXT: movdqa %xmm2, (%rax) ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: pmullw %xmm3, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; X64-NEXT: pand %xmm3, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: pmullw %xmm3, %xmm1 -; X64-NEXT: pand %xmm2, %xmm1 -; X64-NEXT: packuswb %xmm4, %xmm1 +; X64-NEXT: pmullw %xmm6, %xmm1 +; X64-NEXT: pand %xmm3, %xmm1 +; X64-NEXT: packuswb %xmm2, %xmm1 ; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 @@ -1099,25 +1093,25 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __udivdi3 -; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; X86-NEXT: movdqa %xmm3, (%esi) +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: movd %eax, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: movdqa %xmm0, %xmm1 -; X86-NEXT: psrlq $32, %xmm1 -; X86-NEXT: pmuludq %xmm3, %xmm1 -; X86-NEXT: movdqa %xmm3, %xmm2 +; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X86-NEXT: movdqa %xmm1, (%esi) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload +; X86-NEXT: movdqa %xmm3, %xmm0 +; X86-NEXT: psrlq $32, %xmm0 +; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa %xmm1, %xmm2 ; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 -; X86-NEXT: paddq %xmm1, %xmm2 +; X86-NEXT: pmuludq %xmm3, %xmm2 +; X86-NEXT: paddq %xmm0, %xmm2 ; X86-NEXT: psllq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm3 -; X86-NEXT: paddq %xmm2, %xmm3 +; X86-NEXT: pmuludq %xmm3, %xmm1 +; X86-NEXT: paddq %xmm2, %xmm1 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: psubq %xmm3, %xmm0 +; X86-NEXT: psubq %xmm1, %xmm0 ; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll index 3b015acb69bd2e..5bed15247d5979 100644 --- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll @@ -489,13 +489,13 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex ; AVX1: # %bb.0: ; AVX1-NEXT: subq $40, %rsp ; AVX1-NEXT: .cfi_def_cfa_offset 48 -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: 
vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-NEXT: addq $40, %rsp ; AVX1-NEXT: .cfi_def_cfa_offset 8 @@ -505,13 +505,13 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex ; AVX2: # %bb.0: ; AVX2-NEXT: subq $40, %rsp ; AVX2-NEXT: .cfi_def_cfa_offset 48 -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-NEXT: addq $40, %rsp ; AVX2-NEXT: .cfi_def_cfa_offset 8 diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll index 38a1de251a3d91..44de7470a9a683 100644 --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -462,13 +462,13 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: subl $8, %esp -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: shll $8, %ecx -; X86-BMI1-NEXT: movzbl %al, %edx -; X86-BMI1-NEXT: orl %ecx, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: shll $8, %eax +; X86-BMI1-NEXT: movzbl %cl, %edx +; X86-BMI1-NEXT: orl %eax, %edx ; X86-BMI1-NEXT: bextrl %edx, {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl %eax, (%esp) +; X86-BMI1-NEXT: movl %ecx, (%esp) ; X86-BMI1-NEXT: calll use32@PLT ; X86-BMI1-NEXT: movl %esi, %eax ; X86-BMI1-NEXT: addl $8, %esp @@ -1678,27 +1678,27 @@ define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; ; X86-BMI2-LABEL: bextr64_32_a0: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: pushl %ebx -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-BMI2-NEXT: pushl %esi +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: shrdl %cl, %eax, %edx +; X86-BMI2-NEXT: shrdl %cl, %eax, %esi ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB14_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %eax, %edx +; X86-BMI2-NEXT: shrxl %ecx, %eax, %esi ; X86-BMI2-NEXT: .LBB14_2: ; X86-BMI2-NEXT: xorl %eax, %eax -; X86-BMI2-NEXT: testb $32, %bl +; X86-BMI2-NEXT: testb $32, %dl ; X86-BMI2-NEXT: jne .LBB14_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl $1, %eax -; X86-BMI2-NEXT: shlxl %ebx, %eax, %eax +; X86-BMI2-NEXT: shlxl %edx, %eax, %eax ; X86-BMI2-NEXT: .LBB14_4: ; X86-BMI2-NEXT: decl %eax -; X86-BMI2-NEXT: andl %edx, %eax -; X86-BMI2-NEXT: popl %ebx +; X86-BMI2-NEXT: andl %esi, %eax +; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: 
bextr64_32_a0: @@ -1769,18 +1769,18 @@ define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %edi, %esi +; X86-BMI1-NEXT: shrl %cl, %esi +; X86-BMI1-NEXT: shrdl %cl, %edi, %edx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB15_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: .LBB15_2: ; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: bextrl %eax, %esi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -2015,18 +2015,18 @@ define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %edi, %esi +; X86-BMI1-NEXT: shrl %cl, %esi +; X86-BMI1-NEXT: shrdl %cl, %edi, %edx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB17_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: .LBB17_2: ; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: bextrl %eax, %esi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -2150,27 +2150,27 @@ define i32 @bextr64_32_a3(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; ; X86-BMI2-LABEL: bextr64_32_a3: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: pushl %ebx -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-BMI2-NEXT: pushl %esi +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: shrdl %cl, %eax, %edx +; X86-BMI2-NEXT: shrdl %cl, %eax, %esi ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB18_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %eax, %edx +; X86-BMI2-NEXT: shrxl %ecx, %eax, %esi ; X86-BMI2-NEXT: .LBB18_2: ; X86-BMI2-NEXT: xorl %eax, %eax -; X86-BMI2-NEXT: testb $32, %bl +; X86-BMI2-NEXT: testb $32, %dl ; X86-BMI2-NEXT: jne .LBB18_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl $1, %eax -; X86-BMI2-NEXT: shlxl %ebx, %eax, %eax +; X86-BMI2-NEXT: shlxl %edx, %eax, %eax ; X86-BMI2-NEXT: .LBB18_4: ; X86-BMI2-NEXT: decl %eax -; X86-BMI2-NEXT: andl %edx, %eax -; X86-BMI2-NEXT: popl %ebx +; X86-BMI2-NEXT: andl %esi, %eax +; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: bextr64_32_a3: @@ -2577,13 +2577,13 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: subl $8, %esp -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: shll $8, %ecx -; X86-BMI1-NEXT: movzbl %al, %edx -; 
X86-BMI1-NEXT: orl %ecx, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: shll $8, %eax +; X86-BMI1-NEXT: movzbl %cl, %edx +; X86-BMI1-NEXT: orl %eax, %edx ; X86-BMI1-NEXT: bextrl %edx, {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl %eax, (%esp) +; X86-BMI1-NEXT: movl %ecx, (%esp) ; X86-BMI1-NEXT: calll use32@PLT ; X86-BMI1-NEXT: movl %esi, %eax ; X86-BMI1-NEXT: addl $8, %esp @@ -2713,18 +2713,18 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: .LBB25_2: -; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl $-1, %ebx +; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx +; X86-BMI1-NEXT: shll %cl, %edi ; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB25_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %ebx +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB25_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %ebx, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: popl %ebx @@ -2851,18 +2851,18 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: .LBB26_2: -; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl $-1, %ebx +; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx +; X86-BMI1-NEXT: shll %cl, %edi ; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB26_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %ebx +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB26_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %ebx, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: popl %ebx @@ -2995,18 +2995,18 @@ define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: .LBB27_2: -; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl $-1, %ebx +; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx +; X86-BMI1-NEXT: shll %cl, %edi ; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB27_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %ebx +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB27_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %ebx, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: popl %ebx @@ -3138,18 +3138,18 @@ define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: .LBB28_2: -; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl $-1, %ebx +; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx +; X86-BMI1-NEXT: shll %cl, %edi ; X86-BMI1-NEXT: testb $32, %al ; 
X86-BMI1-NEXT: je .LBB28_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %ebx +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB28_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %ebx, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: popl %ebx @@ -3283,18 +3283,18 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: .LBB29_2: -; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl $-1, %ebx +; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx +; X86-BMI1-NEXT: shll %cl, %edi ; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB29_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %ebx +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB29_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %ebx, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: popl %ebx @@ -3711,18 +3711,18 @@ define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %edi, %esi +; X86-BMI1-NEXT: shrl %cl, %esi +; X86-BMI1-NEXT: shrdl %cl, %edi, %edx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB32_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: .LBB32_2: ; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: bextrl %eax, %esi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -3813,18 +3813,18 @@ define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %edi, %esi +; X86-BMI1-NEXT: shrl %cl, %esi +; X86-BMI1-NEXT: shrdl %cl, %edi, %edx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB33_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: .LBB33_2: ; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: bextrl %eax, %esi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -6236,18 +6236,18 @@ define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %edi, %esi +; X86-BMI1-NEXT: shrl %cl, %esi +; X86-BMI1-NEXT: shrdl %cl, %edi, %edx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB48_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: .LBB48_2: ; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: bextrl %eax, %esi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -6335,18 +6335,18 @@ define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %edi, %esi +; X86-BMI1-NEXT: shrl %cl, %esi +; X86-BMI1-NEXT: shrdl %cl, %edi, %edx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB49_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: .LBB49_2: ; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: bextrl %eax, %esi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -6821,13 +6821,13 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: subl $8, %esp -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: shll $8, %ecx -; X86-BMI1-NEXT: movzbl %al, %edx -; X86-BMI1-NEXT: orl %ecx, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: shll $8, %eax +; X86-BMI1-NEXT: movzbl %cl, %edx +; X86-BMI1-NEXT: orl %eax, %edx ; X86-BMI1-NEXT: bextrl %edx, {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl %eax, (%esp) +; X86-BMI1-NEXT: movl %ecx, (%esp) ; X86-BMI1-NEXT: calll use32@PLT ; X86-BMI1-NEXT: movl %esi, %eax ; X86-BMI1-NEXT: addl $8, %esp @@ -7987,18 +7987,18 @@ define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %edi, %esi +; X86-BMI1-NEXT: shrl %cl, %esi +; X86-BMI1-NEXT: shrdl %cl, %edi, %edx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB62_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: .LBB62_2: ; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: bextrl %eax, %esi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll index 03de1533e1d64c..dc12ab729defb1 100644 --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -310,11 
+310,11 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) # ; FMACALL64: ## %bb.0: ## %entry ; FMACALL64-NEXT: subq $88, %rsp ## encoding: [0x48,0x83,0xec,0x58] ; FMACALL64-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x30] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x10] ; FMACALL64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x10] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x20] ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x30] ; FMACALL64-NEXT: shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff] ; FMACALL64-NEXT: ## xmm0 = xmm0[3,3,3,3] ; FMACALL64-NEXT: shufps $255, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xff] @@ -326,15 +326,15 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) # ; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] ; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0] ; FMACALL64-NEXT: ## xmm0 = xmm0[1,1] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] ; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9] ; FMACALL64-NEXT: ## xmm1 = xmm1[1,1] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x30] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x10] ; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2] ; FMACALL64-NEXT: ## xmm2 = xmm2[1,1] ; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] @@ -345,25 +345,25 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) # ; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x30] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x10] ; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x40] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] ; FMACALL64-NEXT: shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55] ; FMACALL64-NEXT: ## xmm0 = xmm0[1,1,1,1] ; FMACALL64-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] ; FMACALL64-NEXT: shufps $85, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0x55] ; FMACALL64-NEXT: ## xmm1 = xmm1[1,1,1,1] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x30] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x10] ; FMACALL64-NEXT: shufps $85, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0x55] ; FMACALL64-NEXT: ## xmm2 = xmm2[1,1,1,1] ; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] @@ -483,16 +483,16 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) # ; FMACALL64-NEXT: subq $136, %rsp ## encoding: [0x48,0x81,0xec,0x88,0x00,0x00,0x00] ; FMACALL64-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x6c,0x24,0x50] -; FMACALL64-NEXT: movaps %xmm4, (%rsp) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x24,0x24] +; FMACALL64-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x64,0x24,0x60] ; FMACALL64-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x5c,0x24,0x40] ; FMACALL64-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x60] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x10] ; FMACALL64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x30] -; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] ; FMACALL64-NEXT: shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff] ; FMACALL64-NEXT: ## xmm0 = xmm0[3,3,3,3] ; FMACALL64-NEXT: movaps %xmm2, %xmm1 ## encoding: [0x0f,0x28,0xca] @@ -505,16 +505,16 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) # ; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] -; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x04,0x24] ; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0] ; FMACALL64-NEXT: ## xmm0 = xmm0[1,1] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x60] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] ; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9] ; FMACALL64-NEXT: ## xmm1 = xmm1[1,1] -; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x60] ; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2] ; FMACALL64-NEXT: ## xmm2 = xmm2[1,1] ; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] @@ -524,26 +524,26 @@ define <8 x float> 
@test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) # ; FMACALL64-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x70] -; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x04,0x24] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x60] -; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x60] ; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] -; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x04,0x24] ; FMACALL64-NEXT: shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55] ; FMACALL64-NEXT: ## xmm0 = xmm0[1,1,1,1] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x60] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] ; FMACALL64-NEXT: shufps $85, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0x55] ; FMACALL64-NEXT: ## xmm1 = xmm1[1,1,1,1] -; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x60] ; FMACALL64-NEXT: shufps $85, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0x55] ; FMACALL64-NEXT: ## xmm2 = xmm2[1,1,1,1] ; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] @@ -836,7 +836,7 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL64-NEXT: movaps %xmm5, (%rsp) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x2c,0x24] ; FMACALL64-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x64,0x24,0x10] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x64,0x24,0x50] ; FMACALL64-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x5c,0x24,0x70] ; FMACALL64-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill @@ -844,7 +844,7 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x30] ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x50] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x10] ; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] ; FMACALL64-NEXT: shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff] ; FMACALL64-NEXT: 
## xmm0 = xmm0[3,3,3,3] @@ -858,11 +858,11 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x60] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] ; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0] ; FMACALL64-NEXT: ## xmm0 = xmm0[1,1] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x50] ; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9] ; FMACALL64-NEXT: ## xmm1 = xmm1[1,1] ; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] @@ -876,20 +876,20 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x84,0x24,0x90,0x00,0x00,0x00] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x50] ; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] ; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x60] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] ; FMACALL64-NEXT: shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55] ; FMACALL64-NEXT: ## xmm0 = xmm0[1,1,1,1] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x50] ; FMACALL64-NEXT: shufps $85, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0x55] ; FMACALL64-NEXT: ## xmm1 = xmm1[1,1,1,1] ; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] @@ -1126,7 +1126,7 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## imm = 0x1C0 ; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm4 ## encoding: [0xc5,0xf8,0x28,0x65,0x38] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x4c,0x24,0x60] @@ -1134,7 +1134,7 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; 
FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] @@ -1226,13 +1226,13 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60] @@ -1281,10 +1281,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1294,10 +1294,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1307,10 +1307,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1320,10 +1320,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xd0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll index 5bb5d1e9c17ec8..36f93bfcbd9fec 100644 --- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -19,7 +19,6 @@ declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>) define float @test_fmaximum(float %x, float %y) 
nounwind { ; SSE2-LABEL: test_fmaximum: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -27,17 +26,18 @@ define float @test_fmaximum(float %x, float %y) nounwind { ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: .LBB0_2: -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpunordss %xmm3, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: cmpunordss %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm4 ; SSE2-NEXT: andps %xmm3, %xmm4 ; SSE2-NEXT: js .LBB0_4 ; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: .LBB0_4: ; SSE2-NEXT: maxss %xmm1, %xmm3 -; SSE2-NEXT: andnps %xmm3, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: andnps %xmm3, %xmm2 +; SSE2-NEXT: orps %xmm4, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximum: @@ -75,18 +75,18 @@ define float @test_fmaximum(float %x, float %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-NEXT: vmovd %xmm2, %eax +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm1, %eax ; X86-NEXT: testl %eax, %eax ; X86-NEXT: js .LBB0_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: vmovdqa %xmm2, %xmm1 +; X86-NEXT: vmovdqa %xmm1, %xmm2 ; X86-NEXT: jmp .LBB0_3 ; X86-NEXT: .LBB0_1: -; X86-NEXT: vmovdqa %xmm0, %xmm1 -; X86-NEXT: vmovdqa %xmm2, %xmm0 +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB0_3: -; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 ; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) @@ -904,18 +904,18 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind { ; ; AVX1-LABEL: test_fminimum_combine_cmps: ; AVX1: # %bb.0: -; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: js .LBB19_1 ; AVX1-NEXT: # %bb.2: -; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovaps %xmm1, %xmm2 ; AVX1-NEXT: jmp .LBB19_3 ; AVX1-NEXT: .LBB19_1: -; AVX1-NEXT: vmovaps %xmm0, %xmm1 -; AVX1-NEXT: vmovaps %xmm2, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, %xmm2 +; AVX1-NEXT: vmovaps %xmm1, %xmm0 ; AVX1-NEXT: .LBB19_3: -; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm1 ; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq @@ -952,18 +952,18 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: vdivss %xmm0, %xmm1, %xmm2 +; X86-NEXT: vdivss %xmm0, %xmm1, %xmm1 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: testl %eax, %eax ; X86-NEXT: js .LBB19_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: vmovaps %xmm2, %xmm1 +; X86-NEXT: vmovaps %xmm1, %xmm2 ; X86-NEXT: jmp .LBB19_3 ; X86-NEXT: .LBB19_1: -; X86-NEXT: vmovaps %xmm0, %xmm1 -; X86-NEXT: vmovaps %xmm2, %xmm0 +; X86-NEXT: vmovaps %xmm0, %xmm2 +; X86-NEXT: vmovaps %xmm1, %xmm0 ; X86-NEXT: .LBB19_3: -; X86-NEXT: vminss %xmm1, %xmm0, %xmm1 +; X86-NEXT: vminss %xmm2, %xmm0, %xmm1 ; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; 
X86-NEXT: vmovss %xmm0, (%esp) diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll index 5ea2964057588f..3f48dfa47aadd6 100644 --- a/llvm/test/CodeGen/X86/fold-tied-op.ll +++ b/llvm/test/CodeGen/X86/fold-tied-op.ll @@ -20,95 +20,102 @@ define i64 @fn1() #0 { ; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $12, %esp +; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: .cfi_offset %esi, -20 ; CHECK-NEXT: .cfi_offset %edi, -16 ; CHECK-NEXT: .cfi_offset %ebx, -12 -; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D -; CHECK-NEXT: movl $668265295, %esi # imm = 0x27D4EB4F +; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D +; CHECK-NEXT: movl $668265295, %ecx # imm = 0x27D4EB4F ; CHECK-NEXT: movl a, %edi ; CHECK-NEXT: cmpl $0, (%edi) ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl 8(%edi), %ecx -; CHECK-NEXT: movl 12(%edi), %edx -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: shldl $1, %ecx, %eax -; CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: leal (%ecx,%ecx), %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 16(%edi), %ebx -; CHECK-NEXT: movl 20(%edi), %edx -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: shldl $2, %ebx, %edx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: shldl $31, %ebx, %ecx -; CHECK-NEXT: shll $2, %ebx -; CHECK-NEXT: orl %ecx, %ebx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: shrl %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; CHECK-NEXT: movl 8(%edi), %eax +; CHECK-NEXT: movl 12(%edi), %esi +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: shldl $1, %eax, %edx +; CHECK-NEXT: orl %esi, %edx +; CHECK-NEXT: leal (%eax,%eax), %esi +; CHECK-NEXT: orl %eax, %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 16(%edi), %eax +; CHECK-NEXT: movl 20(%edi), %esi +; CHECK-NEXT: movl $-1028477379, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; CHECK-NEXT: # imm = 0xC2B2AE3D +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: shldl $2, %eax, %ebx ; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: adcl %eax, %ecx -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 24(%edi), %eax +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: shldl $31, %eax, %ebx +; CHECK-NEXT: shll $2, %eax +; CHECK-NEXT: orl %ebx, %eax +; CHECK-NEXT: shrl %esi +; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D -; CHECK-NEXT: imull %eax, %ebx -; CHECK-NEXT: mull %esi -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: addl %ebx, %edx -; CHECK-NEXT: movl 28(%edi), %edi -; CHECK-NEXT: imull %edi, %esi -; CHECK-NEXT: addl %edx, %esi +; CHECK-NEXT: adcl %edx, %esi +; CHECK-NEXT: movl 24(%edi), %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: imull %edx, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: mull %ecx +; CHECK-NEXT: movl %eax, %ebx +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded 
Reload +; CHECK-NEXT: movl 28(%edi), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: imull %eax, %ecx +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: movl $1336530590, %edx # imm = 0x4FA9D69E -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: mull %edx +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: imull $-2056954758, %edi, %eax # imm = 0x85655C7A +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: imull $1336530590, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-NEXT: # imm = 0x4FA9D69E +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: shrdl $3, %ecx, %ebx +; CHECK-NEXT: sarl $3, %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; CHECK-NEXT: movl $-66860409, %edx # imm = 0xFC03CA87 ; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: mull %edx -; CHECK-NEXT: imull $-2056954758, %ebx, %ebx # imm = 0x85655C7A -; CHECK-NEXT: addl %edx, %ebx -; CHECK-NEXT: imull $1336530590, %edi, %edx # imm = 0x4FA9D69E -; CHECK-NEXT: addl %ebx, %edx -; CHECK-NEXT: shrdl $3, %esi, %ecx -; CHECK-NEXT: sarl $3, %esi -; CHECK-NEXT: orl %edx, %esi -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: movl $-66860409, %ebx # imm = 0xFC03CA87 -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: mull %ebx ; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: imull $326129324, %ecx, %eax # imm = 0x137056AC +; CHECK-NEXT: imull $326129324, %ebx, %eax # imm = 0x137056AC ; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: imull $-66860409, %esi, %ecx # imm = 0xFC03CA87 +; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87 ; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; CHECK-NEXT: xorl %esi, %ecx ; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; CHECK-NEXT: movl %edi, b ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: mull %ebx +; CHECK-NEXT: movl $-66860409, %edx # imm = 0xFC03CA87 +; CHECK-NEXT: mull %edx ; CHECK-NEXT: imull $326129324, %edi, %esi # imm = 0x137056AC ; CHECK-NEXT: addl %edx, %esi ; CHECK-NEXT: movl %ecx, b+4 ; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87 ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.else -; CHECK-NEXT: xorl b+4, %ecx -; CHECK-NEXT: xorl b, %esi +; CHECK-NEXT: xorl b+4, %ebx +; CHECK-NEXT: xorl b, %ecx ; CHECK-NEXT: movl $1419758215, %edx # imm = 0x549FCA87 -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: mull %edx -; CHECK-NEXT: imull $93298681, %esi, %esi # imm = 0x58F9FF9 +; CHECK-NEXT: imull $93298681, %ecx, %esi # imm = 0x58F9FF9 ; CHECK-NEXT: addl %edx, %esi -; CHECK-NEXT: imull $1419758215, %ecx, %ecx # imm = 0x549FCA87 +; CHECK-NEXT: imull $1419758215, %ebx, %ecx # imm = 0x549FCA87 ; CHECK-NEXT: .LBB0_3: # %if.end ; CHECK-NEXT: addl %esi, %ecx ; CHECK-NEXT: addl $-1028477341, %eax # imm = 0xC2B2AE63 ; CHECK-NEXT: adcl $-2048144777, %ecx # imm = 0x85EBCA77 ; CHECK-NEXT: movl %eax, b ; CHECK-NEXT: movl %ecx, b+4 -; CHECK-NEXT: addl $12, %esp +; CHECK-NEXT: addl $16, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi ; CHECK-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll index 2253d7cbaf8b67..19245ec070849f 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll @@ -797,14 +797,14 @@ define <2 x double> 
@f24(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 { ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] ; NOFMA-NEXT: xorps %xmm3, %xmm0 -; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: xorps %xmm3, %xmm2 -; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; NOFMA-NEXT: callq fma@PLT ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] -; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; NOFMA-NEXT: pshufd $238, (%rsp), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] @@ -1033,14 +1033,14 @@ define <2 x double> @f28(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 { ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] ; NOFMA-NEXT: xorps %xmm3, %xmm0 -; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: xorps %xmm3, %xmm2 -; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; NOFMA-NEXT: callq fma@PLT ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] -; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; NOFMA-NEXT: pshufd $238, (%rsp), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll index f4689b2ab6bb94..5348029f1755a4 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1414,14 +1414,14 @@ define i64 @f20u64(double %x) #0 { ; ; SSE-LABEL: f20u64: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: comisd %xmm2, %xmm0 -; SSE-NEXT: xorpd %xmm1, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: comisd %xmm1, %xmm0 +; SSE-NEXT: xorpd %xmm2, %xmm2 ; SSE-NEXT: jb .LBB25_2 ; SSE-NEXT: # %bb.1: # %entry -; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm2 ; SSE-NEXT: .LBB25_2: # %entry -; SSE-NEXT: subsd %xmm1, %xmm0 +; SSE-NEXT: subsd %xmm2, %xmm0 ; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: setae %al ; SSE-NEXT: movzbl %al, %eax diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll index fac14d8f14e8a1..f7a1ee2da7f5a8 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll @@ -347,14 +347,14 @@ define i64 @fptoui_f16toi64(half %x) #0 { ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rax ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero -; SSE2-NEXT: comiss %xmm2, %xmm0 -; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: comiss %xmm1, %xmm0 +; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: jb .LBB9_2 ; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: .LBB9_2: -; SSE2-NEXT: subss %xmm1, %xmm0 +; SSE2-NEXT: subss %xmm2, %xmm0 ; SSE2-NEXT: cvttss2si %xmm0, %rcx ; SSE2-NEXT: setae %al ; SSE2-NEXT: movzbl %al, %eax diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll index 25a946465ff3fa..988042770bd196 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll @@ -572,14 +572,14 @@ define i64 @fptoui_f32toi64(float %x) #0 { ; ; SSE-X64-LABEL: fptoui_f32toi64: ; SSE-X64: # %bb.0: -; SSE-X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-X64-NEXT: comiss %xmm2, %xmm0 -; SSE-X64-NEXT: xorps %xmm1, %xmm1 +; SSE-X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-X64-NEXT: comiss %xmm1, %xmm0 +; SSE-X64-NEXT: xorps %xmm2, %xmm2 ; SSE-X64-NEXT: jb .LBB9_2 ; SSE-X64-NEXT: # %bb.1: -; SSE-X64-NEXT: movaps %xmm2, %xmm1 +; SSE-X64-NEXT: movaps %xmm1, %xmm2 ; SSE-X64-NEXT: .LBB9_2: -; SSE-X64-NEXT: subss %xmm1, %xmm0 +; SSE-X64-NEXT: subss %xmm2, %xmm0 ; SSE-X64-NEXT: cvttss2si %xmm0, %rcx ; SSE-X64-NEXT: setae %al ; SSE-X64-NEXT: movzbl %al, %eax @@ -1212,14 +1212,14 @@ define i64 @fptoui_f64toi64(double %x) #0 { ; ; SSE-X64-LABEL: fptoui_f64toi64: ; SSE-X64: # %bb.0: -; SSE-X64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-X64-NEXT: comisd %xmm2, %xmm0 -; SSE-X64-NEXT: xorpd %xmm1, %xmm1 +; SSE-X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-X64-NEXT: comisd %xmm1, %xmm0 +; SSE-X64-NEXT: xorpd %xmm2, %xmm2 ; SSE-X64-NEXT: jb .LBB18_2 ; SSE-X64-NEXT: # %bb.1: -; SSE-X64-NEXT: movapd %xmm2, %xmm1 +; SSE-X64-NEXT: movapd %xmm1, %xmm2 ; SSE-X64-NEXT: .LBB18_2: -; SSE-X64-NEXT: subsd %xmm1, %xmm0 +; SSE-X64-NEXT: subsd %xmm2, %xmm0 ; SSE-X64-NEXT: cvttsd2si %xmm0, %rcx ; SSE-X64-NEXT: setae %al ; SSE-X64-NEXT: movzbl %al, %eax diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll index 5e4f690fcdbbf6..33b2109e962383 100644 --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -154,14 +154,14 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm4 +; CHECK-NEXT: movq %rax, %xmm3 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm4, %xmm1 +; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; CHECK-NEXT: pxor %xmm6, %xmm6 @@ -173,20 +173,20 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-NEXT: pand %xmm5, %xmm9 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; CHECK-NEXT: por %xmm9, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm4 -; CHECK-NEXT: pandn %xmm3, %xmm1 -; CHECK-NEXT: por 
%xmm4, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm4 -; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pandn %xmm4, %xmm1 +; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pxor %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] +; CHECK-NEXT: pand %xmm5, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm5 +; CHECK-NEXT: por %xmm3, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm3, %xmm5 +; CHECK-NEXT: pandn %xmm4, %xmm5 ; CHECK-NEXT: por %xmm2, %xmm5 ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: movdqa %xmm5, %xmm3 @@ -317,14 +317,14 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm4 +; CHECK-NEXT: movq %rax, %xmm3 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm4, %xmm1 +; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; CHECK-NEXT: pxor %xmm6, %xmm6 @@ -336,20 +336,20 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: pand %xmm5, %xmm9 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; CHECK-NEXT: por %xmm9, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm4 -; CHECK-NEXT: pandn %xmm3, %xmm1 -; CHECK-NEXT: por %xmm4, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm4 -; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pandn %xmm4, %xmm1 +; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pxor %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] +; CHECK-NEXT: pand %xmm5, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm5 +; CHECK-NEXT: por %xmm3, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm3, %xmm5 +; CHECK-NEXT: pandn %xmm4, %xmm5 ; CHECK-NEXT: por %xmm2, %xmm5 ; CHECK-NEXT: movdqa %xmm5, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 @@ -446,10 +446,9 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: por %xmm7, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: movdqa 
%xmm4, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 @@ -650,10 +649,9 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: por %xmm7, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm3 @@ -2012,10 +2010,9 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: por %xmm7, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] @@ -2213,10 +2210,9 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: por %xmm7, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm3 diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll index 76e15ca0bf919b..f6b7f8e04d970b 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll @@ -790,30 +790,30 @@ define i100 @test_signed_i100_f32(float %f) nounwind { ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: xorl %ebp, %ebp ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $-8, %ebx +; X86-SSE-NEXT: movl $-8, %edi ; X86-SSE-NEXT: movl $0, %ecx ; X86-SSE-NEXT: movl $0, %edx -; X86-SSE-NEXT: movl $0, %edi +; X86-SSE-NEXT: movl $0, %ebx ; X86-SSE-NEXT: jb .LBB8_2 ; X86-SSE-NEXT: # %bb.1: -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SSE-NEXT: .LBB8_2: ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movl $-1, %eax -; X86-SSE-NEXT: cmoval %eax, %edi +; X86-SSE-NEXT: cmoval %eax, %ebx ; X86-SSE-NEXT: cmoval %eax, %edx ; X86-SSE-NEXT: cmoval %eax, %ecx ; X86-SSE-NEXT: movl $7, %eax -; X86-SSE-NEXT: cmovbel %ebx, %eax +; X86-SSE-NEXT: cmovbel %edi, %eax ; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 ; X86-SSE-NEXT: cmovpl %ebp, %eax ; X86-SSE-NEXT: cmovpl %ebp, %ecx ; X86-SSE-NEXT: cmovpl %ebp, %edx -; X86-SSE-NEXT: cmovpl %ebp, %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) +; X86-SSE-NEXT: cmovpl %ebp, %ebx +; X86-SSE-NEXT: movl %ebx, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) ; X86-SSE-NEXT: andl $15, %eax @@ -971,21 +971,21 @@ define i128 @test_signed_i128_f32(float %f) nounwind { ; X86-SSE-NEXT: cmovbl %ecx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: cmovbl %ecx, %edi -; X86-SSE-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 -; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebp +; X86-SSE-NEXT: movl 
$-2147483648, %ebx # imm = 0x80000000 +; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebx ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-SSE-NEXT: cmovbel %ebp, %ebx -; X86-SSE-NEXT: movl $-1, %ebp -; X86-SSE-NEXT: cmoval %ebp, %edi -; X86-SSE-NEXT: cmoval %ebp, %edx -; X86-SSE-NEXT: cmoval %ebp, %eax +; X86-SSE-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF +; X86-SSE-NEXT: cmovbel %ebx, %ebp +; X86-SSE-NEXT: movl $-1, %ebx +; X86-SSE-NEXT: cmoval %ebx, %edi +; X86-SSE-NEXT: cmoval %ebx, %edx +; X86-SSE-NEXT: cmoval %ebx, %eax ; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 ; X86-SSE-NEXT: cmovpl %ecx, %eax ; X86-SSE-NEXT: cmovpl %ecx, %edx ; X86-SSE-NEXT: cmovpl %ecx, %edi -; X86-SSE-NEXT: cmovpl %ecx, %ebx -; X86-SSE-NEXT: movl %ebx, 12(%esi) +; X86-SSE-NEXT: cmovpl %ecx, %ebp +; X86-SSE-NEXT: movl %ebp, 12(%esi) ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) @@ -1804,30 +1804,30 @@ define i100 @test_signed_i100_f64(double %f) nounwind { ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE-NEXT: xorl %ebp, %ebp ; X86-SSE-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $-8, %ebx +; X86-SSE-NEXT: movl $-8, %edi ; X86-SSE-NEXT: movl $0, %ecx ; X86-SSE-NEXT: movl $0, %edx -; X86-SSE-NEXT: movl $0, %edi +; X86-SSE-NEXT: movl $0, %ebx ; X86-SSE-NEXT: jb .LBB18_2 ; X86-SSE-NEXT: # %bb.1: -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SSE-NEXT: .LBB18_2: ; X86-SSE-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movl $-1, %eax -; X86-SSE-NEXT: cmoval %eax, %edi +; X86-SSE-NEXT: cmoval %eax, %ebx ; X86-SSE-NEXT: cmoval %eax, %edx ; X86-SSE-NEXT: cmoval %eax, %ecx ; X86-SSE-NEXT: movl $7, %eax -; X86-SSE-NEXT: cmovbel %ebx, %eax +; X86-SSE-NEXT: cmovbel %edi, %eax ; X86-SSE-NEXT: ucomisd %xmm0, %xmm0 ; X86-SSE-NEXT: cmovpl %ebp, %eax ; X86-SSE-NEXT: cmovpl %ebp, %ecx ; X86-SSE-NEXT: cmovpl %ebp, %edx -; X86-SSE-NEXT: cmovpl %ebp, %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) +; X86-SSE-NEXT: cmovpl %ebp, %ebx +; X86-SSE-NEXT: movl %ebx, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) ; X86-SSE-NEXT: andl $15, %eax @@ -1985,21 +1985,21 @@ define i128 @test_signed_i128_f64(double %f) nounwind { ; X86-SSE-NEXT: cmovbl %ecx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: cmovbl %ecx, %edi -; X86-SSE-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 -; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebp +; X86-SSE-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 +; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebx ; X86-SSE-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-SSE-NEXT: cmovbel %ebp, %ebx -; X86-SSE-NEXT: movl $-1, %ebp -; X86-SSE-NEXT: cmoval %ebp, %edi -; X86-SSE-NEXT: cmoval %ebp, %edx -; X86-SSE-NEXT: cmoval %ebp, %eax +; X86-SSE-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF +; X86-SSE-NEXT: cmovbel %ebx, %ebp +; X86-SSE-NEXT: movl $-1, %ebx +; X86-SSE-NEXT: cmoval %ebx, %edi +; X86-SSE-NEXT: cmoval %ebx, %edx +; X86-SSE-NEXT: cmoval %ebx, %eax ; X86-SSE-NEXT: ucomisd %xmm0, %xmm0 ; X86-SSE-NEXT: cmovpl %ecx, %eax ; X86-SSE-NEXT: cmovpl %ecx, %edx ; X86-SSE-NEXT: cmovpl %ecx, %edi -; X86-SSE-NEXT: cmovpl %ecx, %ebx 
-; X86-SSE-NEXT: movl %ebx, 12(%esi) +; X86-SSE-NEXT: cmovpl %ecx, %ebp +; X86-SSE-NEXT: movl %ebp, 12(%esi) ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) @@ -2991,30 +2991,30 @@ define i100 @test_signed_i100_f16(half %f) nounwind { ; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: xorl %ebp, %ebp ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $-8, %ebx +; X86-SSE-NEXT: movl $-8, %edi ; X86-SSE-NEXT: movl $0, %ecx ; X86-SSE-NEXT: movl $0, %edx -; X86-SSE-NEXT: movl $0, %edi +; X86-SSE-NEXT: movl $0, %ebx ; X86-SSE-NEXT: jb .LBB28_2 ; X86-SSE-NEXT: # %bb.1: -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SSE-NEXT: .LBB28_2: ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movl $-1, %eax -; X86-SSE-NEXT: cmoval %eax, %edi +; X86-SSE-NEXT: cmoval %eax, %ebx ; X86-SSE-NEXT: cmoval %eax, %edx ; X86-SSE-NEXT: cmoval %eax, %ecx ; X86-SSE-NEXT: movl $7, %eax -; X86-SSE-NEXT: cmovbel %ebx, %eax +; X86-SSE-NEXT: cmovbel %edi, %eax ; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 ; X86-SSE-NEXT: cmovpl %ebp, %eax ; X86-SSE-NEXT: cmovpl %ebp, %ecx ; X86-SSE-NEXT: cmovpl %ebp, %edx -; X86-SSE-NEXT: cmovpl %ebp, %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) +; X86-SSE-NEXT: cmovpl %ebp, %ebx +; X86-SSE-NEXT: movl %ebx, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) ; X86-SSE-NEXT: andl $15, %eax @@ -3182,21 +3182,21 @@ define i128 @test_signed_i128_f16(half %f) nounwind { ; X86-SSE-NEXT: cmovbl %ecx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: cmovbl %ecx, %edi -; X86-SSE-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 -; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebp +; X86-SSE-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 +; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebx ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-SSE-NEXT: cmovbel %ebp, %ebx -; X86-SSE-NEXT: movl $-1, %ebp -; X86-SSE-NEXT: cmoval %ebp, %edi -; X86-SSE-NEXT: cmoval %ebp, %edx -; X86-SSE-NEXT: cmoval %ebp, %eax +; X86-SSE-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF +; X86-SSE-NEXT: cmovbel %ebx, %ebp +; X86-SSE-NEXT: movl $-1, %ebx +; X86-SSE-NEXT: cmoval %ebx, %edi +; X86-SSE-NEXT: cmoval %ebx, %edx +; X86-SSE-NEXT: cmoval %ebx, %eax ; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 ; X86-SSE-NEXT: cmovpl %ecx, %eax ; X86-SSE-NEXT: cmovpl %ecx, %edx ; X86-SSE-NEXT: cmovpl %ecx, %edi -; X86-SSE-NEXT: cmovpl %ecx, %ebx -; X86-SSE-NEXT: movl %ebx, 12(%esi) +; X86-SSE-NEXT: cmovpl %ecx, %ebp +; X86-SSE-NEXT: movl %ebp, 12(%esi) ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) @@ -4320,33 +4320,33 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind { ; X86-SSE-NEXT: fxch %st(1) ; X86-SSE-NEXT: fucomi %st(1), %st ; X86-SSE-NEXT: fstp %st(1) -; X86-SSE-NEXT: movl $-8, %ebx +; X86-SSE-NEXT: movl $-8, %edi ; X86-SSE-NEXT: movl $0, %ecx ; X86-SSE-NEXT: movl $0, %edx -; X86-SSE-NEXT: movl $0, %edi +; X86-SSE-NEXT: movl $0, %ebx ; X86-SSE-NEXT: jb .LBB38_2 ; X86-SSE-NEXT: # %bb.1: -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl 
{{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SSE-NEXT: .LBB38_2: ; X86-SSE-NEXT: fldt {{\.?LCPI[0-9]+_[0-9]+}} ; X86-SSE-NEXT: fxch %st(1) ; X86-SSE-NEXT: fucomi %st(1), %st ; X86-SSE-NEXT: fstp %st(1) ; X86-SSE-NEXT: movl $-1, %eax -; X86-SSE-NEXT: cmoval %eax, %edi +; X86-SSE-NEXT: cmoval %eax, %ebx ; X86-SSE-NEXT: cmoval %eax, %edx ; X86-SSE-NEXT: cmoval %eax, %ecx ; X86-SSE-NEXT: movl $7, %eax -; X86-SSE-NEXT: cmovbel %ebx, %eax +; X86-SSE-NEXT: cmovbel %edi, %eax ; X86-SSE-NEXT: fucompi %st(0), %st ; X86-SSE-NEXT: cmovpl %ebp, %eax ; X86-SSE-NEXT: cmovpl %ebp, %ecx ; X86-SSE-NEXT: cmovpl %ebp, %edx -; X86-SSE-NEXT: cmovpl %ebp, %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) +; X86-SSE-NEXT: cmovpl %ebp, %ebx +; X86-SSE-NEXT: movl %ebx, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) ; X86-SSE-NEXT: andl $15, %eax @@ -4517,24 +4517,24 @@ define i128 @test_signed_i128_f80(x86_fp80 %f) nounwind { ; X86-SSE-NEXT: cmovbl %ecx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: cmovbl %ecx, %edi -; X86-SSE-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 -; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebp +; X86-SSE-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 +; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebx ; X86-SSE-NEXT: fldt {{\.?LCPI[0-9]+_[0-9]+}} ; X86-SSE-NEXT: fxch %st(1) ; X86-SSE-NEXT: fucomi %st(1), %st ; X86-SSE-NEXT: fstp %st(1) -; X86-SSE-NEXT: movl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-SSE-NEXT: cmovbel %ebp, %ebx -; X86-SSE-NEXT: movl $-1, %ebp -; X86-SSE-NEXT: cmoval %ebp, %edi -; X86-SSE-NEXT: cmoval %ebp, %edx -; X86-SSE-NEXT: cmoval %ebp, %eax +; X86-SSE-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF +; X86-SSE-NEXT: cmovbel %ebx, %ebp +; X86-SSE-NEXT: movl $-1, %ebx +; X86-SSE-NEXT: cmoval %ebx, %edi +; X86-SSE-NEXT: cmoval %ebx, %edx +; X86-SSE-NEXT: cmoval %ebx, %eax ; X86-SSE-NEXT: fucompi %st(0), %st ; X86-SSE-NEXT: cmovpl %ecx, %eax ; X86-SSE-NEXT: cmovpl %ecx, %edx ; X86-SSE-NEXT: cmovpl %ecx, %edi -; X86-SSE-NEXT: cmovpl %ecx, %ebx -; X86-SSE-NEXT: movl %ebx, 12(%esi) +; X86-SSE-NEXT: cmovpl %ecx, %ebp +; X86-SSE-NEXT: movl %ebp, 12(%esi) ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll index 2856cfa01fad15..9a31e6fe2543a2 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -61,37 +61,37 @@ define <4 x i8> @test_signed_v4i8_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_signed_v4i8_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movaps %xmm1, %xmm3 -; CHECK-NEXT: maxss %xmm0, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movaps %xmm2, %xmm4 -; CHECK-NEXT: minss %xmm3, %xmm4 +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: maxss %xmm0, %xmm2 +; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm3, %xmm4 +; CHECK-NEXT: minss %xmm2, %xmm4 ; CHECK-NEXT: cvttss2si %xmm4, %eax ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; CHECK-NEXT: movaps %xmm1, %xmm4 -; CHECK-NEXT: maxss %xmm3, %xmm4 -; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: 
minss %xmm4, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %ecx +; CHECK-NEXT: maxss %xmm2, %xmm4 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: minss %xmm4, %xmm2 +; CHECK-NEXT: cvttss2si %xmm2, %ecx ; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: shll $8, %ecx ; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; CHECK-NEXT: movaps %xmm1, %xmm4 -; CHECK-NEXT: maxss %xmm3, %xmm4 -; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: minss %xmm4, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %eax +; CHECK-NEXT: maxss %xmm2, %xmm4 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: minss %xmm4, %xmm2 +; CHECK-NEXT: cvttss2si %xmm2, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: minss %xmm1, %xmm2 -; CHECK-NEXT: cvttss2si %xmm2, %ecx +; CHECK-NEXT: minss %xmm1, %xmm3 +; CHECK-NEXT: cvttss2si %xmm3, %ecx ; CHECK-NEXT: shll $24, %ecx ; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movd %ecx, %xmm0 @@ -143,37 +143,37 @@ define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm1, %edx +; CHECK-NEXT: cvttss2si %xmm1, %eax ; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss %xmm2, %xmm1 -; CHECK-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-NEXT: cmoval %eax, %edx -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; CHECK-NEXT: cmoval %ecx, %eax +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: ucomiss %xmm1, %xmm1 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: cmovpl %edx, %eax +; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm3, %edx +; CHECK-NEXT: cvttss2si %xmm3, %eax ; CHECK-NEXT: ucomiss %xmm2, %xmm3 -; CHECK-NEXT: cmoval %eax, %edx +; CHECK-NEXT: cmoval %ecx, %eax ; CHECK-NEXT: ucomiss %xmm3, %xmm3 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm3 +; CHECK-NEXT: cmovpl %edx, %eax +; CHECK-NEXT: movd %eax, %xmm3 ; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-NEXT: cvttss2si %xmm0, %edx +; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmoval %eax, %edx +; CHECK-NEXT: cmoval %ecx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: cmovpl %edx, %eax +; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: cvttss2si %xmm0, %edx +; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmoval %eax, %edx +; CHECK-NEXT: cmoval %ecx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm0 +; CHECK-NEXT: cmovpl %edx, %eax +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 @@ -185,39 +185,39 @@ define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind { define <4 x i64> @test_signed_v4i64_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_signed_v4i64_v4f32: ; 
CHECK: # %bb.0: -; CHECK-NEXT: cvttss2si %xmm0, %rdx +; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %rax, %rdx -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovaq %rcx, %rax +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm2 +; CHECK-NEXT: cmovpq %rdx, %rax +; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; CHECK-NEXT: cvttss2si %xmm3, %rdx +; CHECK-NEXT: cvttss2si %xmm3, %rax ; CHECK-NEXT: ucomiss %xmm1, %xmm3 -; CHECK-NEXT: cmovaq %rax, %rdx +; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: ucomiss %xmm3, %xmm3 -; CHECK-NEXT: cmovpq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm3 +; CHECK-NEXT: cmovpq %rdx, %rax +; CHECK-NEXT: movq %rax, %xmm3 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm3, %rdx +; CHECK-NEXT: cvttss2si %xmm3, %rax ; CHECK-NEXT: ucomiss %xmm1, %xmm3 -; CHECK-NEXT: cmovaq %rax, %rdx +; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: ucomiss %xmm3, %xmm3 -; CHECK-NEXT: cmovpq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm3 +; CHECK-NEXT: cmovpq %rdx, %rax +; CHECK-NEXT: movq %rax, %xmm3 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: cvttss2si %xmm0, %rdx +; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: cmovaq %rax, %rdx +; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm1 +; CHECK-NEXT: cmovpq %rdx, %rax +; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -309,8 +309,7 @@ define <4 x i128> @test_signed_v4i128_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %rsi, %rax -; CHECK-NEXT: movl $0, %ecx -; CHECK-NEXT: cmovpq %rcx, %rdx +; CHECK-NEXT: cmovpq %rsi, %rdx ; CHECK-NEXT: movq %rdx, 8(%rbx) ; CHECK-NEXT: movq %rax, (%rbx) ; CHECK-NEXT: movq %r14, 56(%rbx) @@ -486,17 +485,17 @@ define <2 x i128> @test_signed_v2i128_v2f64(<2 x double> %f) nounwind { ; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: movq %rdx, %r15 ; CHECK-NEXT: xorl %r12d, %r12d -; CHECK-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cmovbq %r12, %r14 ; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 ; CHECK-NEXT: cmovbq %rax, %r15 -; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movabsq $9223372036854775807, %rbp # imm = 0x7FFFFFFFFFFFFFFF ; CHECK-NEXT: cmovaq %rbp, %r15 ; CHECK-NEXT: movq $-1, %r13 ; CHECK-NEXT: cmovaq %r13, %r14 -; CHECK-NEXT: ucomisd %xmm0, %xmm0 +; CHECK-NEXT: ucomisd %xmm1, %xmm1 ; CHECK-NEXT: cmovpq %r12, %r14 ; CHECK-NEXT: cmovpq %r12, %r15 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -1340,8 +1339,7 @@ define <8 x i128> 
@test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %rsi, %rax -; CHECK-NEXT: movl $0, %ecx -; CHECK-NEXT: cmovpq %rcx, %rdx +; CHECK-NEXT: cmovpq %rsi, %rdx ; CHECK-NEXT: movq %rdx, 8(%rbx) ; CHECK-NEXT: movq %rax, (%rbx) ; CHECK-NEXT: movq %r13, 120(%rbx) diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll index 01426b1ac91c24..66faa37c378c17 100644 --- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll @@ -484,28 +484,28 @@ define i50 @test_unsigned_i50_f32(float %f) nounwind { ; X86-SSE-NEXT: movaps %xmm0, %xmm3 ; X86-SSE-NEXT: subss %xmm2, %xmm3 ; X86-SSE-NEXT: movss %xmm3, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: setbe %cl +; X86-SSE-NEXT: setbe %al ; X86-SSE-NEXT: flds {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: orl $3072, %eax # imm = 0xC00 -; X86-SSE-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: orl $3072, %ecx # imm = 0xC00 +; X86-SSE-NEXT: movw %cx, {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %ecx, %ecx ; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 ; X86-SSE-NEXT: movl $0, %esi ; X86-SSE-NEXT: jb .LBB6_4 ; X86-SSE-NEXT: # %bb.3: -; X86-SSE-NEXT: movzbl %cl, %eax -; X86-SSE-NEXT: shll $31, %eax -; X86-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzbl %al, %ecx +; X86-SSE-NEXT: shll $31, %ecx +; X86-SSE-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: .LBB6_4: ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movl $262143, %edx # imm = 0x3FFFF -; X86-SSE-NEXT: cmovbel %eax, %edx +; X86-SSE-NEXT: cmovbel %ecx, %edx ; X86-SSE-NEXT: movl $-1, %eax ; X86-SSE-NEXT: cmovbel %esi, %eax ; X86-SSE-NEXT: addl $16, %esp @@ -676,7 +676,7 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind { ; X86-X87-NEXT: movl %eax, %ebx ; X86-X87-NEXT: calll __fixunssfti ; X86-X87-NEXT: subl $4, %esp -; X86-X87-NEXT: xorl %edi, %edi +; X86-X87-NEXT: xorl %esi, %esi ; X86-X87-NEXT: movb %bh, %ah ; X86-X87-NEXT: sahf ; X86-X87-NEXT: movl $0, %eax @@ -684,16 +684,16 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind { ; X86-X87-NEXT: # %bb.1: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: .LBB8_2: -; X86-X87-NEXT: movl $0, %esi +; X86-X87-NEXT: movl $0, %edi ; X86-X87-NEXT: movl $0, %ebx ; X86-X87-NEXT: jb .LBB8_4 ; X86-X87-NEXT: # %bb.3: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-X87-NEXT: .LBB8_4: ; X86-X87-NEXT: jb .LBB8_6 ; X86-X87-NEXT: # %bb.5: -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-X87-NEXT: .LBB8_6: ; X86-X87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -706,20 +706,20 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind { ; X86-X87-NEXT: movl $15, %eax ; X86-X87-NEXT: ja .LBB8_8 ; X86-X87-NEXT: # %bb.7: -; X86-X87-NEXT: movl %edi, %eax +; X86-X87-NEXT: movl %esi, %eax ; X86-X87-NEXT: .LBB8_8: -; X86-X87-NEXT: movl $-1, %edi +; X86-X87-NEXT: movl $-1, %esi ; X86-X87-NEXT: movl $-1, %ebp ; X86-X87-NEXT: movl $-1, %edx ; X86-X87-NEXT: ja .LBB8_10 ; 
X86-X87-NEXT: # %bb.9: -; X86-X87-NEXT: movl %ebx, %edi -; X86-X87-NEXT: movl %esi, %ebp +; X86-X87-NEXT: movl %ebx, %esi +; X86-X87-NEXT: movl %edi, %ebp ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-X87-NEXT: .LBB8_10: ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) -; X86-X87-NEXT: movl %edi, (%ecx) +; X86-X87-NEXT: movl %esi, (%ecx) ; X86-X87-NEXT: andl $15, %eax ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax @@ -820,7 +820,7 @@ define i128 @test_unsigned_i128_f32(float %f) nounwind { ; X86-X87-NEXT: movl %eax, %ebx ; X86-X87-NEXT: calll __fixunssfti ; X86-X87-NEXT: subl $4, %esp -; X86-X87-NEXT: xorl %edx, %edx +; X86-X87-NEXT: xorl %esi, %esi ; X86-X87-NEXT: movb %bh, %ah ; X86-X87-NEXT: sahf ; X86-X87-NEXT: movl $0, %eax @@ -828,20 +828,20 @@ define i128 @test_unsigned_i128_f32(float %f) nounwind { ; X86-X87-NEXT: # %bb.1: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: .LBB9_2: -; X86-X87-NEXT: movl $0, %ecx +; X86-X87-NEXT: movl $0, %edx ; X86-X87-NEXT: jb .LBB9_4 ; X86-X87-NEXT: # %bb.3: -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-X87-NEXT: .LBB9_4: -; X86-X87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-X87-NEXT: movl $0, %ebx ; X86-X87-NEXT: jb .LBB9_6 ; X86-X87-NEXT: # %bb.5: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-X87-NEXT: .LBB9_6: +; X86-X87-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X86-X87-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-X87-NEXT: fucompp @@ -855,8 +855,8 @@ define i128 @test_unsigned_i128_f32(float %f) nounwind { ; X86-X87-NEXT: ja .LBB9_8 ; X86-X87-NEXT: # %bb.7: ; X86-X87-NEXT: movl %ebx, %eax -; X86-X87-NEXT: movl %edx, %ebp -; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-X87-NEXT: movl %edx, %edi ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-X87-NEXT: .LBB9_8: ; X86-X87-NEXT: movl %esi, 12(%ecx) @@ -1412,28 +1412,28 @@ define i50 @test_unsigned_i50_f64(double %f) nounwind { ; X86-SSE-NEXT: movapd %xmm0, %xmm3 ; X86-SSE-NEXT: subsd %xmm2, %xmm3 ; X86-SSE-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: setbe %cl +; X86-SSE-NEXT: setbe %al ; X86-SSE-NEXT: fldl {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: orl $3072, %eax # imm = 0xC00 -; X86-SSE-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: orl $3072, %ecx # imm = 0xC00 +; X86-SSE-NEXT: movw %cx, {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %ecx, %ecx ; X86-SSE-NEXT: ucomisd %xmm1, %xmm0 ; X86-SSE-NEXT: movl $0, %esi ; X86-SSE-NEXT: jb .LBB16_4 ; X86-SSE-NEXT: # %bb.3: -; X86-SSE-NEXT: movzbl %cl, %eax -; X86-SSE-NEXT: shll $31, %eax -; X86-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzbl %al, %ecx +; X86-SSE-NEXT: shll $31, %ecx +; X86-SSE-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: .LBB16_4: ; X86-SSE-NEXT: 
ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movl $262143, %edx # imm = 0x3FFFF -; X86-SSE-NEXT: cmovbel %eax, %edx +; X86-SSE-NEXT: cmovbel %ecx, %edx ; X86-SSE-NEXT: movl $-1, %eax ; X86-SSE-NEXT: cmovbel %esi, %eax ; X86-SSE-NEXT: addl $16, %esp @@ -1600,7 +1600,7 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind { ; X86-X87-NEXT: movl %eax, %ebx ; X86-X87-NEXT: calll __fixunsdfti ; X86-X87-NEXT: subl $4, %esp -; X86-X87-NEXT: xorl %edi, %edi +; X86-X87-NEXT: xorl %esi, %esi ; X86-X87-NEXT: movb %bh, %ah ; X86-X87-NEXT: sahf ; X86-X87-NEXT: movl $0, %eax @@ -1608,16 +1608,16 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind { ; X86-X87-NEXT: # %bb.1: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: .LBB18_2: -; X86-X87-NEXT: movl $0, %esi +; X86-X87-NEXT: movl $0, %edi ; X86-X87-NEXT: movl $0, %ebx ; X86-X87-NEXT: jb .LBB18_4 ; X86-X87-NEXT: # %bb.3: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-X87-NEXT: .LBB18_4: ; X86-X87-NEXT: jb .LBB18_6 ; X86-X87-NEXT: # %bb.5: -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-X87-NEXT: .LBB18_6: ; X86-X87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -1630,20 +1630,20 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind { ; X86-X87-NEXT: movl $15, %eax ; X86-X87-NEXT: ja .LBB18_8 ; X86-X87-NEXT: # %bb.7: -; X86-X87-NEXT: movl %edi, %eax +; X86-X87-NEXT: movl %esi, %eax ; X86-X87-NEXT: .LBB18_8: -; X86-X87-NEXT: movl $-1, %edi +; X86-X87-NEXT: movl $-1, %esi ; X86-X87-NEXT: movl $-1, %ebp ; X86-X87-NEXT: movl $-1, %edx ; X86-X87-NEXT: ja .LBB18_10 ; X86-X87-NEXT: # %bb.9: -; X86-X87-NEXT: movl %ebx, %edi -; X86-X87-NEXT: movl %esi, %ebp +; X86-X87-NEXT: movl %ebx, %esi +; X86-X87-NEXT: movl %edi, %ebp ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-X87-NEXT: .LBB18_10: ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) -; X86-X87-NEXT: movl %edi, (%ecx) +; X86-X87-NEXT: movl %esi, (%ecx) ; X86-X87-NEXT: andl $15, %eax ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax @@ -1744,7 +1744,7 @@ define i128 @test_unsigned_i128_f64(double %f) nounwind { ; X86-X87-NEXT: movl %eax, %ebx ; X86-X87-NEXT: calll __fixunsdfti ; X86-X87-NEXT: subl $4, %esp -; X86-X87-NEXT: xorl %edx, %edx +; X86-X87-NEXT: xorl %esi, %esi ; X86-X87-NEXT: movb %bh, %ah ; X86-X87-NEXT: sahf ; X86-X87-NEXT: movl $0, %eax @@ -1752,20 +1752,20 @@ define i128 @test_unsigned_i128_f64(double %f) nounwind { ; X86-X87-NEXT: # %bb.1: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: .LBB19_2: -; X86-X87-NEXT: movl $0, %ecx +; X86-X87-NEXT: movl $0, %edx ; X86-X87-NEXT: jb .LBB19_4 ; X86-X87-NEXT: # %bb.3: -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-X87-NEXT: .LBB19_4: -; X86-X87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-X87-NEXT: movl $0, %ebx ; X86-X87-NEXT: jb .LBB19_6 ; X86-X87-NEXT: # %bb.5: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-X87-NEXT: .LBB19_6: +; X86-X87-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: fldl {{\.?LCPI[0-9]+_[0-9]+}} ; X86-X87-NEXT: fldl 
{{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload ; X86-X87-NEXT: fucompp @@ -1779,8 +1779,8 @@ define i128 @test_unsigned_i128_f64(double %f) nounwind { ; X86-X87-NEXT: ja .LBB19_8 ; X86-X87-NEXT: # %bb.7: ; X86-X87-NEXT: movl %ebx, %eax -; X86-X87-NEXT: movl %edx, %ebp -; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-X87-NEXT: movl %edx, %edi ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-X87-NEXT: .LBB19_8: ; X86-X87-NEXT: movl %esi, 12(%ecx) @@ -2465,28 +2465,28 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind { ; X86-SSE-NEXT: movaps %xmm0, %xmm3 ; X86-SSE-NEXT: subss %xmm2, %xmm3 ; X86-SSE-NEXT: movss %xmm3, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: setae %cl +; X86-SSE-NEXT: setae %al ; X86-SSE-NEXT: flds {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: orl $3072, %eax # imm = 0xC00 -; X86-SSE-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: orl $3072, %ecx # imm = 0xC00 +; X86-SSE-NEXT: movw %cx, {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %ecx, %ecx ; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 ; X86-SSE-NEXT: movl $0, %esi ; X86-SSE-NEXT: jb .LBB26_4 ; X86-SSE-NEXT: # %bb.3: -; X86-SSE-NEXT: movzbl %cl, %eax -; X86-SSE-NEXT: shll $31, %eax -; X86-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzbl %al, %ecx +; X86-SSE-NEXT: shll $31, %ecx +; X86-SSE-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: .LBB26_4: ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movl $262143, %edx # imm = 0x3FFFF -; X86-SSE-NEXT: cmovbel %eax, %edx +; X86-SSE-NEXT: cmovbel %ecx, %edx ; X86-SSE-NEXT: movl $-1, %eax ; X86-SSE-NEXT: cmovbel %esi, %eax ; X86-SSE-NEXT: addl $24, %esp @@ -2679,7 +2679,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind { ; X86-X87-NEXT: movl %eax, %ebx ; X86-X87-NEXT: calll __fixunssfti ; X86-X87-NEXT: subl $4, %esp -; X86-X87-NEXT: xorl %edi, %edi +; X86-X87-NEXT: xorl %esi, %esi ; X86-X87-NEXT: movb %bh, %ah ; X86-X87-NEXT: sahf ; X86-X87-NEXT: movl $0, %eax @@ -2687,16 +2687,16 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind { ; X86-X87-NEXT: # %bb.1: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: .LBB28_2: -; X86-X87-NEXT: movl $0, %esi +; X86-X87-NEXT: movl $0, %edi ; X86-X87-NEXT: movl $0, %ebx ; X86-X87-NEXT: jb .LBB28_4 ; X86-X87-NEXT: # %bb.3: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-X87-NEXT: .LBB28_4: ; X86-X87-NEXT: jb .LBB28_6 ; X86-X87-NEXT: # %bb.5: -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-X87-NEXT: .LBB28_6: ; X86-X87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -2709,20 +2709,20 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind { ; X86-X87-NEXT: movl $15, %eax ; X86-X87-NEXT: ja .LBB28_8 ; X86-X87-NEXT: # %bb.7: -; X86-X87-NEXT: movl %edi, %eax +; X86-X87-NEXT: movl %esi, %eax ; X86-X87-NEXT: .LBB28_8: -; X86-X87-NEXT: movl $-1, %edi +; X86-X87-NEXT: movl $-1, %esi ; X86-X87-NEXT: movl $-1, %ebp ; X86-X87-NEXT: movl $-1, %edx ; X86-X87-NEXT: ja .LBB28_10 ; 
X86-X87-NEXT: # %bb.9: -; X86-X87-NEXT: movl %ebx, %edi -; X86-X87-NEXT: movl %esi, %ebp +; X86-X87-NEXT: movl %ebx, %esi +; X86-X87-NEXT: movl %edi, %ebp ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-X87-NEXT: .LBB28_10: ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) -; X86-X87-NEXT: movl %edi, (%ecx) +; X86-X87-NEXT: movl %esi, (%ecx) ; X86-X87-NEXT: andl $15, %eax ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax @@ -2833,7 +2833,7 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind { ; X86-X87-NEXT: movl %eax, %ebx ; X86-X87-NEXT: calll __fixunssfti ; X86-X87-NEXT: subl $4, %esp -; X86-X87-NEXT: xorl %edx, %edx +; X86-X87-NEXT: xorl %esi, %esi ; X86-X87-NEXT: movb %bh, %ah ; X86-X87-NEXT: sahf ; X86-X87-NEXT: movl $0, %eax @@ -2841,20 +2841,20 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind { ; X86-X87-NEXT: # %bb.1: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: .LBB29_2: -; X86-X87-NEXT: movl $0, %ecx +; X86-X87-NEXT: movl $0, %edx ; X86-X87-NEXT: jb .LBB29_4 ; X86-X87-NEXT: # %bb.3: -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-X87-NEXT: .LBB29_4: -; X86-X87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-X87-NEXT: movl $0, %ebx ; X86-X87-NEXT: jb .LBB29_6 ; X86-X87-NEXT: # %bb.5: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-X87-NEXT: .LBB29_6: +; X86-X87-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X86-X87-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-X87-NEXT: fucompp @@ -2868,8 +2868,8 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind { ; X86-X87-NEXT: ja .LBB29_8 ; X86-X87-NEXT: # %bb.7: ; X86-X87-NEXT: movl %ebx, %eax -; X86-X87-NEXT: movl %edx, %ebp -; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-X87-NEXT: movl %edx, %edi ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-X87-NEXT: .LBB29_8: ; X86-X87-NEXT: movl %esi, 12(%ecx) @@ -3894,7 +3894,7 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind { ; X86-X87-NEXT: movl %eax, %ebx ; X86-X87-NEXT: calll __fixunsxfti ; X86-X87-NEXT: subl $4, %esp -; X86-X87-NEXT: xorl %edi, %edi +; X86-X87-NEXT: xorl %esi, %esi ; X86-X87-NEXT: movb %bh, %ah ; X86-X87-NEXT: sahf ; X86-X87-NEXT: movl $0, %eax @@ -3902,16 +3902,16 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind { ; X86-X87-NEXT: # %bb.1: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: .LBB38_2: -; X86-X87-NEXT: movl $0, %esi +; X86-X87-NEXT: movl $0, %edi ; X86-X87-NEXT: movl $0, %ebx ; X86-X87-NEXT: jb .LBB38_4 ; X86-X87-NEXT: # %bb.3: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-X87-NEXT: .LBB38_4: ; X86-X87-NEXT: jb .LBB38_6 ; X86-X87-NEXT: # %bb.5: -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-X87-NEXT: .LBB38_6: ; X86-X87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -3924,20 +3924,20 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind { ; X86-X87-NEXT: movl $15, %eax ; 
X86-X87-NEXT: ja .LBB38_8 ; X86-X87-NEXT: # %bb.7: -; X86-X87-NEXT: movl %edi, %eax +; X86-X87-NEXT: movl %esi, %eax ; X86-X87-NEXT: .LBB38_8: -; X86-X87-NEXT: movl $-1, %edi +; X86-X87-NEXT: movl $-1, %esi ; X86-X87-NEXT: movl $-1, %ebp ; X86-X87-NEXT: movl $-1, %edx ; X86-X87-NEXT: ja .LBB38_10 ; X86-X87-NEXT: # %bb.9: -; X86-X87-NEXT: movl %ebx, %edi -; X86-X87-NEXT: movl %esi, %ebp +; X86-X87-NEXT: movl %ebx, %esi +; X86-X87-NEXT: movl %edi, %ebp ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-X87-NEXT: .LBB38_10: ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) -; X86-X87-NEXT: movl %edi, (%ecx) +; X86-X87-NEXT: movl %esi, (%ecx) ; X86-X87-NEXT: andl $15, %eax ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax @@ -4052,7 +4052,7 @@ define i128 @test_unsigned_i128_f80(x86_fp80 %f) nounwind { ; X86-X87-NEXT: movl %eax, %ebx ; X86-X87-NEXT: calll __fixunsxfti ; X86-X87-NEXT: subl $4, %esp -; X86-X87-NEXT: xorl %edx, %edx +; X86-X87-NEXT: xorl %esi, %esi ; X86-X87-NEXT: movb %bh, %ah ; X86-X87-NEXT: sahf ; X86-X87-NEXT: movl $0, %eax @@ -4060,20 +4060,20 @@ define i128 @test_unsigned_i128_f80(x86_fp80 %f) nounwind { ; X86-X87-NEXT: # %bb.1: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: .LBB39_2: -; X86-X87-NEXT: movl $0, %ecx +; X86-X87-NEXT: movl $0, %edx ; X86-X87-NEXT: jb .LBB39_4 ; X86-X87-NEXT: # %bb.3: -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-X87-NEXT: .LBB39_4: -; X86-X87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-X87-NEXT: movl $0, %ebx ; X86-X87-NEXT: jb .LBB39_6 ; X86-X87-NEXT: # %bb.5: ; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-X87-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-X87-NEXT: .LBB39_6: +; X86-X87-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-X87-NEXT: fldt {{\.?LCPI[0-9]+_[0-9]+}} ; X86-X87-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; X86-X87-NEXT: fucompp @@ -4087,8 +4087,8 @@ define i128 @test_unsigned_i128_f80(x86_fp80 %f) nounwind { ; X86-X87-NEXT: ja .LBB39_8 ; X86-X87-NEXT: # %bb.7: ; X86-X87-NEXT: movl %ebx, %eax -; X86-X87-NEXT: movl %edx, %ebp -; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-X87-NEXT: movl %edx, %edi ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-X87-NEXT: .LBB39_8: ; X86-X87-NEXT: movl %esi, 12(%ecx) diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll index 0cced636dddbab..d46b24a0036c76 100644 --- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll @@ -52,37 +52,37 @@ define <4 x i8> @test_unsigned_v4i8_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_unsigned_v4i8_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: xorps %xmm3, %xmm3 -; CHECK-NEXT: maxss %xmm0, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movaps %xmm2, %xmm4 -; CHECK-NEXT: minss %xmm3, %xmm4 +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: maxss %xmm0, %xmm2 +; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm3, %xmm4 +; CHECK-NEXT: minss %xmm2, %xmm4 ; CHECK-NEXT: cvttss2si %xmm4, %eax ; CHECK-NEXT: movzbl %al, %eax 
-; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; CHECK-NEXT: xorps %xmm4, %xmm4 -; CHECK-NEXT: maxss %xmm3, %xmm4 -; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: minss %xmm4, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %ecx +; CHECK-NEXT: maxss %xmm2, %xmm4 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: minss %xmm4, %xmm2 +; CHECK-NEXT: cvttss2si %xmm2, %ecx ; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: shll $8, %ecx ; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; CHECK-NEXT: xorps %xmm4, %xmm4 -; CHECK-NEXT: maxss %xmm3, %xmm4 -; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: minss %xmm4, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %eax +; CHECK-NEXT: maxss %xmm2, %xmm4 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: minss %xmm4, %xmm2 +; CHECK-NEXT: cvttss2si %xmm2, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: minss %xmm1, %xmm2 -; CHECK-NEXT: cvttss2si %xmm2, %ecx +; CHECK-NEXT: minss %xmm1, %xmm3 +; CHECK-NEXT: cvttss2si %xmm3, %ecx ; CHECK-NEXT: shll $24, %ecx ; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movd %ecx, %xmm0 @@ -134,38 +134,38 @@ define <4 x i32> @test_unsigned_v4i32_v4f32(<4 x float> %f) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm1, %rdx -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cvttss2si %xmm1, %rax +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: ucomiss %xmm2, %xmm1 -; CHECK-NEXT: cmovbl %eax, %edx +; CHECK-NEXT: cmovbl %ecx, %eax ; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss %xmm3, %xmm1 -; CHECK-NEXT: movl $-1, %ecx -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: movl $-1, %edx +; CHECK-NEXT: cmoval %edx, %eax +; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: movaps %xmm0, %xmm4 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm4, %rdx +; CHECK-NEXT: cvttss2si %xmm4, %rax ; CHECK-NEXT: ucomiss %xmm2, %xmm4 -; CHECK-NEXT: cmovbl %eax, %edx +; CHECK-NEXT: cmovbl %ecx, %eax ; CHECK-NEXT: ucomiss %xmm3, %xmm4 -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm4 +; CHECK-NEXT: cmoval %edx, %eax +; CHECK-NEXT: movd %eax, %xmm4 ; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; CHECK-NEXT: cvttss2si %xmm0, %rdx +; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmovbl %eax, %edx +; CHECK-NEXT: cmovbl %ecx, %eax ; CHECK-NEXT: ucomiss %xmm3, %xmm0 -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: cmoval %edx, %eax +; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: cvttss2si %xmm0, %rdx +; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmovbl %eax, %edx +; CHECK-NEXT: cmovbl %ecx, %eax ; CHECK-NEXT: ucomiss %xmm3, %xmm0 -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm0 +; CHECK-NEXT: cmoval %edx, %eax +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; 
CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 @@ -1157,10 +1157,11 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: cmovbq %r12, %rdx ; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: movq $-1, %r13 -; CHECK-NEXT: cmovaq %r13, %rax +; CHECK-NEXT: movq $-1, %rcx +; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovaq %r13, %rdx +; CHECK-NEXT: cmovaq %rcx, %rdx +; CHECK-NEXT: movq $-1, %r13 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index defd81e6ab7710..0974233225fb57 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -507,20 +507,20 @@ define i32 @freeze_ashr_outofrange(i32 %a0) nounwind { define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind { ; X86-LABEL: freeze_ashr_vec: ; X86: # %bb.0: -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psraw $1, %xmm2 -; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; X86-NEXT: movdqa %xmm1, %xmm3 -; X86-NEXT: pandn %xmm2, %xmm3 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psraw $1, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; X86-NEXT: movdqa %xmm2, %xmm3 +; X86-NEXT: pandn %xmm1, %xmm3 ; X86-NEXT: psraw $3, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 +; X86-NEXT: pand %xmm2, %xmm0 ; X86-NEXT: por %xmm3, %xmm0 -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psraw $3, %xmm2 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psraw $3, %xmm1 ; X86-NEXT: psraw $1, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: pandn %xmm2, %xmm1 -; X86-NEXT: por %xmm1, %xmm0 +; X86-NEXT: pand %xmm2, %xmm0 +; X86-NEXT: pandn %xmm1, %xmm2 +; X86-NEXT: por %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_vec: @@ -616,20 +616,20 @@ define i32 @freeze_lshr_outofrange(i32 %a0) nounwind { define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind { ; X86-LABEL: freeze_lshr_vec: ; X86: # %bb.0: -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psrlw $1, %xmm2 -; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; X86-NEXT: movdqa %xmm1, %xmm3 -; X86-NEXT: pandn %xmm2, %xmm3 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrlw $1, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; X86-NEXT: movdqa %xmm2, %xmm3 +; X86-NEXT: pandn %xmm1, %xmm3 ; X86-NEXT: psrlw $2, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 +; X86-NEXT: pand %xmm2, %xmm0 ; X86-NEXT: por %xmm3, %xmm0 -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psrlw $2, %xmm2 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrlw $2, %xmm1 ; X86-NEXT: psrlw $1, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: pandn %xmm2, %xmm1 -; X86-NEXT: por %xmm1, %xmm0 +; X86-NEXT: pand %xmm2, %xmm0 +; X86-NEXT: pandn %xmm1, %xmm2 +; X86-NEXT: por %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_vec: diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll index ea3057d01cb7ff..e433dbfa19bc55 100644 --- a/llvm/test/CodeGen/X86/frem.ll +++ b/llvm/test/CodeGen/X86/frem.ll @@ -238,29 +238,29 @@ define void @frem_v8f32(<8 x float> %a0, <8 x float> %a1, ptr%p3) nounwind { ; CHECK-NEXT: subq $96, %rsp ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: movaps %xmm2, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index 065b396e82ec31..e9cebdea3b245a 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -18,13 +18,13 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind { ; X86-LABEL: var_shift_i8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $8, %eax -; X86-NEXT: orl %edx, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shll $8, %edx +; X86-NEXT: orl %eax, %edx ; X86-NEXT: andb $7, %cl -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movb %ah, %al +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movb %dh, %al ; X86-NEXT: retl ; ; X64-LABEL: var_shift_i8: @@ -325,26 +325,27 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: pushl %eax ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: testb $64, %al ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %ebx, %edx +; X86-SLOW-NEXT: movl %esi, %ebp +; X86-SLOW-NEXT: movl %ebx, %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SLOW-NEXT: movl %edi, %ecx -; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: movl 
{{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl %edx, %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: je .LBB6_5 ; X86-SLOW-NEXT: .LBB6_4: +; X86-SLOW-NEXT: movl %eax, %edx ; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: movl %edi, %ebx -; X86-SLOW-NEXT: movl %edx, %edi -; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl %ecx, %esi ; X86-SLOW-NEXT: jmp .LBB6_6 ; X86-SLOW-NEXT: .LBB6_1: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp @@ -352,33 +353,34 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: jne .LBB6_4 ; X86-SLOW-NEXT: .LBB6_5: +; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %edx ; X86-SLOW-NEXT: movl %ecx, %ebp -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: movl %edx, %esi -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movl %esi, %eax +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %ebp -; X86-SLOW-NEXT: movb %al, %ch +; X86-SLOW-NEXT: movb %dl, %ch ; X86-SLOW-NEXT: notb %ch ; X86-SLOW-NEXT: movb %ch, %cl ; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: orl %esi, %ebp -; X86-SLOW-NEXT: movl %edi, %esi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: shrl %edx +; X86-SLOW-NEXT: orl %eax, %ebp +; X86-SLOW-NEXT: movl %edi, %eax +; X86-SLOW-NEXT: movb %dl, %cl +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %esi ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: orl %esi, %edx -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl %ebx, %eax +; X86-SLOW-NEXT: movb %dl, %cl +; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %edi ; X86-SLOW-NEXT: movb %ch, %cl ; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movb %al, %cl +; X86-SLOW-NEXT: orl %eax, %edi +; X86-SLOW-NEXT: movb %dl, %cl ; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %ebx @@ -388,7 +390,7 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl %ebx, 12(%eax) ; X86-SLOW-NEXT: movl %edi, 8(%eax) -; X86-SLOW-NEXT: movl %edx, 4(%eax) +; X86-SLOW-NEXT: movl %esi, 4(%eax) ; X86-SLOW-NEXT: movl %ebp, (%eax) ; X86-SLOW-NEXT: addl $4, %esp ; X86-SLOW-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index 4340f8fd484aeb..ef4875d2cb98e0 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -316,73 +316,72 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: subl $8, %esp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: testb $64, %cl ; X86-SLOW-NEXT: je .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %ebp, %eax -; X86-SLOW-NEXT: movl %ebx, %ebp -; X86-SLOW-NEXT: movl 
{{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl %edi, %edx -; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl %ebp, %edx +; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl %eax, %ebx +; X86-SLOW-NEXT: movl %esi, %eax ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: jne .LBB6_5 ; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebp, %edi -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %eax, %edx +; X86-SLOW-NEXT: movl %edi, %esi +; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebp, %eax +; X86-SLOW-NEXT: movl %ebx, %ebp +; X86-SLOW-NEXT: movl %edx, %ebx ; X86-SLOW-NEXT: jmp .LBB6_6 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: je .LBB6_4 ; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: movl %ecx, %ebx -; X86-SLOW-NEXT: notb %bl -; X86-SLOW-NEXT: leal (%ebp,%ebp), %eax -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: orl %edx, %eax +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: notb %dl +; X86-SLOW-NEXT: leal (%ebp,%ebp), %edi +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: orl %ebx, %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: leal (%edi,%edi), %edx -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: orl %ebp, %edx +; X86-SLOW-NEXT: leal (%eax,%eax), %ebx +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: orl %ebp, %ebx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-SLOW-NEXT: leal (%edi,%edi), %ebp -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SLOW-NEXT: leal (%eax,%eax), %ebp +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shll %cl, %ebp ; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: shrl %cl, %eax ; X86-SLOW-NEXT: addl %esi, %esi -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: movl %esi, 12(%ecx) -; X86-SLOW-NEXT: movl %ebp, 8(%ecx) -; X86-SLOW-NEXT: movl %edx, 4(%ecx) -; X86-SLOW-NEXT: movl %eax, (%ecx) -; X86-SLOW-NEXT: movl %ecx, %eax +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl %esi, 12(%eax) +; X86-SLOW-NEXT: movl %ebp, 8(%eax) +; X86-SLOW-NEXT: movl %ebx, 4(%eax) +; X86-SLOW-NEXT: movl %edi, (%eax) ; X86-SLOW-NEXT: addl $8, %esp 
; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index 4123890ed1a764..ec916148e8e202 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -980,15 +980,15 @@ define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind { ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: leal (%eax,%eax,2), %esi -; X86-SSE2-NEXT: movzwl 8(%ecx,%esi,4), %edx -; X86-SSE2-NEXT: movl 4(%ecx,%esi,4), %edi -; X86-SSE2-NEXT: shrdl $8, %edx, %edi +; X86-SSE2-NEXT: leal (%eax,%eax,2), %edx +; X86-SSE2-NEXT: movzwl 8(%ecx,%edx,4), %esi +; X86-SSE2-NEXT: movl 4(%ecx,%edx,4), %edi +; X86-SSE2-NEXT: shrdl $8, %esi, %edi ; X86-SSE2-NEXT: xorl %eax, %edi ; X86-SSE2-NEXT: sarl $31, %eax -; X86-SSE2-NEXT: movzbl 10(%ecx,%esi,4), %ecx +; X86-SSE2-NEXT: movzbl 10(%ecx,%edx,4), %ecx ; X86-SSE2-NEXT: shll $16, %ecx -; X86-SSE2-NEXT: orl %edx, %ecx +; X86-SSE2-NEXT: orl %esi, %ecx ; X86-SSE2-NEXT: shll $8, %ecx ; X86-SSE2-NEXT: movl %ecx, %edx ; X86-SSE2-NEXT: sarl $8, %edx diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll index a67ce8f0be5b06..c9ae14fabfed4e 100644 --- a/llvm/test/CodeGen/X86/gather-addresses.ll +++ b/llvm/test/CodeGen/X86/gather-addresses.ll @@ -228,20 +228,20 @@ define <4 x i64> @old(ptr %p, ptr %i, ptr %h, i64 %f) nounwind { ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LIN32-NEXT: movdqa (%ecx), %xmm0 ; LIN32-NEXT: pand (%eax), %xmm0 -; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LIN32-NEXT: movd %xmm0, %edx -; LIN32-NEXT: pextrd $1, %xmm0, %esi -; LIN32-NEXT: pextrd $2, %xmm0, %eax +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; LIN32-NEXT: movd %xmm0, %ecx +; LIN32-NEXT: pextrd $1, %xmm0, %edx +; LIN32-NEXT: pextrd $2, %xmm0, %esi ; LIN32-NEXT: pextrd $3, %xmm0, %edi -; LIN32-NEXT: andl %ecx, %edx -; LIN32-NEXT: andl %ecx, %esi -; LIN32-NEXT: andl %ecx, %eax -; LIN32-NEXT: andl %ecx, %edi -; LIN32-NEXT: movd %esi, %xmm1 -; LIN32-NEXT: movd %edx, %xmm0 +; LIN32-NEXT: andl %eax, %ecx +; LIN32-NEXT: andl %eax, %edx +; LIN32-NEXT: andl %eax, %esi +; LIN32-NEXT: andl %eax, %edi +; LIN32-NEXT: movd %edx, %xmm1 +; LIN32-NEXT: movd %ecx, %xmm0 ; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; LIN32-NEXT: movd %edi, %xmm2 -; LIN32-NEXT: movd %eax, %xmm1 +; LIN32-NEXT: movd %esi, %xmm1 ; LIN32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; LIN32-NEXT: popl %esi ; LIN32-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll index 29b58d047596d4..d72a06f7969543 100644 --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -266,34 +266,34 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>) define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa %xmm0, %xmm5 -; GFNISSE-NEXT: psrlw $2, %xmm5 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] -; GFNISSE-NEXT: movdqa %xmm4, %xmm6 -; GFNISSE-NEXT: pandn %xmm5, %xmm6 +; GFNISSE-NEXT: movdqa %xmm0, %xmm4 +; GFNISSE-NEXT: psrlw $2, %xmm4 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNISSE-NEXT: movdqa %xmm5, %xmm6 +; GFNISSE-NEXT: pandn %xmm4, %xmm6 ; GFNISSE-NEXT: 
psllw $6, %xmm0 -; GFNISSE-NEXT: pand %xmm4, %xmm0 +; GFNISSE-NEXT: pand %xmm5, %xmm0 ; GFNISSE-NEXT: por %xmm6, %xmm0 -; GFNISSE-NEXT: movdqa %xmm1, %xmm5 -; GFNISSE-NEXT: psrlw $2, %xmm5 -; GFNISSE-NEXT: movdqa %xmm4, %xmm6 -; GFNISSE-NEXT: pandn %xmm5, %xmm6 +; GFNISSE-NEXT: movdqa %xmm1, %xmm4 +; GFNISSE-NEXT: psrlw $2, %xmm4 +; GFNISSE-NEXT: movdqa %xmm5, %xmm6 +; GFNISSE-NEXT: pandn %xmm4, %xmm6 ; GFNISSE-NEXT: psllw $6, %xmm1 -; GFNISSE-NEXT: pand %xmm4, %xmm1 +; GFNISSE-NEXT: pand %xmm5, %xmm1 ; GFNISSE-NEXT: por %xmm6, %xmm1 -; GFNISSE-NEXT: movdqa %xmm2, %xmm5 -; GFNISSE-NEXT: psrlw $2, %xmm5 -; GFNISSE-NEXT: movdqa %xmm4, %xmm6 -; GFNISSE-NEXT: pandn %xmm5, %xmm6 +; GFNISSE-NEXT: movdqa %xmm2, %xmm4 +; GFNISSE-NEXT: psrlw $2, %xmm4 +; GFNISSE-NEXT: movdqa %xmm5, %xmm6 +; GFNISSE-NEXT: pandn %xmm4, %xmm6 ; GFNISSE-NEXT: psllw $6, %xmm2 -; GFNISSE-NEXT: pand %xmm4, %xmm2 +; GFNISSE-NEXT: pand %xmm5, %xmm2 ; GFNISSE-NEXT: por %xmm6, %xmm2 -; GFNISSE-NEXT: movdqa %xmm3, %xmm5 -; GFNISSE-NEXT: psrlw $2, %xmm5 +; GFNISSE-NEXT: movdqa %xmm3, %xmm4 +; GFNISSE-NEXT: psrlw $2, %xmm4 ; GFNISSE-NEXT: psllw $6, %xmm3 -; GFNISSE-NEXT: pand %xmm4, %xmm3 -; GFNISSE-NEXT: pandn %xmm5, %xmm4 -; GFNISSE-NEXT: por %xmm4, %xmm3 +; GFNISSE-NEXT: pand %xmm5, %xmm3 +; GFNISSE-NEXT: pandn %xmm4, %xmm5 +; GFNISSE-NEXT: por %xmm5, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_rotr_v64i8: diff --git a/llvm/test/CodeGen/X86/h-registers-1.ll b/llvm/test/CodeGen/X86/h-registers-1.ll index 07d85d260a37a7..4e0651a64ae74b 100644 --- a/llvm/test/CodeGen/X86/h-registers-1.ll +++ b/llvm/test/CodeGen/X86/h-registers-1.ll @@ -16,22 +16,24 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h) ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rcx, %r10 ; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: movq %r8, %rcx ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movzbl %bh, %esi -; CHECK-NEXT: movzbl %ah, %edi +; CHECK-NEXT: movzbl %ah, %ebx ; CHECK-NEXT: movzbl %dh, %edx -; CHECK-NEXT: movzbl %ch, %ebp -; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: movzbl %ah, %ecx +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: movzbl %ah, %edi +; CHECK-NEXT: movzbl %ch, %ecx ; CHECK-NEXT: movq %r9, %rax -; CHECK-NEXT: movzbl %ah, %ebx +; CHECK-NEXT: movzbl %ah, %ebp ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d -; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: addq %rbp, %rdx +; CHECK-NEXT: addq %rbx, %rsi +; CHECK-NEXT: addq %rdi, %rdx ; CHECK-NEXT: addq %rsi, %rdx -; CHECK-NEXT: addq %rbx, %rcx +; CHECK-NEXT: addq %rbp, %rcx ; CHECK-NEXT: addq %r8, %rax ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: addq %rdx, %rax @@ -49,22 +51,24 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h) ; GNUX32-NEXT: .cfi_def_cfa_offset 24 ; GNUX32-NEXT: .cfi_offset %rbx, -24 ; GNUX32-NEXT: .cfi_offset %rbp, -16 +; GNUX32-NEXT: movq %rcx, %r10 ; GNUX32-NEXT: movq %rsi, %rax +; GNUX32-NEXT: movq %r8, %rcx ; GNUX32-NEXT: movq %rdi, %rbx ; GNUX32-NEXT: movzbl %bh, %esi -; GNUX32-NEXT: movzbl %ah, %edi +; GNUX32-NEXT: movzbl %ah, %ebx ; GNUX32-NEXT: movzbl %dh, %edx -; GNUX32-NEXT: movzbl %ch, %ebp -; GNUX32-NEXT: movq %r8, %rax -; GNUX32-NEXT: movzbl %ah, %ecx +; GNUX32-NEXT: movq %r10, %rax +; GNUX32-NEXT: movzbl %ah, %edi +; GNUX32-NEXT: movzbl %ch, %ecx ; GNUX32-NEXT: movq %r9, %rax -; GNUX32-NEXT: movzbl %ah, %ebx +; GNUX32-NEXT: movzbl %ah, %ebp ; GNUX32-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax ; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %r8d -; GNUX32-NEXT: addq %rdi, %rsi -; GNUX32-NEXT: addq %rbp, %rdx +; GNUX32-NEXT: addq %rbx, %rsi +; GNUX32-NEXT: addq %rdi, %rdx ; GNUX32-NEXT: addq %rsi, %rdx -; GNUX32-NEXT: addq %rbx, %rcx +; GNUX32-NEXT: addq %rbp, %rcx ; GNUX32-NEXT: addq %r8, %rax ; GNUX32-NEXT: addq %rcx, %rax ; GNUX32-NEXT: addq %rdx, %rax diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll index bca446fa8fb56e..3a06cb4a0a0229 100644 --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -535,24 +535,24 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) { ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %edi ; SSE3-NEXT: addl %edx, %edi -; SSE3-NEXT: movd %xmm2, %r8d +; SSE3-NEXT: movd %xmm2, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE3-NEXT: movd %xmm0, %edx -; SSE3-NEXT: addl %r8d, %edx -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE3-NEXT: movd %xmm0, %r8d +; SSE3-NEXT: addl %edx, %r8d +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE3-NEXT: movd %xmm0, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %r9d -; SSE3-NEXT: addl %r8d, %r9d -; SSE3-NEXT: movd %xmm3, %r8d +; SSE3-NEXT: addl %edx, %r9d +; SSE3-NEXT: movd %xmm3, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE3-NEXT: movd %xmm0, %r10d -; SSE3-NEXT: addl %r8d, %r10d +; SSE3-NEXT: addl %edx, %r10d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE3-NEXT: movd %xmm0, %r8d +; SSE3-NEXT: movd %xmm0, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %r11d -; SSE3-NEXT: addl %r8d, %r11d +; SSE3-NEXT: addl %edx, %r11d ; SSE3-NEXT: movd %edi, %xmm0 ; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -564,7 +564,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) { ; SSE3-NEXT: movd %r10d, %xmm2 ; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE3-NEXT: movd %r9d, %xmm3 -; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: movd %r8d, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE3-NEXT: retq @@ -642,73 +642,73 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind { ; SSE3-NEXT: pextrw $3, %xmm0, %eax ; SSE3-NEXT: addl %edx, %eax ; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $4, %xmm0, %edx -; SSE3-NEXT: pextrw $5, %xmm0, %esi -; SSE3-NEXT: addl %edx, %esi -; SSE3-NEXT: pextrw $6, %xmm0, %edx -; SSE3-NEXT: pextrw $7, %xmm0, %r8d -; SSE3-NEXT: addl %edx, %r8d -; SSE3-NEXT: movd %xmm1, %edx -; SSE3-NEXT: pextrw $1, %xmm1, %r10d -; SSE3-NEXT: addl %edx, %r10d -; SSE3-NEXT: pextrw $2, %xmm1, %edx -; SSE3-NEXT: pextrw $3, %xmm1, %ebx -; SSE3-NEXT: addl %edx, %ebx -; SSE3-NEXT: pextrw $4, %xmm1, %edx -; SSE3-NEXT: pextrw $5, %xmm1, %r14d -; SSE3-NEXT: addl %edx, %r14d -; SSE3-NEXT: pextrw $6, %xmm1, %edx -; SSE3-NEXT: pextrw $7, %xmm1, %r12d -; SSE3-NEXT: addl %edx, %r12d -; SSE3-NEXT: movd %xmm2, %edi -; SSE3-NEXT: pextrw $1, %xmm2, %edx -; SSE3-NEXT: addl %edi, %edx -; SSE3-NEXT: pextrw $2, %xmm2, %r9d -; SSE3-NEXT: pextrw $3, %xmm2, %edi -; SSE3-NEXT: addl %r9d, %edi -; SSE3-NEXT: pextrw $4, %xmm2, %r11d -; SSE3-NEXT: pextrw $5, %xmm2, %r9d -; SSE3-NEXT: addl %r11d, %r9d +; SSE3-NEXT: pextrw $4, %xmm0, %esi +; SSE3-NEXT: pextrw $5, %xmm0, %edx +; 
SSE3-NEXT: addl %esi, %edx +; SSE3-NEXT: pextrw $6, %xmm0, %edi +; SSE3-NEXT: pextrw $7, %xmm0, %esi +; SSE3-NEXT: addl %edi, %esi +; SSE3-NEXT: movd %xmm1, %r8d +; SSE3-NEXT: pextrw $1, %xmm1, %edi +; SSE3-NEXT: addl %r8d, %edi +; SSE3-NEXT: pextrw $2, %xmm1, %r8d +; SSE3-NEXT: pextrw $3, %xmm1, %r9d +; SSE3-NEXT: addl %r8d, %r9d +; SSE3-NEXT: pextrw $4, %xmm1, %r8d +; SSE3-NEXT: pextrw $5, %xmm1, %r11d +; SSE3-NEXT: addl %r8d, %r11d +; SSE3-NEXT: pextrw $6, %xmm1, %r8d +; SSE3-NEXT: pextrw $7, %xmm1, %ebx +; SSE3-NEXT: addl %r8d, %ebx +; SSE3-NEXT: movd %xmm2, %r10d +; SSE3-NEXT: pextrw $1, %xmm2, %r8d +; SSE3-NEXT: addl %r10d, %r8d +; SSE3-NEXT: pextrw $2, %xmm2, %ebp +; SSE3-NEXT: pextrw $3, %xmm2, %r10d +; SSE3-NEXT: addl %ebp, %r10d +; SSE3-NEXT: pextrw $4, %xmm2, %ebp +; SSE3-NEXT: pextrw $5, %xmm2, %r14d +; SSE3-NEXT: addl %ebp, %r14d ; SSE3-NEXT: pextrw $6, %xmm2, %ebp -; SSE3-NEXT: pextrw $7, %xmm2, %r11d -; SSE3-NEXT: addl %ebp, %r11d -; SSE3-NEXT: movd %xmm3, %r15d -; SSE3-NEXT: pextrw $1, %xmm3, %ebp -; SSE3-NEXT: addl %r15d, %ebp -; SSE3-NEXT: pextrw $2, %xmm3, %r13d -; SSE3-NEXT: pextrw $3, %xmm3, %r15d -; SSE3-NEXT: addl %r13d, %r15d -; SSE3-NEXT: pextrw $4, %xmm3, %r13d +; SSE3-NEXT: pextrw $7, %xmm2, %r15d +; SSE3-NEXT: addl %ebp, %r15d +; SSE3-NEXT: movd %xmm3, %ebp +; SSE3-NEXT: pextrw $1, %xmm3, %r12d +; SSE3-NEXT: addl %ebp, %r12d +; SSE3-NEXT: pextrw $2, %xmm3, %ebp +; SSE3-NEXT: pextrw $3, %xmm3, %r13d +; SSE3-NEXT: addl %ebp, %r13d +; SSE3-NEXT: pextrw $4, %xmm3, %ebp ; SSE3-NEXT: pextrw $5, %xmm3, %ecx -; SSE3-NEXT: addl %r13d, %ecx -; SSE3-NEXT: pextrw $6, %xmm3, %r13d +; SSE3-NEXT: addl %ebp, %ecx +; SSE3-NEXT: pextrw $6, %xmm3, %ebp ; SSE3-NEXT: pextrw $7, %xmm3, %eax -; SSE3-NEXT: addl %r13d, %eax -; SSE3-NEXT: movd %r12d, %xmm4 -; SSE3-NEXT: movd %r14d, %xmm2 -; SSE3-NEXT: movd %ebx, %xmm5 -; SSE3-NEXT: movd %r10d, %xmm3 -; SSE3-NEXT: movd %r8d, %xmm6 -; SSE3-NEXT: movd %esi, %xmm7 +; SSE3-NEXT: addl %ebp, %eax +; SSE3-NEXT: movd %ebx, %xmm2 +; SSE3-NEXT: movd %r11d, %xmm3 +; SSE3-NEXT: movd %r9d, %xmm4 +; SSE3-NEXT: movd %edi, %xmm5 +; SSE3-NEXT: movd %esi, %xmm6 +; SSE3-NEXT: movd %edx, %xmm7 ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload ; SSE3-NEXT: # xmm8 = mem[0],zero,zero,zero ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE3-NEXT: movd %eax, %xmm9 ; SSE3-NEXT: movd %ecx, %xmm10 -; SSE3-NEXT: movd %r15d, %xmm11 -; SSE3-NEXT: movd %ebp, %xmm12 -; SSE3-NEXT: movd %r11d, %xmm13 -; SSE3-NEXT: movd %r9d, %xmm14 -; SSE3-NEXT: movd %edi, %xmm15 -; SSE3-NEXT: movd %edx, %xmm1 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE3-NEXT: movd %r13d, %xmm11 +; SSE3-NEXT: movd %r12d, %xmm12 +; SSE3-NEXT: movd %r15d, %xmm13 +; SSE3-NEXT: movd %r14d, %xmm14 +; SSE3-NEXT: movd %r10d, %xmm15 +; SSE3-NEXT: movd %r8d, %xmm1 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] @@ -1149,24 +1149,24 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) { ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %edi ; SSE3-NEXT: addl %edx, %edi -; SSE3-NEXT: movd %xmm1, %r8d +; SSE3-NEXT: movd %xmm1, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE3-NEXT: movd %xmm0, %edx -; SSE3-NEXT: addl %r8d, %edx -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-NEXT: movd %xmm0, %r8d +; SSE3-NEXT: addl %edx, %r8d +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE3-NEXT: movd %xmm0, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %r9d -; SSE3-NEXT: addl %r8d, %r9d -; SSE3-NEXT: movd %xmm3, %r8d +; SSE3-NEXT: addl %edx, %r9d +; SSE3-NEXT: movd %xmm3, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE3-NEXT: movd %xmm0, %r10d -; SSE3-NEXT: addl %r8d, %r10d +; SSE3-NEXT: addl %edx, %r10d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE3-NEXT: movd %xmm0, %r8d +; SSE3-NEXT: movd %xmm0, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %r11d -; SSE3-NEXT: addl %r8d, %r11d +; SSE3-NEXT: addl %edx, %r11d ; SSE3-NEXT: movd %edi, %xmm0 ; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -1178,7 +1178,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) { ; SSE3-NEXT: movd %r10d, %xmm2 ; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE3-NEXT: movd %r9d, %xmm3 -; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: movd %r8d, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE3-NEXT: retq @@ -1246,81 +1246,81 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind { ; SSE3-NEXT: pushq %r13 ; SSE3-NEXT: pushq %r12 ; SSE3-NEXT: pushq %rbx -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: pextrw $1, %xmm0, %edx -; SSE3-NEXT: addl %eax, %edx -; SSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE3-NEXT: pextrw $3, %xmm0, %esi -; SSE3-NEXT: addl %eax, %esi -; SSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE3-NEXT: pextrw $5, %xmm0, %r9d -; SSE3-NEXT: addl %eax, %r9d -; SSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE3-NEXT: pextrw $7, %xmm0, %r10d -; SSE3-NEXT: addl %eax, %r10d -; SSE3-NEXT: movd %xmm1, %ecx -; SSE3-NEXT: pextrw $1, %xmm1, %eax +; SSE3-NEXT: movd %xmm0, %ecx +; SSE3-NEXT: pextrw $1, %xmm0, %eax ; SSE3-NEXT: addl %ecx, %eax ; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $2, %xmm1, %edi -; SSE3-NEXT: pextrw $3, %xmm1, %eax -; SSE3-NEXT: addl %edi, %eax +; SSE3-NEXT: pextrw $2, %xmm0, %edx +; SSE3-NEXT: pextrw $3, %xmm0, %eax +; SSE3-NEXT: addl %edx, %eax ; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $4, %xmm1, %r8d -; SSE3-NEXT: pextrw $5, %xmm1, %edi -; SSE3-NEXT: addl %r8d, %edi +; SSE3-NEXT: pextrw $4, %xmm0, %edx +; SSE3-NEXT: pextrw $5, %xmm0, %esi +; SSE3-NEXT: addl %edx, %esi +; 
SSE3-NEXT: pextrw $6, %xmm0, %edx +; SSE3-NEXT: pextrw $7, %xmm0, %r8d +; SSE3-NEXT: addl %edx, %r8d +; SSE3-NEXT: movd %xmm1, %edi +; SSE3-NEXT: pextrw $1, %xmm1, %edx +; SSE3-NEXT: addl %edi, %edx +; SSE3-NEXT: pextrw $2, %xmm1, %r9d +; SSE3-NEXT: pextrw $3, %xmm1, %edi +; SSE3-NEXT: addl %r9d, %edi +; SSE3-NEXT: pextrw $4, %xmm1, %r10d +; SSE3-NEXT: pextrw $5, %xmm1, %r9d +; SSE3-NEXT: addl %r10d, %r9d ; SSE3-NEXT: pextrw $6, %xmm1, %r11d -; SSE3-NEXT: pextrw $7, %xmm1, %r8d -; SSE3-NEXT: addl %r11d, %r8d +; SSE3-NEXT: pextrw $7, %xmm1, %r10d +; SSE3-NEXT: addl %r11d, %r10d ; SSE3-NEXT: movd %xmm2, %r11d -; SSE3-NEXT: pextrw $1, %xmm2, %ebp -; SSE3-NEXT: addl %r11d, %ebp +; SSE3-NEXT: pextrw $1, %xmm2, %ebx +; SSE3-NEXT: addl %r11d, %ebx ; SSE3-NEXT: pextrw $2, %xmm2, %r11d -; SSE3-NEXT: pextrw $3, %xmm2, %r14d -; SSE3-NEXT: addl %r11d, %r14d +; SSE3-NEXT: pextrw $3, %xmm2, %ebp +; SSE3-NEXT: addl %r11d, %ebp ; SSE3-NEXT: pextrw $4, %xmm2, %r11d -; SSE3-NEXT: pextrw $5, %xmm2, %r15d -; SSE3-NEXT: addl %r11d, %r15d +; SSE3-NEXT: pextrw $5, %xmm2, %r14d +; SSE3-NEXT: addl %r11d, %r14d ; SSE3-NEXT: pextrw $6, %xmm2, %r11d -; SSE3-NEXT: pextrw $7, %xmm2, %r12d +; SSE3-NEXT: pextrw $7, %xmm2, %r15d +; SSE3-NEXT: addl %r11d, %r15d +; SSE3-NEXT: movd %xmm3, %r11d +; SSE3-NEXT: pextrw $1, %xmm3, %r12d ; SSE3-NEXT: addl %r11d, %r12d -; SSE3-NEXT: movd %xmm3, %ebx -; SSE3-NEXT: pextrw $1, %xmm3, %r11d -; SSE3-NEXT: addl %ebx, %r11d -; SSE3-NEXT: pextrw $2, %xmm3, %r13d -; SSE3-NEXT: pextrw $3, %xmm3, %ebx -; SSE3-NEXT: addl %r13d, %ebx -; SSE3-NEXT: pextrw $4, %xmm3, %r13d +; SSE3-NEXT: pextrw $2, %xmm3, %r11d +; SSE3-NEXT: pextrw $3, %xmm3, %r13d +; SSE3-NEXT: addl %r11d, %r13d +; SSE3-NEXT: pextrw $4, %xmm3, %r11d ; SSE3-NEXT: pextrw $5, %xmm3, %ecx -; SSE3-NEXT: addl %r13d, %ecx -; SSE3-NEXT: pextrw $6, %xmm3, %r13d +; SSE3-NEXT: addl %r11d, %ecx +; SSE3-NEXT: pextrw $6, %xmm3, %r11d ; SSE3-NEXT: pextrw $7, %xmm3, %eax -; SSE3-NEXT: addl %r13d, %eax -; SSE3-NEXT: movd %r12d, %xmm4 +; SSE3-NEXT: addl %r11d, %eax ; SSE3-NEXT: movd %r15d, %xmm2 -; SSE3-NEXT: movd %r14d, %xmm5 -; SSE3-NEXT: movd %ebp, %xmm3 -; SSE3-NEXT: movd %r10d, %xmm6 -; SSE3-NEXT: movd %r9d, %xmm7 -; SSE3-NEXT: movd %esi, %xmm8 -; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: movd %r14d, %xmm3 +; SSE3-NEXT: movd %ebp, %xmm4 +; SSE3-NEXT: movd %ebx, %xmm5 +; SSE3-NEXT: movd %r8d, %xmm6 +; SSE3-NEXT: movd %esi, %xmm7 +; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE3-NEXT: movd %eax, %xmm9 ; SSE3-NEXT: movd %ecx, %xmm10 -; SSE3-NEXT: movd %ebx, %xmm11 -; SSE3-NEXT: movd %r11d, %xmm12 -; SSE3-NEXT: movd %r8d, %xmm13 -; SSE3-NEXT: movd %edi, %xmm14 -; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE3-NEXT: movd %r13d, %xmm11 +; SSE3-NEXT: movd %r12d, %xmm12 +; SSE3-NEXT: movd %r10d, %xmm13 +; SSE3-NEXT: movd %r9d, %xmm14 +; SSE3-NEXT: movd %edi, %xmm15 +; SSE3-NEXT: movd %edx, 
%xmm1 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 596e465ee8cacf..8235657eec64b1 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -2113,38 +2113,38 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 { define void @pr63114() { ; CHECK-LIBCALL-LABEL: pr63114: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: movdqu (%rax), %xmm4 -; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,3,4,5,6,7] -; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0 -; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] -; CHECK-LIBCALL-NEXT: por %xmm2, %xmm0 -; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0 +; CHECK-LIBCALL-NEXT: movdqu (%rax), %xmm0 +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,3,3,4,5,6,7] +; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; CHECK-LIBCALL-NEXT: pand %xmm2, %xmm1 +; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,15360,0,0,0,0] +; CHECK-LIBCALL-NEXT: por %xmm3, %xmm1 +; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm1 ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] -; CHECK-LIBCALL-NEXT: por %xmm5, %xmm0 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,7,7] +; CHECK-LIBCALL-NEXT: por %xmm5, %xmm1 +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,7,7] ; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm6 -; CHECK-LIBCALL-NEXT: por %xmm2, %xmm6 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm6 +; CHECK-LIBCALL-NEXT: pand %xmm2, %xmm6 +; CHECK-LIBCALL-NEXT: por %xmm3, %xmm6 +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm6 ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm6 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,5,5,5] -; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3,0,3] -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm4 -; CHECK-LIBCALL-NEXT: por %xmm2, %xmm4 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm4 -; CHECK-LIBCALL-NEXT: por %xmm5, %xmm4 -; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm7 -; CHECK-LIBCALL-NEXT: por %xmm2, %xmm7 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm7 
+; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,5,5,5,5] +; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,3] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; CHECK-LIBCALL-NEXT: pand %xmm2, %xmm0 +; CHECK-LIBCALL-NEXT: por %xmm3, %xmm0 +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm0 +; CHECK-LIBCALL-NEXT: por %xmm5, %xmm0 +; CHECK-LIBCALL-NEXT: pand %xmm2, %xmm7 +; CHECK-LIBCALL-NEXT: por %xmm3, %xmm7 +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm7 ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm7 ; CHECK-LIBCALL-NEXT: movdqu %xmm7, 0 -; CHECK-LIBCALL-NEXT: movdqu %xmm4, 32 +; CHECK-LIBCALL-NEXT: movdqu %xmm0, 32 ; CHECK-LIBCALL-NEXT: movdqu %xmm6, 48 -; CHECK-LIBCALL-NEXT: movdqu %xmm0, 16 +; CHECK-LIBCALL-NEXT: movdqu %xmm1, 16 ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: pr63114: @@ -2178,8 +2178,8 @@ define void @pr63114() { ; ; CHECK-I686-LABEL: pr63114: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movdqu (%eax), %xmm6 -; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,3,4,5,6,7] +; CHECK-I686-NEXT: movdqu (%eax), %xmm4 +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,3,4,5,6,7] ; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-I686-NEXT: pand %xmm1, %xmm0 @@ -2187,28 +2187,28 @@ define void @pr63114() { ; CHECK-I686-NEXT: por %xmm2, %xmm0 ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] ; CHECK-I686-NEXT: pand %xmm3, %xmm0 -; CHECK-I686-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] -; CHECK-I686-NEXT: por %xmm4, %xmm0 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,7] -; CHECK-I686-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; CHECK-I686-NEXT: pand %xmm1, %xmm5 -; CHECK-I686-NEXT: por %xmm2, %xmm5 -; CHECK-I686-NEXT: pand %xmm3, %xmm5 -; CHECK-I686-NEXT: por %xmm4, %xmm5 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,5,5,5,5] -; CHECK-I686-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3,0,3] -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] +; CHECK-I686-NEXT: por %xmm5, %xmm0 +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,7,7] +; CHECK-I686-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; CHECK-I686-NEXT: pand %xmm1, %xmm6 ; CHECK-I686-NEXT: por %xmm2, %xmm6 ; CHECK-I686-NEXT: pand %xmm3, %xmm6 -; CHECK-I686-NEXT: por %xmm4, %xmm6 +; CHECK-I686-NEXT: por %xmm5, %xmm6 +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3,0,3] +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: pand %xmm1, %xmm4 +; CHECK-I686-NEXT: por %xmm2, %xmm4 +; CHECK-I686-NEXT: pand %xmm3, %xmm4 +; CHECK-I686-NEXT: por %xmm5, %xmm4 ; CHECK-I686-NEXT: pand %xmm1, %xmm7 ; CHECK-I686-NEXT: por %xmm2, %xmm7 ; CHECK-I686-NEXT: pand %xmm3, %xmm7 -; CHECK-I686-NEXT: por %xmm4, %xmm7 +; CHECK-I686-NEXT: por %xmm5, %xmm7 ; CHECK-I686-NEXT: movdqu %xmm7, 0 -; CHECK-I686-NEXT: movdqu %xmm6, 32 -; CHECK-I686-NEXT: movdqu %xmm5, 48 +; CHECK-I686-NEXT: movdqu %xmm4, 32 +; CHECK-I686-NEXT: movdqu %xmm6, 48 ; CHECK-I686-NEXT: movdqu %xmm0, 16 ; CHECK-I686-NEXT: retl %1 = load <24 x half>, ptr poison, align 2 diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll index d79aff5544b69d..8855fe7c9835ce 100644 --- 
a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -627,28 +627,28 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi ret <4 x i1> %res } define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef1_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: psrld %xmm3, %xmm4 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: psrld %xmm3, %xmm5 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: psrld %xmm3, %xmm4 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; X86-SSE2-NEXT: psrld %xmm1, %xmm2 -; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] -; X86-SSE2-NEXT: andps %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm1, %xmm1 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; X86-SSE2-NEXT: retl +; SSE2-LABEL: vec_4xi32_nonsplat_undef1_eq: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrld %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: psrld %xmm2, %xmm5 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrld %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm1, %xmm3 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: vec_4xi32_nonsplat_undef1_eq: ; AVX2: # %bb.0: @@ -658,57 +658,34 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} -; -; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef1_eq: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1] -; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE2-NEXT: psrld %xmm2, %xmm4 -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] -; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 -; X64-SSE2-NEXT: psrld %xmm2, %xmm5 -; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE2-NEXT: psrld %xmm2, %xmm4 -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; X64-SSE2-NEXT: psrld %xmm1, %xmm3 -; X64-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; X64-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] -; X64-SSE2-NEXT: andps %xmm5, %xmm0 -; X64-SSE2-NEXT: pxor %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqd 
%xmm1, %xmm0 -; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, ret <4 x i1> %res } define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <1,1,u,1> -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: psrld %xmm3, %xmm4 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: psrld %xmm3, %xmm5 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: psrld %xmm3, %xmm4 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; X86-SSE2-NEXT: psrld %xmm1, %xmm2 -; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] -; X86-SSE2-NEXT: andps %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm1, %xmm1 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; X86-SSE2-NEXT: retl +; SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = <1,1,u,1> +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrld %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: psrld %xmm2, %xmm5 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrld %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm1, %xmm3 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; AVX2: # %bb.0: @@ -718,29 +695,6 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} -; -; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <1,1,u,1> -; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE2-NEXT: psrld %xmm2, %xmm4 -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] -; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 -; X64-SSE2-NEXT: psrld %xmm2, %xmm5 -; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE2-NEXT: psrld %xmm2, %xmm4 -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; X64-SSE2-NEXT: psrld %xmm1, %xmm3 -; X64-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; X64-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] -; X64-SSE2-NEXT: andps %xmm5, %xmm0 -; X64-SSE2-NEXT: pxor %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, diff --git 
a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll index 1a2aac657d30fb..aa59244af060f7 100644 --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -346,16 +346,16 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind { ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl $1, %eax -; X86-BMI1-NEXT: xorl %esi, %esi ; X86-BMI1-NEXT: xorl %edx, %edx -; X86-BMI1-NEXT: shldl %cl, %eax, %edx +; X86-BMI1-NEXT: xorl %esi, %esi +; X86-BMI1-NEXT: shldl %cl, %eax, %esi ; X86-BMI1-NEXT: shll %cl, %eax ; X86-BMI1-NEXT: testb $32, %cl -; X86-BMI1-NEXT: cmovnel %eax, %edx -; X86-BMI1-NEXT: cmovnel %esi, %eax -; X86-BMI1-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-BMI1-NEXT: cmovnel %eax, %esi +; X86-BMI1-NEXT: cmovnel %edx, %eax +; X86-BMI1-NEXT: andl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: orl %edx, %eax +; X86-BMI1-NEXT: orl %esi, %eax ; X86-BMI1-NEXT: sete %al ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: retl @@ -364,17 +364,17 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind { ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl $1, %edx +; X86-BMI2-NEXT: movl $1, %eax +; X86-BMI2-NEXT: xorl %edx, %edx ; X86-BMI2-NEXT: xorl %esi, %esi -; X86-BMI2-NEXT: xorl %eax, %eax -; X86-BMI2-NEXT: shldl %cl, %edx, %eax -; X86-BMI2-NEXT: shlxl %ecx, %edx, %edx +; X86-BMI2-NEXT: shldl %cl, %eax, %esi +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: cmovnel %eax, %esi ; X86-BMI2-NEXT: cmovnel %edx, %eax -; X86-BMI2-NEXT: cmovnel %esi, %edx +; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %edx -; X86-BMI2-NEXT: orl %eax, %edx +; X86-BMI2-NEXT: orl %esi, %eax ; X86-BMI2-NEXT: sete %al ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll index 93049f9987a5e9..267647f8b46c85 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -280,27 +280,27 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; 
X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -856,19 +856,19 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm1, %xmm2 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 ; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -1640,27 +1640,27 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2029,27 +2029,27 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2169,27 +2169,27 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll index 47bb0957f3fbb6..ad8c64b54c0c46 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -282,27 +282,27 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: 
pcmpgtb %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -860,19 +860,19 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm1, %xmm2 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 ; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -1644,27 +1644,27 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 +; 
X86-SSE2-NEXT: psrld $16, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2033,27 +2033,27 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2173,27 +2173,27 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; 
X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll index 5fde9bd5566b40..3ac36089a20e86 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -147,25 +147,25 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE2-NEXT: pxor %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm3, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm3 -; X86-SSE2-NEXT: por %xmm1, %xmm3 -; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: test_reduce_v4i32: @@ -672,31 +672,31 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE2-NEXT: pxor %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; X86-SSE2-NEXT: pand %xmm3, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm3 -; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: pxor %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm3 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm3, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; 
X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; X86-SSE2-NEXT: pand %xmm3, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm3 -; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 ; X86-SSE2-NEXT: movd %xmm3, %eax ; X86-SSE2-NEXT: retl ; @@ -1450,40 +1450,40 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm0 ; X86-SSE2-NEXT: pandn %xmm2, %xmm5 ; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm0 -; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 ; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pxor %xmm1, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: test_reduce_v16i32: @@ -1529,14 +1529,14 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE2-LABEL: test_reduce_v16i32: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X64-SSE2-NEXT: pxor %xmm4, %xmm6 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm5 +; 
X64-SSE2-NEXT: movdqa %xmm2, %xmm5 ; X64-SSE2-NEXT: pxor %xmm4, %xmm5 -; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; X64-SSE2-NEXT: pand %xmm5, %xmm0 -; X64-SSE2-NEXT: pandn %xmm2, %xmm5 -; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; X64-SSE2-NEXT: pand %xmm6, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm6 +; X64-SSE2-NEXT: por %xmm0, %xmm6 ; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -1547,12 +1547,12 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE2-NEXT: por %xmm1, %xmm2 ; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm1 ; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm5 +; X64-SSE2-NEXT: pand %xmm1, %xmm6 ; X64-SSE2-NEXT: pandn %xmm2, %xmm1 -; X64-SSE2-NEXT: por %xmm5, %xmm1 +; X64-SSE2-NEXT: por %xmm6, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll index 699dce75e505c7..7c0c4c0b5b9e3a 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -149,24 +149,24 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm2 ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: pxor %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm4, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: retl ; @@ -448,23 +448,23 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v4i64: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] ; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE42-NEXT: pxor %xmm2, %xmm4 +; X86-SSE42-NEXT: pxor %xmm3, %xmm4 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; 
X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm3, %xmm2 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm2 -; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X86-SSE42-NEXT: movd %xmm3, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm2, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v4i64: @@ -618,31 +618,31 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE2-NEXT: pxor %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; X86-SSE2-NEXT: pand %xmm3, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm3 -; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pxor %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm3 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm3, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: pxor %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm3, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: retl ; @@ -1105,32 +1105,32 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v8i64: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] ; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE42-NEXT: pxor %xmm5, %xmm6 +; X86-SSE42-NEXT: pxor %xmm4, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, 
%xmm3 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE42-NEXT: pxor %xmm5, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE42-NEXT: pxor %xmm4, %xmm1 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; X86-SSE42-NEXT: movapd %xmm2, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 ; X86-SSE42-NEXT: movapd %xmm3, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm5 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5 -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm4 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; X86-SSE42-NEXT: movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx @@ -1138,26 +1138,26 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: ## xmm3 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm5, %xmm6 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm5, %xmm2 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm5 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx @@ -1254,32 +1254,32 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v8i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm5 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] ; X64-SSE42-NEXT: movdqa %xmm1, 
%xmm6 -; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pxor %xmm5, %xmm6 ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movdqa %xmm5, %xmm1 -; X64-SSE42-NEXT: pxor %xmm4, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE42-NEXT: pxor %xmm5, %xmm1 ; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; X64-SSE42-NEXT: movapd %xmm2, %xmm1 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm1 +; X64-SSE42-NEXT: xorpd %xmm5, %xmm1 ; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 -; X64-SSE42-NEXT: pxor %xmm1, %xmm4 -; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; X64-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; X64-SSE42-NEXT: movq %xmm1, %rax ; X64-SSE42-NEXT: retq @@ -1368,39 +1368,39 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm1 ; X86-SSE2-NEXT: pandn %xmm3, %xmm5 ; X86-SSE2-NEXT: por %xmm1, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pandn %xmm5, %xmm1 +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm0 -; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; 
X86-SSE2-NEXT: pxor %xmm0, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 ; X86-SSE2-NEXT: pandn %xmm0, %xmm4 -; X86-SSE2-NEXT: por %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm3, %xmm4 ; X86-SSE2-NEXT: movd %xmm4, %eax ; X86-SSE2-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll index a026757a0264d6..e1d2f9c3438550 100644 --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -546,26 +546,27 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; ; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32: ; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5 -; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 +; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1] -; SSSE3-FAST-NEXT: addps %xmm5, %xmm0 -; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: addps %xmm5, %xmm4 +; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 +; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0 +; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1 -; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1 -; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-FAST-NEXT: addps %xmm1, %xmm2 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-FAST-NEXT: addps %xmm0, %xmm1 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-FAST-NEXT: addps %xmm2, %xmm3 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSSE3-FAST-NEXT: addps %xmm1, %xmm3 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32: diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index 9f58ed08433485..b6336935e60775 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -14,21 +14,21 @@ define i64 @foo(i64 %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %ebx, %eax +; X86-NOBMI-NEXT: movl %edx, %ebx +; X86-NOBMI-NEXT: movl %ebp, %eax ; X86-NOBMI-NEXT: mull %esi ; X86-NOBMI-NEXT: movl %edx, %esi -; X86-NOBMI-NEXT: movl %eax, %ebx -; X86-NOBMI-NEXT: addl %ebp, %ebx +; X86-NOBMI-NEXT: movl %eax, %ebp +; X86-NOBMI-NEXT: addl %ebx, %ebp ; X86-NOBMI-NEXT: adcl $0, %esi ; X86-NOBMI-NEXT: movl 
%edi, %eax ; X86-NOBMI-NEXT: mull %ecx -; X86-NOBMI-NEXT: addl %ebx, %eax +; X86-NOBMI-NEXT: addl %ebp, %eax ; X86-NOBMI-NEXT: adcl %edx, %esi ; X86-NOBMI-NEXT: setb %al ; X86-NOBMI-NEXT: movzbl %al, %edi @@ -50,22 +50,22 @@ define i64 @foo(i64 %x, i64 %y) nounwind { ; X86-BMI-NEXT: pushl %esi ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: mulxl %edi, %ebx, %ebx +; X86-BMI-NEXT: mulxl %esi, %ebx, %ebx ; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: mulxl %edi, %edi, %ebp -; X86-BMI-NEXT: addl %ebx, %edi +; X86-BMI-NEXT: mulxl %esi, %esi, %ebp +; X86-BMI-NEXT: addl %ebx, %esi ; X86-BMI-NEXT: adcl $0, %ebp ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: mulxl %esi, %ecx, %ebx -; X86-BMI-NEXT: addl %edi, %ecx +; X86-BMI-NEXT: mulxl %edi, %ecx, %ebx +; X86-BMI-NEXT: addl %esi, %ecx ; X86-BMI-NEXT: adcl %ebp, %ebx ; X86-BMI-NEXT: setb %cl ; X86-BMI-NEXT: movzbl %cl, %ecx ; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: mulxl %esi, %eax, %edx +; X86-BMI-NEXT: mulxl %edi, %eax, %edx ; X86-BMI-NEXT: addl %ebx, %eax ; X86-BMI-NEXT: adcl %ecx, %edx ; X86-BMI-NEXT: popl %esi @@ -105,7 +105,7 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: subl $20, %esp +; X86-NOBMI-NEXT: subl $24, %esp ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: orl %ecx, %eax @@ -127,38 +127,36 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %ebx, %eax ; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebx -; X86-NOBMI-NEXT: movl %eax, %esi -; X86-NOBMI-NEXT: addl %ebp, %esi -; X86-NOBMI-NEXT: adcl $0, %ebx +; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %eax, %ebx +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl $0, %ebp ; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: mull %edx -; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: movl %eax, %edi -; X86-NOBMI-NEXT: addl %esi, %edi -; X86-NOBMI-NEXT: adcl %ebx, %ebp +; X86-NOBMI-NEXT: addl %ebx, %edi +; X86-NOBMI-NEXT: adcl %ebp, %esi ; X86-NOBMI-NEXT: setb %bl ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOBMI-NEXT: addl %ebp, %eax +; X86-NOBMI-NEXT: addl %esi, %eax ; X86-NOBMI-NEXT: movzbl %bl, %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NOBMI-NEXT: adcl %esi, %edx -; X86-NOBMI-NEXT: movl %ecx, %ebx -; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NOBMI-NEXT: 
adcl $0, %eax ; X86-NOBMI-NEXT: adcl $0, %edx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8) -; X86-NOBMI-NEXT: movl %ebx, %ecx -; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8) +; X86-NOBMI-NEXT: movl %ebx, (%esi,%ecx,8) +; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ecx,8) ; X86-NOBMI-NEXT: addl $1, %ecx ; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload ; X86-NOBMI-NEXT: adcl $0, %edi @@ -171,7 +169,7 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: .LBB1_3: # %for.end ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx -; X86-NOBMI-NEXT: addl $20, %esp +; X86-NOBMI-NEXT: addl $24, %esp ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: popl %ebx @@ -198,44 +196,42 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: .LBB1_2: # %for.body ; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx -; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi -; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: mulxl %eax, %edx, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI-NEXT: movl (%ecx,%ebx,8), %eax +; X86-BMI-NEXT: movl 4(%ecx,%ebx,8), %esi +; X86-BMI-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl %eax, %edx +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI-NEXT: mulxl %ecx, %edx, %ebp ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %eax, %esi, %eax -; X86-BMI-NEXT: addl %edi, %esi -; X86-BMI-NEXT: adcl $0, %eax -; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp -; X86-BMI-NEXT: addl %esi, %edi -; X86-BMI-NEXT: adcl %eax, %ebp -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax +; X86-BMI-NEXT: mulxl %ecx, %edi, %ecx +; X86-BMI-NEXT: addl %ebp, %edi +; X86-BMI-NEXT: adcl $0, %ecx +; X86-BMI-NEXT: movl %eax, %edx +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: mulxl %eax, %ebp, %esi +; X86-BMI-NEXT: addl %edi, %ebp +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-BMI-NEXT: adcl %ecx, %esi +; X86-BMI-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-BMI-NEXT: mulxl %eax, %ecx, %eax ; X86-BMI-NEXT: setb %dl -; X86-BMI-NEXT: addl %ebp, %ecx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: addl %esi, %ecx ; X86-BMI-NEXT: movzbl %dl, %edx ; X86-BMI-NEXT: adcl %edx, %eax -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-BMI-NEXT: adcl (%esp), %edi # 4-byte Folded Reload +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-BMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-BMI-NEXT: adcl $0, %ecx -; X86-BMI-NEXT: adcl $0, %edx -; 
X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-BMI-NEXT: adcl $0, %eax ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8) -; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8) -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-BMI-NEXT: movl %esi, (%edx,%ebx,8) +; X86-BMI-NEXT: movl %ebp, 4(%edx,%ebx,8) +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI-NEXT: addl $1, %ebx ; X86-BMI-NEXT: adcl $0, %ebp ; X86-BMI-NEXT: movl %ebx, %edx @@ -243,7 +239,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: movl %ebp, %esi ; X86-BMI-NEXT: xorl %edi, %esi ; X86-BMI-NEXT: orl %edx, %esi -; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-BMI-NEXT: jne .LBB1_2 ; X86-BMI-NEXT: .LBB1_3: # %for.end ; X86-BMI-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll index 717f52f198ee88..9e132e5a04ad24 100644 --- a/llvm/test/CodeGen/X86/i128-sdiv.ll +++ b/llvm/test/CodeGen/X86/i128-sdiv.ll @@ -12,23 +12,23 @@ define i128 @test1(i128 %x) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shrl $30, %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $30, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %edx, %edi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: shrdl $2, %ecx, %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: sarl $2, %esi +; X86-NEXT: shrdl $2, %ecx, %esi +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $2, %edx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll index 53b70fa38958b5..63df83a0d88344 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -161,22 +161,22 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind { ; X86-LABEL: ne_and_with_dom_abs: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movswl %cx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movswl %dx, %eax ; X86-NEXT: sarl $15, %eax -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: xorl $12312, %eax # imm = 0x3018 ; X86-NEXT: movzwl %ax, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpw $64, %cx -; X86-NEXT: setne %cl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpw $64, %dx +; X86-NEXT: setne %dl ; X86-NEXT: cmpl $2345, %esi # imm = 0x929 ; X86-NEXT: jae .LBB3_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movb %dl, %cl +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB3_2: ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll index 
ec2323ac2250c7..96a077026cf37e 100644 --- a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll +++ b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll @@ -83,21 +83,21 @@ define i1 @is_snan_f80(x86_fp80 %x) { ; CHECK-32-NEXT: .cfi_def_cfa_offset 12 ; CHECK-32-NEXT: .cfi_offset %esi, -12 ; CHECK-32-NEXT: .cfi_offset %ebx, -8 -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: andl $32767, %eax # imm = 0x7FFF -; CHECK-32-NEXT: xorl %ecx, %ecx -; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF +; CHECK-32-NEXT: xorl %edx, %edx +; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; CHECK-32-NEXT: movl $-2147483648, %esi # imm = 0x80000000 -; CHECK-32-NEXT: sbbl %edx, %esi -; CHECK-32-NEXT: movl $32767, %esi # imm = 0x7FFF ; CHECK-32-NEXT: sbbl %eax, %esi +; CHECK-32-NEXT: movl $32767, %esi # imm = 0x7FFF +; CHECK-32-NEXT: sbbl %ecx, %esi ; CHECK-32-NEXT: movl $0, %esi ; CHECK-32-NEXT: sbbl %esi, %esi ; CHECK-32-NEXT: setl %bl -; CHECK-32-NEXT: cmpl $-1073741824, %edx # imm = 0xC0000000 -; CHECK-32-NEXT: sbbl $32767, %eax # imm = 0x7FFF -; CHECK-32-NEXT: sbbl %ecx, %ecx +; CHECK-32-NEXT: cmpl $-1073741824, %eax # imm = 0xC0000000 +; CHECK-32-NEXT: sbbl $32767, %ecx # imm = 0x7FFF +; CHECK-32-NEXT: sbbl %edx, %edx ; CHECK-32-NEXT: setl %al ; CHECK-32-NEXT: andb %bl, %al ; CHECK-32-NEXT: popl %esi @@ -363,22 +363,22 @@ define i1 @is_posnormal_f80(x86_fp80 %x) { ; CHECK-32-NEXT: pushl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-32-NEXT: .cfi_offset %esi, -8 -; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; CHECK-32-NEXT: movswl %dx, %ecx -; CHECK-32-NEXT: sarl $15, %ecx +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: movswl %cx, %edx +; CHECK-32-NEXT: sarl $15, %edx ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF -; CHECK-32-NEXT: decl %edx -; CHECK-32-NEXT: movzwl %dx, %edx +; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF +; CHECK-32-NEXT: decl %ecx +; CHECK-32-NEXT: movzwl %cx, %ecx ; CHECK-32-NEXT: xorl %esi, %esi -; CHECK-32-NEXT: cmpl $32766, %edx # imm = 0x7FFE +; CHECK-32-NEXT: cmpl $32766, %ecx # imm = 0x7FFE ; CHECK-32-NEXT: sbbl %esi, %esi -; CHECK-32-NEXT: setb %dl -; CHECK-32-NEXT: testl %ecx, %ecx -; CHECK-32-NEXT: setns %cl +; CHECK-32-NEXT: setb %cl +; CHECK-32-NEXT: testl %edx, %edx +; CHECK-32-NEXT: setns %dl ; CHECK-32-NEXT: shrl $31, %eax -; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: andb %dl, %al +; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: # kill: def $al killed $al killed $eax ; CHECK-32-NEXT: popl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 4 @@ -411,22 +411,22 @@ define i1 @is_negnormal_f80(x86_fp80 %x) { ; CHECK-32-NEXT: pushl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-32-NEXT: .cfi_offset %esi, -8 -; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; CHECK-32-NEXT: movswl %dx, %ecx -; CHECK-32-NEXT: sarl $15, %ecx +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: movswl %cx, %edx +; CHECK-32-NEXT: sarl $15, %edx ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF -; CHECK-32-NEXT: decl %edx -; CHECK-32-NEXT: movzwl %dx, %edx +; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF +; CHECK-32-NEXT: decl %ecx +; CHECK-32-NEXT: movzwl %cx, %ecx ; CHECK-32-NEXT: xorl %esi, %esi -; CHECK-32-NEXT: cmpl $32766, %edx # imm = 0x7FFE +; CHECK-32-NEXT: cmpl $32766, %ecx # 
imm = 0x7FFE ; CHECK-32-NEXT: sbbl %esi, %esi -; CHECK-32-NEXT: setb %dl -; CHECK-32-NEXT: testl %ecx, %ecx -; CHECK-32-NEXT: sets %cl +; CHECK-32-NEXT: setb %cl +; CHECK-32-NEXT: testl %edx, %edx +; CHECK-32-NEXT: sets %dl ; CHECK-32-NEXT: shrl $31, %eax -; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: andb %dl, %al +; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: # kill: def $al killed $al killed $eax ; CHECK-32-NEXT: popl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 4 @@ -459,19 +459,19 @@ define i1 @is_subnormal_f80(x86_fp80 %x) { ; CHECK-32-NEXT: pushl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-32-NEXT: .cfi_offset %esi, -8 -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: andl $32767, %eax # imm = 0x7FFF -; CHECK-32-NEXT: xorl %edx, %edx -; CHECK-32-NEXT: addl $-1, %esi +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF +; CHECK-32-NEXT: xorl %esi, %esi +; CHECK-32-NEXT: addl $-1, %eax ; CHECK-32-NEXT: adcl $-1, %ecx -; CHECK-32-NEXT: adcl $-1, %eax ; CHECK-32-NEXT: adcl $-1, %edx -; CHECK-32-NEXT: cmpl $-1, %esi +; CHECK-32-NEXT: adcl $-1, %esi +; CHECK-32-NEXT: cmpl $-1, %eax ; CHECK-32-NEXT: sbbl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-32-NEXT: sbbl $0, %eax ; CHECK-32-NEXT: sbbl $0, %edx +; CHECK-32-NEXT: sbbl $0, %esi ; CHECK-32-NEXT: setb %al ; CHECK-32-NEXT: popl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 4 @@ -546,18 +546,18 @@ define i1 @is_negsubnormal_f80(x86_fp80 %x) { ; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; CHECK-32-NEXT: movswl %cx, %eax ; CHECK-32-NEXT: sarl $15, %eax +; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF -; CHECK-32-NEXT: xorl %edx, %edx -; CHECK-32-NEXT: addl $-1, %esi -; CHECK-32-NEXT: adcl $-1, %edi +; CHECK-32-NEXT: xorl %edi, %edi +; CHECK-32-NEXT: addl $-1, %edx +; CHECK-32-NEXT: adcl $-1, %esi ; CHECK-32-NEXT: adcl $-1, %ecx -; CHECK-32-NEXT: adcl $-1, %edx -; CHECK-32-NEXT: cmpl $-1, %esi -; CHECK-32-NEXT: sbbl $2147483647, %edi # imm = 0x7FFFFFFF +; CHECK-32-NEXT: adcl $-1, %edi +; CHECK-32-NEXT: cmpl $-1, %edx +; CHECK-32-NEXT: sbbl $2147483647, %esi # imm = 0x7FFFFFFF ; CHECK-32-NEXT: sbbl $0, %ecx -; CHECK-32-NEXT: sbbl $0, %edx +; CHECK-32-NEXT: sbbl $0, %edi ; CHECK-32-NEXT: setb %cl ; CHECK-32-NEXT: testl %eax, %eax ; CHECK-32-NEXT: sets %al diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll index 2046d790cc57e4..394c469020904f 100644 --- a/llvm/test/CodeGen/X86/is_fpclass.ll +++ b/llvm/test/CodeGen/X86/is_fpclass.ll @@ -1519,24 +1519,24 @@ define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp { ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK-32-NEXT: andl %ecx, %edx ; CHECK-32-NEXT: cmpl $2139095041, %edx # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dh -; CHECK-32-NEXT: shlb $2, %dh +; CHECK-32-NEXT: setge %dl +; CHECK-32-NEXT: shlb $2, %dl ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-32-NEXT: andl %ecx, %esi ; CHECK-32-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dl -; CHECK-32-NEXT: shlb $3, %dl -; CHECK-32-NEXT: orb %dh, %dl +; CHECK-32-NEXT: setge %dh +; CHECK-32-NEXT: shlb $3, %dh +; CHECK-32-NEXT: orb %dl, %dh ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-32-NEXT: andl %ecx, %esi ; 
CHECK-32-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dh +; CHECK-32-NEXT: setge %dl ; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %ecx ; CHECK-32-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 ; CHECK-32-NEXT: setge %cl ; CHECK-32-NEXT: addb %cl, %cl -; CHECK-32-NEXT: orb %dh, %cl ; CHECK-32-NEXT: orb %dl, %cl +; CHECK-32-NEXT: orb %dh, %cl ; CHECK-32-NEXT: movb %cl, (%eax) ; CHECK-32-NEXT: popl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll index 9eaa65442a727f..1c4d384a2d4ff2 100644 --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -325,15 +325,15 @@ define i32 @func_q(i32 %a0, i32 %a1, i32 %a2) { define ptr @func_r(ptr %base, ptr nocapture %offset, i32 %size) nounwind { ; CHECK-LABEL: func_r: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl (%edx), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl (%ecx), %edx ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: subl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: subl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: jl .LBB15_2 ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %ecx, (%edx) -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: .LBB15_2: # %return ; CHECK-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll index 9741f6f0a5e2d9..cd623772f954d6 100644 --- a/llvm/test/CodeGen/X86/known-bits.ll +++ b/llvm/test/CodeGen/X86/known-bits.ll @@ -7,12 +7,12 @@ define void @knownbits_zext_in_reg(ptr) nounwind { ; X86: # %bb.0: # %BB ; X86-NEXT: pushl %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl (%eax), %ecx -; X86-NEXT: imull $101, %ecx, %eax -; X86-NEXT: shrl $14, %eax -; X86-NEXT: imull $177, %ecx, %edx +; X86-NEXT: movzbl (%eax), %eax +; X86-NEXT: imull $101, %eax, %ecx +; X86-NEXT: shrl $14, %ecx +; X86-NEXT: imull $177, %eax, %edx ; X86-NEXT: shrl $14, %edx -; X86-NEXT: movzbl %al, %ecx +; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB0_1: # %CF diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll index f4efacc1946cff..97078a63119d52 100644 --- a/llvm/test/CodeGen/X86/kshift.ll +++ b/llvm/test/CodeGen/X86/kshift.ll @@ -128,26 +128,26 @@ define i64 @kshiftl_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm4[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} ; KNL-NEXT: valignd {{.*#+}} zmm4 = zmm5[15],zmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; KNL-NEXT: kshiftlw $1, %k1, %k3 +; KNL-NEXT: kshiftlw $1, %k1, %k2 ; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; KNL-NEXT: vpcmpeqb %ymm2, %ymm5, %ymm5 ; KNL-NEXT: vextracti128 $1, %ymm5, %xmm6 ; KNL-NEXT: vpmovsxbd %xmm6, %zmm6 ; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1 ; KNL-NEXT: vpmovsxbd %xmm5, %zmm5 -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2 +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k3 ; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k3} +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k2} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 {%k4} ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: 
orl %eax, %ecx -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2} +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k3} ; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax @@ -404,48 +404,48 @@ define i32 @kshiftr_v32i1_1(<32 x i16> %x, <32 x i16> %y) { define i64 @kshiftr_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-LABEL: kshiftr_v64i1_1: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm3 -; KNL-NEXT: vextracti128 $1, %ymm3, %xmm4 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm4 ; KNL-NEXT: vpmovsxbd %xmm4, %zmm4 ; KNL-NEXT: vptestmd %zmm4, %zmm4, %k1 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm4 ; KNL-NEXT: vptestmd %zmm4, %zmm4, %k2 -; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k3 +; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k3 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} -; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} -; KNL-NEXT: valignd {{.*#+}} zmm4 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0] +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} +; KNL-NEXT: valignd {{.*#+}} zmm4 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm2[0] ; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} ; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] ; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm5[0] -; KNL-NEXT: kshiftrw $1, %k1, %k3 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm5 +; KNL-NEXT: valignd {{.*#+}} zmm2 = zmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm5[0] +; KNL-NEXT: kshiftrw $1, %k1, %k2 +; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm5 ; KNL-NEXT: vextracti128 $1, %ymm5, %xmm6 ; KNL-NEXT: vpmovsxbd %xmm6, %zmm6 ; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1 ; KNL-NEXT: vpmovsxbd %xmm5, %zmm5 -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2 +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k3 ; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm2 -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4 +; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm3 +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k3} +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k2} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: shll $16, %eax -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k4} +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k4} ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: orl %eax, %ecx ; KNL-NEXT: shlq $32, %rcx -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2} +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k3} ; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll index 44c57c54ba0233..bf6dd33197a3f2 100644 --- a/llvm/test/CodeGen/X86/ldexp.ll +++ b/llvm/test/CodeGen/X86/ldexp.ll @@ -465,8 +465,6 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; WIN32-NEXT: addl $-254, %eax ; WIN32-NEXT: movl %eax, 
%edx ; WIN32-NEXT: LBB3_33: -; WIN32-NEXT: fxch %st(3) -; WIN32-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; WIN32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: cmpl $381, %ebp # imm = 0x17D ; WIN32-NEXT: movl %ebp, %eax @@ -474,10 +472,10 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; WIN32-NEXT: # %bb.34: ; WIN32-NEXT: movl $381, %eax # imm = 0x17D ; WIN32-NEXT: LBB3_35: -; WIN32-NEXT: fld %st(2) -; WIN32-NEXT: fmul %st(1), %st ; WIN32-NEXT: fld %st(0) ; WIN32-NEXT: fmul %st(2), %st +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(3), %st ; WIN32-NEXT: leal -127(%ebp), %edi ; WIN32-NEXT: cmpl $255, %ebp ; WIN32-NEXT: jae LBB3_36 @@ -512,10 +510,10 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; WIN32-NEXT: # %bb.41: ; WIN32-NEXT: movl %eax, %ecx ; WIN32-NEXT: LBB3_42: -; WIN32-NEXT: fld %st(3) -; WIN32-NEXT: fmul %st(3), %st -; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fld %st(1) ; WIN32-NEXT: fmul %st(4), %st +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(5), %st ; WIN32-NEXT: jb LBB3_44 ; WIN32-NEXT: # %bb.43: ; WIN32-NEXT: fstp %st(0) @@ -529,9 +527,9 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; WIN32-NEXT: fstp %st(0) ; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: fldz -; WIN32-NEXT: fxch %st(4) +; WIN32-NEXT: fxch %st(2) ; WIN32-NEXT: LBB3_46: -; WIN32-NEXT: fstp %st(4) +; WIN32-NEXT: fstp %st(2) ; WIN32-NEXT: cmpl $127, %ebp ; WIN32-NEXT: flds {{[0-9]+}}(%esp) ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -545,6 +543,8 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; WIN32-NEXT: # %bb.49: ; WIN32-NEXT: movl $381, %ecx # imm = 0x17D ; WIN32-NEXT: LBB3_50: +; WIN32-NEXT: fxch %st(5) +; WIN32-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; WIN32-NEXT: addl $-254, %ecx ; WIN32-NEXT: leal -127(%esi), %eax ; WIN32-NEXT: cmpl $255, %esi @@ -552,7 +552,7 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; WIN32-NEXT: # %bb.51: ; WIN32-NEXT: movl %eax, %ecx ; WIN32-NEXT: LBB3_52: -; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fld %st(4) ; WIN32-NEXT: fmul %st(3), %st ; WIN32-NEXT: fmul %st, %st(3) ; WIN32-NEXT: jae LBB3_54 @@ -567,7 +567,7 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; WIN32-NEXT: # %bb.55: ; WIN32-NEXT: movl $-330, %eax # imm = 0xFEB6 ; WIN32-NEXT: LBB3_56: -; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fld %st(4) ; WIN32-NEXT: fmul %st(4), %st ; WIN32-NEXT: fmul %st, %st(4) ; WIN32-NEXT: cmpl $-228, %esi @@ -587,8 +587,10 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; WIN32-NEXT: fstp %st(3) ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(4) +; WIN32-NEXT: fxch %st(3) ; WIN32-NEXT: LBB3_61: -; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: fstp %st(4) ; WIN32-NEXT: cmpl $127, %esi ; WIN32-NEXT: jg LBB3_63 ; WIN32-NEXT: # %bb.62: @@ -614,12 +616,12 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: jg LBB3_69 ; WIN32-NEXT: # %bb.68: -; WIN32-NEXT: fstp %st(2) +; WIN32-NEXT: fstp %st(3) ; WIN32-NEXT: fldz -; WIN32-NEXT: fxch %st(3) ; WIN32-NEXT: fxch %st(2) +; WIN32-NEXT: fxch %st(3) ; WIN32-NEXT: LBB3_69: -; WIN32-NEXT: fstp %st(3) +; WIN32-NEXT: fstp %st(2) ; WIN32-NEXT: shll $23, %edi ; WIN32-NEXT: addl $1065353216, %edi # imm = 0x3F800000 ; WIN32-NEXT: movl %edi, {{[0-9]+}}(%esp) @@ -632,18 +634,17 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { 
; WIN32-NEXT: shll $23, %edx ; WIN32-NEXT: addl $1065353216, %edx # imm = 0x3F800000 ; WIN32-NEXT: movl %edx, {{[0-9]+}}(%esp) -; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: fxch %st(2) ; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) -; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: fxch %st(2) ; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) ; WIN32-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) -; WIN32-NEXT: fxch %st(3) +; WIN32-NEXT: fxch %st(2) ; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) ; WIN32-NEXT: fstps 12(%eax) -; WIN32-NEXT: fxch %st(2) -; WIN32-NEXT: fstps 8(%eax) ; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: fstps 8(%eax) ; WIN32-NEXT: fstps 4(%eax) ; WIN32-NEXT: fstps (%eax) ; WIN32-NEXT: addl $32, %esp @@ -830,22 +831,22 @@ define half @ldexp_f16(half %arg0, i32 %arg1) { ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $8, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, (%esp) -; WIN32-NEXT: cmpl $381, %edi # imm = 0x17D -; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: cmpl $381, %esi # imm = 0x17D +; WIN32-NEXT: movl %esi, %edi ; WIN32-NEXT: jl LBB6_2 ; WIN32-NEXT: # %bb.1: -; WIN32-NEXT: movl $381, %esi # imm = 0x17D +; WIN32-NEXT: movl $381, %edi # imm = 0x17D ; WIN32-NEXT: LBB6_2: -; WIN32-NEXT: addl $-254, %esi +; WIN32-NEXT: addl $-254, %edi ; WIN32-NEXT: calll ___gnu_h2f_ieee -; WIN32-NEXT: leal -127(%edi), %eax -; WIN32-NEXT: cmpl $255, %edi +; WIN32-NEXT: leal -127(%esi), %eax +; WIN32-NEXT: cmpl $255, %esi ; WIN32-NEXT: jae LBB6_4 ; WIN32-NEXT: # %bb.3: -; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: LBB6_4: ; WIN32-NEXT: flds __real@7f000000 ; WIN32-NEXT: fld %st(1) @@ -857,8 +858,8 @@ define half @ldexp_f16(half %arg0, i32 %arg1) { ; WIN32-NEXT: fldz ; WIN32-NEXT: LBB6_6: ; WIN32-NEXT: fstp %st(0) -; WIN32-NEXT: cmpl $-329, %edi # imm = 0xFEB7 -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: cmpl $-329, %esi # imm = 0xFEB7 +; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: jge LBB6_8 ; WIN32-NEXT: # %bb.7: ; WIN32-NEXT: movl $-330, %eax # imm = 0xFEB6 @@ -867,38 +868,38 @@ define half @ldexp_f16(half %arg0, i32 %arg1) { ; WIN32-NEXT: fld %st(2) ; WIN32-NEXT: fmul %st(1), %st ; WIN32-NEXT: fmul %st, %st(1) -; WIN32-NEXT: cmpl $-228, %edi +; WIN32-NEXT: cmpl $-228, %esi ; WIN32-NEXT: jb LBB6_9 ; WIN32-NEXT: # %bb.10: ; WIN32-NEXT: fstp %st(1) -; WIN32-NEXT: leal 102(%edi), %eax -; WIN32-NEXT: cmpl $-126, %edi +; WIN32-NEXT: leal 102(%esi), %eax +; WIN32-NEXT: cmpl $-126, %esi ; WIN32-NEXT: jge LBB6_12 ; WIN32-NEXT: jmp LBB6_13 ; WIN32-NEXT: LBB6_9: ; WIN32-NEXT: fstp %st(0) ; WIN32-NEXT: addl $204, %eax -; WIN32-NEXT: cmpl $-126, %edi +; WIN32-NEXT: cmpl $-126, %esi ; WIN32-NEXT: jl LBB6_13 ; WIN32-NEXT: LBB6_12: ; WIN32-NEXT: fstp %st(0) -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: fldz ; WIN32-NEXT: fxch %st(2) ; WIN32-NEXT: LBB6_13: ; WIN32-NEXT: fstp %st(2) -; WIN32-NEXT: cmpl $127, %edi +; WIN32-NEXT: cmpl $127, %esi ; WIN32-NEXT: jg LBB6_15 ; WIN32-NEXT: # %bb.14: ; WIN32-NEXT: fstp %st(0) -; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: fldz ; WIN32-NEXT: fxch %st(1) ; WIN32-NEXT: LBB6_15: ; WIN32-NEXT: fstp %st(1) -; WIN32-NEXT: shll $23, %esi -; WIN32-NEXT: addl $1065353216, %esi # imm = 0x3F800000 -; WIN32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN32-NEXT: shll $23, %edi +; WIN32-NEXT: addl $1065353216, %edi # imm = 0x3F800000 +; WIN32-NEXT: movl %edi, 
{{[0-9]+}}(%esp) ; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) ; WIN32-NEXT: fstps (%esp) ; WIN32-NEXT: calll ___gnu_f2h_ieee diff --git a/llvm/test/CodeGen/X86/legalize-shift-64.ll b/llvm/test/CodeGen/X86/legalize-shift-64.ll index 53208de7ea27e8..1436a7398b687a 100644 --- a/llvm/test/CodeGen/X86/legalize-shift-64.ll +++ b/llvm/test/CodeGen/X86/legalize-shift-64.ll @@ -92,16 +92,16 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl %ebx, %edi -; CHECK-NEXT: shll %cl, %edi -; CHECK-NEXT: shldl %cl, %ebx, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: shll %cl, %esi +; CHECK-NEXT: shldl %cl, %ebx, %edi ; CHECK-NEXT: testb $32, %cl ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: je .LBB4_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: movl %edi, %esi -; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: movl %esi, %edi +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: movb %ch, %cl @@ -115,8 +115,8 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) { ; CHECK-NEXT: .LBB4_4: ; CHECK-NEXT: movl %ebp, 12(%eax) ; CHECK-NEXT: movl %ebx, 8(%eax) -; CHECK-NEXT: movl %esi, 4(%eax) -; CHECK-NEXT: movl %edi, (%eax) +; CHECK-NEXT: movl %edi, 4(%eax) +; CHECK-NEXT: movl %esi, (%eax) ; CHECK-NEXT: popl %esi ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll index cf423227f23bca..b1112210186a46 100644 --- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll +++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll @@ -84,37 +84,35 @@ define <2 x i256> @test_srl(<2 x i256> %In) { ; X32-NEXT: .cfi_offset %edi, -16 ; X32-NEXT: .cfi_offset %ebx, -12 ; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %ebp, %esi -; X32-NEXT: shldl $28, %edx, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: shldl $28, %ebx, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: shldl $28, %ecx, %ebx -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: shldl $28, %edi, %esi -; X32-NEXT: shldl $28, %eax, %edi -; X32-NEXT: movl %eax, %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shldl $28, %eax, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %edi, %ecx +; X32-NEXT: shldl $28, %ebx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: shldl $28, %esi, %ebx +; X32-NEXT: shldl $28, %eax, %esi +; X32-NEXT: shldl $28, %edx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: shldl $28, %ebp, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: shldl $28, %eax, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shrdl $4, %eax, %ecx -; X32-NEXT: shrl $4, %ebp +; X32-NEXT: shrl $4, %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %ebp, 60(%eax) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, 56(%eax) -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, 52(%eax) -; X32-NEXT: movl 
%ebx, 48(%eax) +; X32-NEXT: movl %edi, 60(%eax) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 56(%eax) +; X32-NEXT: movl %ebx, 52(%eax) +; X32-NEXT: movl %esi, 48(%eax) +; X32-NEXT: movl (%esp), %esi # 4-byte Reload ; X32-NEXT: movl %esi, 44(%eax) -; X32-NEXT: movl %edi, 40(%eax) -; X32-NEXT: movl %edx, 36(%eax) +; X32-NEXT: movl %edx, 40(%eax) +; X32-NEXT: movl %ebp, 36(%eax) ; X32-NEXT: movl %ecx, 32(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shrl $31, %ecx @@ -180,37 +178,35 @@ define <2 x i256> @test_sra(<2 x i256> %In) { ; X32-NEXT: .cfi_offset %edi, -16 ; X32-NEXT: .cfi_offset %ebx, -12 ; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %ebp, %esi -; X32-NEXT: shldl $26, %edx, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: shldl $26, %ebx, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: shldl $26, %ecx, %ebx -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: shldl $26, %edi, %esi -; X32-NEXT: shldl $26, %eax, %edi -; X32-NEXT: movl %eax, %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shldl $26, %eax, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %edi, %ecx +; X32-NEXT: shldl $26, %ebx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: shldl $26, %esi, %ebx +; X32-NEXT: shldl $26, %eax, %esi +; X32-NEXT: shldl $26, %edx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: shldl $26, %ebp, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: shldl $26, %eax, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shrdl $6, %eax, %ecx -; X32-NEXT: sarl $6, %ebp +; X32-NEXT: sarl $6, %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %ebp, 60(%eax) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, 56(%eax) -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, 52(%eax) -; X32-NEXT: movl %ebx, 48(%eax) +; X32-NEXT: movl %edi, 60(%eax) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 56(%eax) +; X32-NEXT: movl %ebx, 52(%eax) +; X32-NEXT: movl %esi, 48(%eax) +; X32-NEXT: movl (%esp), %esi # 4-byte Reload ; X32-NEXT: movl %esi, 44(%eax) -; X32-NEXT: movl %edi, 40(%eax) -; X32-NEXT: movl %edx, 36(%eax) +; X32-NEXT: movl %edx, 40(%eax) +; X32-NEXT: movl %ebp, 36(%eax) ; X32-NEXT: movl %ecx, 32(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: sarl $31, %ecx diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll index 7f8115dc1ce389..37171b9c116f88 100644 --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -472,19 +472,19 @@ define i32 @load_i32_by_i8_bswap_store_in_between(ptr %arg, ptr %arg1) { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: .cfi_offset %esi, -8 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl (%eax), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movzbl (%ecx), %edx ; CHECK-NEXT: shll $24, %edx -; CHECK-NEXT: movzbl 1(%eax), %esi -; CHECK-NEXT: movl $0, (%ecx) +; CHECK-NEXT: movzbl 1(%ecx), 
%esi +; CHECK-NEXT: movl $0, (%eax) ; CHECK-NEXT: shll $16, %esi ; CHECK-NEXT: orl %edx, %esi -; CHECK-NEXT: movzbl 2(%eax), %ecx -; CHECK-NEXT: shll $8, %ecx -; CHECK-NEXT: orl %esi, %ecx -; CHECK-NEXT: movzbl 3(%eax), %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movzbl 2(%ecx), %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: orl %esi, %edx +; CHECK-NEXT: movzbl 3(%ecx), %eax +; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: popl %esi ; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll index 9f0c1ea1dc3f6f..817e94b603173d 100644 --- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll +++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -14,10 +14,10 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; GENERIC-NEXT: pushq %r14 ; GENERIC-NEXT: pushq %rbx ; GENERIC-NEXT: ## kill: def $ecx killed $ecx def $rcx -; GENERIC-NEXT: movl (%rdx), %r8d -; GENERIC-NEXT: movl 4(%rdx), %ebx +; GENERIC-NEXT: movl (%rdx), %r9d +; GENERIC-NEXT: movl 4(%rdx), %r15d ; GENERIC-NEXT: decl %ecx -; GENERIC-NEXT: leaq 20(%rdx), %r9 +; GENERIC-NEXT: leaq 20(%rdx), %r8 ; GENERIC-NEXT: movq _Te0@GOTPCREL(%rip), %rdi ; GENERIC-NEXT: movq _Te1@GOTPCREL(%rip), %rax ; GENERIC-NEXT: movq _Te3@GOTPCREL(%rip), %r10 @@ -25,61 +25,61 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; GENERIC-NEXT: .p2align 4, 0x90 ; GENERIC-NEXT: LBB0_1: ## %bb ; GENERIC-NEXT: ## =>This Inner Loop Header: Depth=1 -; GENERIC-NEXT: movzbl %r8b, %r14d -; GENERIC-NEXT: ## kill: def $r8d killed $r8d def $r8 -; GENERIC-NEXT: shrl $24, %r8d -; GENERIC-NEXT: movl %ebx, %ebp -; GENERIC-NEXT: shrl $16, %ebp -; GENERIC-NEXT: movzbl %bpl, %r15d -; GENERIC-NEXT: movl (%rax,%r15,4), %ebp -; GENERIC-NEXT: xorl (%rdi,%r8,4), %ebp -; GENERIC-NEXT: xorl -12(%r9), %ebp -; GENERIC-NEXT: shrl $24, %ebx +; GENERIC-NEXT: movzbl %r9b, %r14d +; GENERIC-NEXT: ## kill: def $r9d killed $r9d def $r9 +; GENERIC-NEXT: shrl $24, %r9d +; GENERIC-NEXT: movl %r15d, %ebx +; GENERIC-NEXT: shrl $16, %ebx +; GENERIC-NEXT: movzbl %bl, %ebx +; GENERIC-NEXT: movl (%rax,%rbx,4), %ebx +; GENERIC-NEXT: xorl (%rdi,%r9,4), %ebx +; GENERIC-NEXT: xorl -12(%r8), %ebx +; GENERIC-NEXT: shrl $24, %r15d ; GENERIC-NEXT: movl (%r10,%r14,4), %r14d -; GENERIC-NEXT: xorl (%rdi,%rbx,4), %r14d -; GENERIC-NEXT: xorl -8(%r9), %r14d -; GENERIC-NEXT: movl %ebp, %r8d -; GENERIC-NEXT: shrl $24, %r8d -; GENERIC-NEXT: movl (%rdi,%r8,4), %r8d +; GENERIC-NEXT: xorl (%rdi,%r15,4), %r14d +; GENERIC-NEXT: xorl -8(%r8), %r14d +; GENERIC-NEXT: movl %ebx, %r9d +; GENERIC-NEXT: shrl $24, %r9d +; GENERIC-NEXT: movl (%rdi,%r9,4), %r9d ; GENERIC-NEXT: subq $1, %r11 ; GENERIC-NEXT: jb LBB0_3 ; GENERIC-NEXT: ## %bb.2: ## %bb1 ; GENERIC-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; GENERIC-NEXT: movl %r14d, %ebx -; GENERIC-NEXT: shrl $16, %ebx -; GENERIC-NEXT: movzbl %bl, %ebx -; GENERIC-NEXT: xorl (%rax,%rbx,4), %r8d -; GENERIC-NEXT: xorl -4(%r9), %r8d +; GENERIC-NEXT: movl %r14d, %ebp +; GENERIC-NEXT: shrl $16, %ebp +; GENERIC-NEXT: movzbl %bpl, %r15d +; GENERIC-NEXT: xorl (%rax,%r15,4), %r9d +; GENERIC-NEXT: xorl -4(%r8), %r9d ; GENERIC-NEXT: shrl $24, %r14d -; GENERIC-NEXT: movzbl %bpl, %ebx -; GENERIC-NEXT: movl (%r10,%rbx,4), %ebx -; GENERIC-NEXT: xorl (%rdi,%r14,4), %ebx -; GENERIC-NEXT: xorl (%r9), %ebx -; GENERIC-NEXT: addq $16, %r9 +; GENERIC-NEXT: movzbl %bl, %ebx +; GENERIC-NEXT: movl (%r10,%rbx,4), %r15d +; GENERIC-NEXT: xorl (%rdi,%r14,4), %r15d +; 
GENERIC-NEXT: xorl (%r8), %r15d +; GENERIC-NEXT: addq $16, %r8 ; GENERIC-NEXT: jmp LBB0_1 ; GENERIC-NEXT: LBB0_3: ## %bb2 ; GENERIC-NEXT: shlq $4, %rcx -; GENERIC-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000 -; GENERIC-NEXT: movl %r14d, %r9d -; GENERIC-NEXT: shrl $16, %r9d -; GENERIC-NEXT: movzbl %r9b, %r9d -; GENERIC-NEXT: movzbl 2(%rax,%r9,4), %r9d -; GENERIC-NEXT: shll $16, %r9d -; GENERIC-NEXT: orl %r8d, %r9d -; GENERIC-NEXT: xorl 16(%rcx,%rdx), %r9d +; GENERIC-NEXT: andl $-16777216, %r9d ## imm = 0xFF000000 +; GENERIC-NEXT: movl %r14d, %r8d +; GENERIC-NEXT: shrl $16, %r8d +; GENERIC-NEXT: movzbl %r8b, %r8d +; GENERIC-NEXT: movzbl 2(%rax,%r8,4), %r8d +; GENERIC-NEXT: shll $16, %r8d +; GENERIC-NEXT: orl %r9d, %r8d +; GENERIC-NEXT: xorl 16(%rcx,%rdx), %r8d ; GENERIC-NEXT: shrl $8, %r14d ; GENERIC-NEXT: movzbl 3(%rdi,%r14,4), %edi ; GENERIC-NEXT: shll $24, %edi -; GENERIC-NEXT: movzbl %bpl, %r8d -; GENERIC-NEXT: movzbl 2(%rax,%r8,4), %eax +; GENERIC-NEXT: movzbl %bl, %r9d +; GENERIC-NEXT: movzbl 2(%rax,%r9,4), %eax ; GENERIC-NEXT: shll $16, %eax ; GENERIC-NEXT: orl %edi, %eax ; GENERIC-NEXT: xorl 20(%rcx,%rdx), %eax -; GENERIC-NEXT: movl %r9d, %ecx +; GENERIC-NEXT: movl %r8d, %ecx ; GENERIC-NEXT: shrl $24, %ecx ; GENERIC-NEXT: movb %cl, (%rsi) -; GENERIC-NEXT: shrl $16, %r9d -; GENERIC-NEXT: movb %r9b, 1(%rsi) +; GENERIC-NEXT: shrl $16, %r8d +; GENERIC-NEXT: movb %r8b, 1(%rsi) ; GENERIC-NEXT: movl %eax, %ecx ; GENERIC-NEXT: shrl $24, %ecx ; GENERIC-NEXT: movb %cl, 4(%rsi) @@ -101,8 +101,8 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; ATOM-NEXT: movl (%rdx), %r8d ; ATOM-NEXT: movl 4(%rdx), %r15d ; ATOM-NEXT: leaq 20(%rdx), %r9 -; ATOM-NEXT: movq _Te0@GOTPCREL(%rip), %rdi -; ATOM-NEXT: movq _Te1@GOTPCREL(%rip), %rax +; ATOM-NEXT: movq _Te0@GOTPCREL(%rip), %rax +; ATOM-NEXT: movq _Te1@GOTPCREL(%rip), %rdi ; ATOM-NEXT: movq _Te3@GOTPCREL(%rip), %r10 ; ATOM-NEXT: decl %ecx ; ATOM-NEXT: movq %rcx, %r11 @@ -116,16 +116,16 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; ATOM-NEXT: shrl $16, %ebx ; ATOM-NEXT: shrl $24, %r14d ; ATOM-NEXT: movzbl %bl, %ebx -; ATOM-NEXT: movl (%rax,%rbx,4), %ebx -; ATOM-NEXT: xorl (%rdi,%r14,4), %ebx +; ATOM-NEXT: movl (%rdi,%rbx,4), %ebx +; ATOM-NEXT: xorl (%rax,%r14,4), %ebx ; ATOM-NEXT: movl (%r10,%r8,4), %r14d ; ATOM-NEXT: xorl -12(%r9), %ebx -; ATOM-NEXT: xorl (%rdi,%r15,4), %r14d +; ATOM-NEXT: xorl (%rax,%r15,4), %r14d ; ATOM-NEXT: movl %ebx, %r8d ; ATOM-NEXT: xorl -8(%r9), %r14d ; ATOM-NEXT: shrl $24, %r8d ; ATOM-NEXT: subq $1, %r11 -; ATOM-NEXT: movl (%rdi,%r8,4), %r8d +; ATOM-NEXT: movl (%rax,%r8,4), %r8d ; ATOM-NEXT: jb LBB0_3 ; ATOM-NEXT: ## %bb.2: ## %bb1 ; ATOM-NEXT: ## in Loop: Header=BB0_1 Depth=1 @@ -134,9 +134,9 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; ATOM-NEXT: shrl $24, %r14d ; ATOM-NEXT: shrl $16, %ebp ; ATOM-NEXT: movzbl %bpl, %r15d -; ATOM-NEXT: xorl (%rax,%r15,4), %r8d +; ATOM-NEXT: xorl (%rdi,%r15,4), %r8d ; ATOM-NEXT: movl (%r10,%rbx,4), %r15d -; ATOM-NEXT: xorl (%rdi,%r14,4), %r15d +; ATOM-NEXT: xorl (%rax,%r14,4), %r15d ; ATOM-NEXT: xorl -4(%r9), %r8d ; ATOM-NEXT: xorl (%r9), %r15d ; ATOM-NEXT: addq $16, %r9 @@ -147,28 +147,28 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; ATOM-NEXT: shrl $8, %r14d ; ATOM-NEXT: shlq $4, %rcx ; ATOM-NEXT: shrl $16, %r9d -; ATOM-NEXT: movzbl 3(%rdi,%r14,4), %edi +; ATOM-NEXT: movzbl 3(%rax,%r14,4), %eax ; ATOM-NEXT: movzbl %r9b, %r9d -; 
ATOM-NEXT: shll $24, %edi -; ATOM-NEXT: movzbl 2(%rax,%r9,4), %r9d +; ATOM-NEXT: shll $24, %eax +; ATOM-NEXT: movzbl 2(%rdi,%r9,4), %r9d ; ATOM-NEXT: shll $16, %r9d ; ATOM-NEXT: orl %r8d, %r9d ; ATOM-NEXT: movzbl %bl, %r8d -; ATOM-NEXT: movzbl 2(%rax,%r8,4), %eax +; ATOM-NEXT: movzbl 2(%rdi,%r8,4), %edi ; ATOM-NEXT: xorl 16(%rcx,%rdx), %r9d -; ATOM-NEXT: shll $16, %eax -; ATOM-NEXT: orl %edi, %eax -; ATOM-NEXT: movl %r9d, %edi +; ATOM-NEXT: shll $16, %edi +; ATOM-NEXT: orl %eax, %edi +; ATOM-NEXT: movl %r9d, %eax ; ATOM-NEXT: shrl $16, %r9d -; ATOM-NEXT: xorl 20(%rcx,%rdx), %eax -; ATOM-NEXT: shrl $24, %edi -; ATOM-NEXT: movl %eax, %ecx -; ATOM-NEXT: shrl $16, %eax -; ATOM-NEXT: movb %dil, (%rsi) +; ATOM-NEXT: xorl 20(%rcx,%rdx), %edi +; ATOM-NEXT: shrl $24, %eax +; ATOM-NEXT: movb %al, (%rsi) +; ATOM-NEXT: movl %edi, %eax +; ATOM-NEXT: shrl $16, %edi ; ATOM-NEXT: movb %r9b, 1(%rsi) -; ATOM-NEXT: shrl $24, %ecx -; ATOM-NEXT: movb %cl, 4(%rsi) -; ATOM-NEXT: movb %al, 5(%rsi) +; ATOM-NEXT: shrl $24, %eax +; ATOM-NEXT: movb %al, 4(%rsi) +; ATOM-NEXT: movb %dil, 5(%rsi) ; ATOM-NEXT: popq %rbx ; ATOM-NEXT: popq %r14 ; ATOM-NEXT: popq %r15 diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll index c1cce6f5d8ca10..5a0c91e078bf2c 100644 --- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll +++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll @@ -168,23 +168,25 @@ define i1 @lshr_ctlz_undef_cmpne_zero_i64(i64 %in) nounwind { define <2 x i64> @lshr_ctlz_cmpeq_zero_v2i64(<2 x i64> %in) nounwind { ; X86-LABEL: lshr_ctlz_cmpeq_zero_v2i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: setne %cl ; X86-NEXT: negl %ecx -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: setne %dl -; X86-NEXT: negl %edx -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: setne %bl +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %ebx, 8(%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 ; ; X64-LABEL: lshr_ctlz_cmpeq_zero_v2i64: @@ -206,23 +208,25 @@ define <2 x i64> @lshr_ctlz_cmpeq_zero_v2i64(<2 x i64> %in) nounwind { define <2 x i64> @lshr_ctlz_cmpne_zero_v2i64(<2 x i64> %in) nounwind { ; X86-LABEL: lshr_ctlz_cmpne_zero_v2i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sete %cl ; X86-NEXT: negl %ecx -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sete %dl -; X86-NEXT: negl %edx -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sete %bl +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %ebx, 8(%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 ; ; X64-LABEL: 
lshr_ctlz_cmpne_zero_v2i64: diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll index e3c5a5023ac9ee..ec85e938dc0497 100644 --- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -1371,37 +1371,37 @@ define <8 x i32> @reassociate_umin_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pxor %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pxor %xmm2, %xmm8 +; SSE-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pxor %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm7, %xmm2 +; SSE-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: retq ; @@ -1822,85 +1822,84 @@ define <32 x i16> @reassociate_umax_v32i16(<32 x i16> %x0, <32 x i16> %x1, <32 x define <16 x i32> @reassociate_umax_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, <16 x i32> %x3) { ; SSE-LABEL: reassociate_umax_v16i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: paddd %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: paddd %xmm4, %xmm0 ; SSE-NEXT: paddd %xmm5, %xmm1 ; SSE-NEXT: paddd %xmm6, %xmm2 -; SSE-NEXT: paddd %xmm9, %xmm8 +; SSE-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 
-; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pxor %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pxor %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 ; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: pxor %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 -; SSE-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pxor %xmm3, %xmm6 +; SSE-NEXT: pcmpgtd %xmm6, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm15 ; SSE-NEXT: pandn %xmm2, %xmm5 ; SSE-NEXT: por %xmm15, %xmm5 ; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 -; SSE-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm3, %xmm6 +; SSE-NEXT: pcmpgtd %xmm6, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm14 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 -; SSE-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pxor %xmm3, %xmm6 +; SSE-NEXT: pcmpgtd %xmm6, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm13 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pxor %xmm3, %xmm0 -; SSE-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE-NEXT: pcmpgtd %xmm6, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm12 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pxor %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE-NEXT: pcmpgtd %xmm6, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pxor %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE-NEXT: pcmpgtd %xmm6, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm5, %xmm2 ; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pxor %xmm3, %xmm5 +; SSE-NEXT: pxor %xmm9, %xmm3 +; SSE-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_umax_v16i32: @@ -2528,37 +2527,37 @@ define <32 x i16> @reassociate_umin_v32i16(<32 x i16> %x0, <32 x i16> %x1, <32 x ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 ; SSE-NEXT: paddw %xmm4, %xmm11 ; SSE-NEXT: paddw %xmm5, %xmm10 ; SSE-NEXT: paddw %xmm6, %xmm9 ; SSE-NEXT: paddw %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: psubusw %xmm8, %xmm4 -; SSE-NEXT: psubw %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: psubusw %xmm9, %xmm4 -; SSE-NEXT: psubw %xmm4, %xmm13 ; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: psubusw %xmm10, %xmm4 +; SSE-NEXT: psubusw %xmm8, %xmm4 ; SSE-NEXT: psubw %xmm4, %xmm15 ; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: psubusw %xmm11, %xmm4 +; SSE-NEXT: psubusw %xmm9, %xmm4 ; SSE-NEXT: psubw %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: psubusw %xmm10, %xmm4 +; SSE-NEXT: psubw %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: psubusw %xmm11, %xmm4 +; SSE-NEXT: psubw %xmm4, %xmm12 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: psubusw %xmm14, %xmm4 +; SSE-NEXT: psubusw %xmm12, %xmm4 ; SSE-NEXT: psubw %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psubusw %xmm15, %xmm4 +; SSE-NEXT: psubusw %xmm13, %xmm4 ; SSE-NEXT: psubw %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psubusw %xmm13, %xmm4 +; SSE-NEXT: psubusw %xmm14, %xmm4 ; SSE-NEXT: psubw %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psubusw %xmm12, %xmm4 +; SSE-NEXT: psubusw %xmm15, %xmm4 ; SSE-NEXT: psubw %xmm4, %xmm3 ; SSE-NEXT: retq ; @@ -2602,69 +2601,69 @@ define <16 x i32> @reassociate_umin_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x ; SSE-NEXT: paddd %xmm5, %xmm1 ; SSE-NEXT: paddd %xmm6, %xmm2 ; SSE-NEXT: paddd %xmm7, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pxor %xmm5, %xmm4 -; SSE-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm15, %xmm5 ; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: pcmpgtd %xmm6, %xmm3 ; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm5, %xmm2 -; SSE-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE-NEXT: pand %xmm6, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm13, %xmm6 ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 -; SSE-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pxor %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm12 +; 
SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 -; SSE-NEXT: pcmpgtd %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm5, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 ; SSE-NEXT: pcmpgtd %xmm6, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm8, %xmm5 -; SSE-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: pxor %xmm8, %xmm4 +; SSE-NEXT: pcmpgtd %xmm4, %xmm3 ; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: por %xmm8, %xmm3 ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll index f84960485840d8..a64312af0fc4c1 100644 --- a/llvm/test/CodeGen/X86/machine-cp.ll +++ b/llvm/test/CodeGen/X86/machine-cp.ll @@ -99,20 +99,21 @@ while.end: ; preds = %while.body, %entry define <16 x float> @foo(<16 x float> %x) { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb -; CHECK-NEXT: xorps %xmm5, %xmm5 -; CHECK-NEXT: cvttps2dq %xmm3, %xmm8 -; CHECK-NEXT: movaps %xmm3, %xmm4 -; CHECK-NEXT: cmpltps %xmm5, %xmm4 -; CHECK-NEXT: movaps {{.*#+}} xmm7 = [13,14,15,16] -; CHECK-NEXT: movaps %xmm4, %xmm6 -; CHECK-NEXT: orps %xmm7, %xmm6 -; CHECK-NEXT: cvtdq2ps %xmm8, %xmm3 -; CHECK-NEXT: andps %xmm7, %xmm3 -; CHECK-NEXT: andps %xmm6, %xmm3 -; CHECK-NEXT: andnps %xmm4, %xmm6 +; CHECK-NEXT: xorps %xmm6, %xmm6 +; CHECK-NEXT: cvttps2dq %xmm3, %xmm4 +; CHECK-NEXT: movaps %xmm3, %xmm7 +; CHECK-NEXT: cmpltps %xmm6, %xmm7 +; CHECK-NEXT: movaps {{.*#+}} xmm5 = [13,14,15,16] +; CHECK-NEXT: movaps %xmm7, %xmm8 +; CHECK-NEXT: orps %xmm5, %xmm8 +; CHECK-NEXT: cvtdq2ps %xmm4, %xmm3 +; CHECK-NEXT: andps %xmm5, %xmm3 +; CHECK-NEXT: andps %xmm8, %xmm3 +; CHECK-NEXT: movaps %xmm8, %xmm5 +; CHECK-NEXT: andnps %xmm7, %xmm5 ; CHECK-NEXT: cvttps2dq %xmm2, %xmm4 ; CHECK-NEXT: movaps %xmm2, %xmm7 -; CHECK-NEXT: cmpltps %xmm5, %xmm7 +; CHECK-NEXT: cmpltps %xmm6, %xmm7 ; CHECK-NEXT: movaps {{.*#+}} xmm8 = [9,10,11,12] ; CHECK-NEXT: movaps %xmm7, %xmm9 ; CHECK-NEXT: orps %xmm8, %xmm9 @@ -121,7 +122,7 @@ define <16 x float> @foo(<16 x float> %x) { ; CHECK-NEXT: andps %xmm9, %xmm2 ; CHECK-NEXT: andnps %xmm7, %xmm9 ; CHECK-NEXT: cvttps2dq %xmm1, %xmm4 -; CHECK-NEXT: cmpltps %xmm5, %xmm1 +; CHECK-NEXT: cmpltps %xmm6, %xmm1 ; CHECK-NEXT: movaps {{.*#+}} xmm7 = [5,6,7,8] ; CHECK-NEXT: movaps %xmm1, %xmm8 ; CHECK-NEXT: orps %xmm7, %xmm8 @@ -130,12 +131,12 @@ define <16 x float> @foo(<16 x float> %x) { ; CHECK-NEXT: andps %xmm8, %xmm4 ; CHECK-NEXT: andnps %xmm1, %xmm8 ; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 -; CHECK-NEXT: cmpltps 
%xmm5, %xmm0 -; CHECK-NEXT: movaps {{.*#+}} xmm5 = [1,2,3,4] +; CHECK-NEXT: cmpltps %xmm6, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm6 = [1,2,3,4] ; CHECK-NEXT: movaps %xmm0, %xmm7 -; CHECK-NEXT: orps %xmm5, %xmm7 +; CHECK-NEXT: orps %xmm6, %xmm7 ; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1 -; CHECK-NEXT: andps %xmm5, %xmm1 +; CHECK-NEXT: andps %xmm6, %xmm1 ; CHECK-NEXT: andps %xmm7, %xmm1 ; CHECK-NEXT: andnps %xmm0, %xmm7 ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] @@ -145,8 +146,8 @@ define <16 x float> @foo(<16 x float> %x) { ; CHECK-NEXT: orps %xmm8, %xmm4 ; CHECK-NEXT: andps %xmm0, %xmm9 ; CHECK-NEXT: orps %xmm9, %xmm2 -; CHECK-NEXT: andps %xmm0, %xmm6 -; CHECK-NEXT: orps %xmm6, %xmm3 +; CHECK-NEXT: andps %xmm0, %xmm5 +; CHECK-NEXT: orps %xmm5, %xmm3 ; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: movaps %xmm4, %xmm1 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index 4d0045fec26497..f566cddef76c1e 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -9,27 +9,27 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read ; SSE2-LABEL: _Z10test_shortPsS_i_128: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB0_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: pmaddwd %xmm3, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $8, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB0_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; AVX-LABEL: _Z10test_shortPsS_i_128: @@ -943,34 +943,34 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB7_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7 +; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm5 ; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm6 -; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm8 -; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm7 +; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; SSE2-NEXT: psraw $8, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] ; SSE2-NEXT: psraw $8, %xmm10 ; SSE2-NEXT: pmaddwd %xmm9, %xmm10 ; SSE2-NEXT: paddd %xmm10, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm8 -; SSE2-NEXT: pmaddwd %xmm7, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: pmaddwd %xmm5, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE2-NEXT: psraw $8, %xmm8 -; SSE2-NEXT: pmaddwd %xmm7, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmaddwd %xmm5, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmaddwd %xmm6, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; SSE2-NEXT: psraw $8, %xmm6 +; SSE2-NEXT: pmaddwd %xmm5, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm3 ; SSE2-NEXT: addq $32, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB7_1 @@ -1382,8 +1382,8 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB10_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1404,15 +1404,15 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read ; SSE2-NEXT: pmullw %xmm5, %xmm7 ; SSE2-NEXT: movdqa %xmm7, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: paddd %xmm7, %xmm2 +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB10_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; 
SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -1637,9 +1637,9 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea ; AVX1-LABEL: test_unsigned_short_1024: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: xorl %ecx, %ecx ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .p2align 4, 0x90 @@ -1669,14 +1669,14 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea ; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm10 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm11, %xmm12, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm12 ; AVX1-NEXT: vpaddd %xmm4, %xmm12, %xmm4 -; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm3 @@ -1689,16 +1689,16 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %middle.block -; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm5 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2738,23 +2738,23 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) { ; SSE2-LABEL: sum_and_sum_of_squares: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB33_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSE2-NEXT: paddd %xmm5, %xmm3 ; SSE2-NEXT: pmaddwd %xmm4, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: addq $-8, %rax ; SSE2-NEXT: jne .LBB33_1 @@ -2765,12 +2765,12 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] ; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: shlq $32, %rcx ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll index 46b1fa5dd2757e..c3a789743c398d 100644 --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -2634,9 +2634,9 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32 ; ; AVX512-LABEL: expandload_v32f32_v32i32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vptestnmd %zmm3, %zmm3, %k2 -; AVX512-NEXT: vptestnmd %zmm2, %zmm2, %k1 -; AVX512-NEXT: kmovw %k1, %eax +; AVX512-NEXT: vptestnmd %zmm3, %zmm3, %k1 +; AVX512-NEXT: vptestnmd %zmm2, %zmm2, %k2 +; AVX512-NEXT: kmovw %k2, %eax ; AVX512-NEXT: movl %eax, %ecx ; AVX512-NEXT: shrl %ecx ; AVX512-NEXT: andl $21845, %ecx ## imm = 0x5555 @@ -2652,8 +2652,8 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32 ; AVX512-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F ; AVX512-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 ; AVX512-NEXT: shrl $24, %eax -; AVX512-NEXT: vexpandps (%rdi,%rax,4), %zmm1 {%k2} -; AVX512-NEXT: vexpandps (%rdi), %zmm0 {%k1} +; AVX512-NEXT: vexpandps (%rdi,%rax,4), %zmm1 {%k1} +; AVX512-NEXT: vexpandps (%rdi), %zmm0 {%k2} ; AVX512-NEXT: retq %mask = icmp eq <32 x i32> %trigger, zeroinitializer %res = call <32 x float> @llvm.masked.expandload.v32f32(ptr %base, <32 x i1> %mask, <32 x float> %src0) diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 1289eef7795dcc..0f8b8261285a00 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -3380,20 +3380,20 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; KNL_32-NEXT: .LBB42_2: # %else ; KNL_32-NEXT: testb $2, %bl -; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx +; KNL_32-NEXT: vpextrd $1, %xmm0, %edx ; KNL_32-NEXT: je .LBB42_4 ; KNL_32-NEXT: # %bb.3: # %cond.load1 -; KNL_32-NEXT: vpinsrd $2, (%ecx), 
%xmm1, %xmm2 -; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2 +; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm1, %xmm2 +; KNL_32-NEXT: vpinsrd $3, 4(%edx), %xmm2, %xmm2 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; KNL_32-NEXT: .LBB42_4: # %else2 ; KNL_32-NEXT: testb $4, %bl -; KNL_32-NEXT: vpextrd $2, %xmm0, %edx +; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx ; KNL_32-NEXT: je .LBB42_6 ; KNL_32-NEXT: # %bb.5: # %cond.load4 -; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2 +; KNL_32-NEXT: vpbroadcastd (%ecx), %ymm2 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2 +; KNL_32-NEXT: vpbroadcastd 4(%ecx), %ymm2 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; KNL_32-NEXT: .LBB42_6: # %else5 ; KNL_32-NEXT: testb $8, %bl @@ -3457,15 +3457,15 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; KNL_32-NEXT: testb $2, %bl ; KNL_32-NEXT: je .LBB42_12 ; KNL_32-NEXT: .LBB42_11: # %cond.load17 -; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm2 -; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2 +; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm2 +; KNL_32-NEXT: vpinsrd $3, 4(%edx), %xmm2, %xmm2 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; KNL_32-NEXT: testb $4, %bl ; KNL_32-NEXT: je .LBB42_14 ; KNL_32-NEXT: .LBB42_13: # %cond.load23 -; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2 +; KNL_32-NEXT: vpbroadcastd (%ecx), %ymm2 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] -; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2 +; KNL_32-NEXT: vpbroadcastd 4(%ecx), %ymm2 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: jne .LBB42_15 @@ -3475,15 +3475,15 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; KNL_32-NEXT: testb $2, %bl ; KNL_32-NEXT: je .LBB42_20 ; KNL_32-NEXT: .LBB42_19: # %cond.load42 -; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm2, %xmm3 -; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm3, %xmm3 +; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm2, %xmm3 +; KNL_32-NEXT: vpinsrd $3, 4(%edx), %xmm3, %xmm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; KNL_32-NEXT: testb $4, %bl ; KNL_32-NEXT: je .LBB42_22 ; KNL_32-NEXT: .LBB42_21: # %cond.load48 -; KNL_32-NEXT: vpbroadcastd (%edx), %ymm3 +; KNL_32-NEXT: vpbroadcastd (%ecx), %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm3 +; KNL_32-NEXT: vpbroadcastd 4(%ecx), %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: jne .LBB42_23 @@ -4191,16 +4191,16 @@ define <16 x double> @test_gather_setcc_split(ptr %base, <16 x i32> %ind, <16 x ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp -; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm3 -; KNL_32-NEXT: vmovapd 72(%ebp), %zmm1 +; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3 ; KNL_32-NEXT: movl 8(%ebp), %eax -; KNL_32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1 -; KNL_32-NEXT: vptestnmd %zmm3, %zmm3, %k2 +; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2} ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm1 {%k1} +; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1} ; KNL_32-NEXT: vmovapd %zmm2, %zmm0 +; 
KNL_32-NEXT: vmovapd %zmm3, %zmm1 ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp ; KNL_32-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 185eb50435e8db..cfc6f7fc3ec1c1 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -5891,9 +5891,9 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge ; SSE4-NEXT: .cfi_offset %r14, -32 ; SSE4-NEXT: .cfi_offset %r15, -24 ; SSE4-NEXT: .cfi_offset %rbp, -16 -; SSE4-NEXT: movdqa (%rdi), %xmm1 -; SSE4-NEXT: movdqa 32(%rdi), %xmm2 -; SSE4-NEXT: movdqa 64(%rdi), %xmm0 +; SSE4-NEXT: movdqa (%rdi), %xmm0 +; SSE4-NEXT: movdqa 32(%rdi), %xmm1 +; SSE4-NEXT: movdqa 64(%rdi), %xmm2 ; SSE4-NEXT: movl 92(%rsi), %eax ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE4-NEXT: movl 88(%rsi), %eax @@ -5916,14 +5916,14 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE4-NEXT: movl 52(%rsi), %eax ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: packssdw 48(%rdi), %xmm2 -; SSE4-NEXT: packssdw 16(%rdi), %xmm1 -; SSE4-NEXT: packsswb %xmm2, %xmm1 -; SSE4-NEXT: packssdw 80(%rdi), %xmm0 -; SSE4-NEXT: packsswb %xmm0, %xmm0 -; SSE4-NEXT: pmovmskb %xmm1, %eax +; SSE4-NEXT: packssdw 48(%rdi), %xmm1 +; SSE4-NEXT: packssdw 16(%rdi), %xmm0 +; SSE4-NEXT: packsswb %xmm1, %xmm0 +; SSE4-NEXT: packssdw 80(%rdi), %xmm2 +; SSE4-NEXT: packsswb %xmm2, %xmm2 +; SSE4-NEXT: pmovmskb %xmm0, %eax ; SSE4-NEXT: andl $21845, %eax ## imm = 0x5555 -; SSE4-NEXT: pmovmskb %xmm0, %edi +; SSE4-NEXT: pmovmskb %xmm2, %edi ; SSE4-NEXT: andl $85, %edi ; SSE4-NEXT: shll $16, %edi ; SSE4-NEXT: orl %eax, %edi diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 11803e32ad437e..2587f46c015199 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1812,22 +1812,22 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movmskpd %xmm1, %ecx +; SSE2-NEXT: xorl $3, %ecx +; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: jne .LBB8_1 ; SSE2-NEXT: # %bb.2: # %else -; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: testb $2, %cl ; SSE2-NEXT: jne .LBB8_3 ; SSE2-NEXT: .LBB8_4: # %else2 ; SSE2-NEXT: retq ; SSE2-NEXT: .LBB8_1: # %cond.store -; SSE2-NEXT: movb %cl, (%rdi) -; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: movb %al, (%rdi) +; SSE2-NEXT: testb $2, %cl ; SSE2-NEXT: je .LBB8_4 ; SSE2-NEXT: .LBB8_3: # %cond.store1 -; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v2i64_v2i8: diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index 38abaf8ff11c6c..2db2b9f52af81d 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -183,36 +183,36 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm6 ; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm10 = 
[2147483647,2147483647] -; SSE4-NEXT: movdqa %xmm10, %xmm0 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm10, %xmm8 +; SSE4-NEXT: movdqa %xmm9, %xmm8 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 -; SSE4-NEXT: movdqa %xmm10, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm10, %xmm9 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm9 -; SSE4-NEXT: movdqa %xmm10, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm10 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: movdqa %xmm10, %xmm3 +; SSE4-NEXT: movdqa %xmm9, %xmm3 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE4-NEXT: movdqa %xmm10, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 ; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; SSE4-NEXT: movapd %xmm10, %xmm0 +; SSE4-NEXT: movapd %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm6 ; SSE4-NEXT: movapd %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm2 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] -; SSE4-NEXT: movapd %xmm9, %xmm0 +; SSE4-NEXT: movapd %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE4-NEXT: movapd %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1 @@ -2736,22 +2736,22 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: xorl $3, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: movmskpd %xmm0, %ecx +; SSE2-NEXT: xorl $3, %ecx +; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: jne .LBB8_1 ; SSE2-NEXT: # %bb.2: # %else -; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: testb $2, %cl ; SSE2-NEXT: jne .LBB8_3 ; SSE2-NEXT: .LBB8_4: # %else2 ; SSE2-NEXT: retq ; SSE2-NEXT: .LBB8_1: # %cond.store -; SSE2-NEXT: movb %cl, (%rdi) -; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: movb %al, (%rdi) +; SSE2-NEXT: testb $2, %cl ; SSE2-NEXT: je .LBB8_4 ; SSE2-NEXT: .LBB8_3: # %cond.store1 -; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v2i64_v2i8: diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 715df982e1a069..dee433a17dc88d 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -2315,22 +2315,22 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: xorl $3, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: movmskpd %xmm0, %ecx +; SSE2-NEXT: xorl $3, %ecx +; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: jne .LBB8_1 ; SSE2-NEXT: # %bb.2: # 
%else -; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: testb $2, %cl ; SSE2-NEXT: jne .LBB8_3 ; SSE2-NEXT: .LBB8_4: # %else2 ; SSE2-NEXT: retq ; SSE2-NEXT: .LBB8_1: # %cond.store -; SSE2-NEXT: movb %cl, (%rdi) -; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: movb %al, (%rdi) +; SSE2-NEXT: testb $2, %cl ; SSE2-NEXT: je .LBB8_4 ; SSE2-NEXT: .LBB8_3: # %cond.store1 -; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: movb %ah, 1(%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v2i64_v2i8: @@ -3147,38 +3147,38 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-LABEL: truncstore_v16i32_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm11, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm9, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm10, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm12, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm13 ; SSE2-NEXT: pand %xmm13, %xmm1 -; SSE2-NEXT: pandn %xmm10, %xmm13 +; SSE2-NEXT: pandn %xmm9, %xmm13 ; SSE2-NEXT: por %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: pxor %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pxor %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: packuswb %xmm13, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm12 -; SSE2-NEXT: por %xmm3, %xmm12 -; SSE2-NEXT: pxor %xmm2, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm9 -; SSE2-NEXT: por %xmm2, %xmm9 -; SSE2-NEXT: packuswb %xmm12, %xmm9 -; SSE2-NEXT: packuswb %xmm9, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm12, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm11 +; SSE2-NEXT: por %xmm3, %xmm11 +; SSE2-NEXT: pxor %xmm2, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm12 +; SSE2-NEXT: por %xmm2, %xmm12 +; SSE2-NEXT: packuswb %xmm11, %xmm12 +; SSE2-NEXT: packuswb %xmm12, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 ; SSE2-NEXT: packssdw %xmm7, %xmm6 diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index c8c5afbd579df4..6843aa2e027107 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -556,12 +556,12 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; SSE-NEXT: movsd {{.*#+}} xmm11 = mem[0],zero ; SSE-NEXT: addsd %xmm13, %xmm1 ; SSE-NEXT: addsd %xmm7, %xmm1 -; SSE-NEXT: movapd %xmm2, %xmm12 -; SSE-NEXT: mulsd %xmm11, %xmm12 +; SSE-NEXT: movapd %xmm2, %xmm7 +; 
SSE-NEXT: mulsd %xmm11, %xmm7 ; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0,0] ; SSE-NEXT: mulpd %xmm0, %xmm11 -; SSE-NEXT: movapd %xmm5, %xmm7 -; SSE-NEXT: mulsd %xmm10, %xmm7 +; SSE-NEXT: movapd %xmm5, %xmm12 +; SSE-NEXT: mulsd %xmm10, %xmm12 ; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0,0] ; SSE-NEXT: mulpd %xmm3, %xmm10 ; SSE-NEXT: addpd %xmm11, %xmm10 @@ -570,17 +570,17 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0,0] ; SSE-NEXT: mulpd %xmm6, %xmm8 ; SSE-NEXT: addpd %xmm10, %xmm8 -; SSE-NEXT: addsd %xmm12, %xmm7 -; SSE-NEXT: addsd %xmm11, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm10 = mem[0],zero -; SSE-NEXT: mulsd %xmm10, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0,0] -; SSE-NEXT: mulpd %xmm0, %xmm10 +; SSE-NEXT: addsd %xmm7, %xmm12 +; SSE-NEXT: addsd %xmm11, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero +; SSE-NEXT: mulsd %xmm7, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0,0] +; SSE-NEXT: mulpd %xmm0, %xmm7 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: mulsd %xmm0, %xmm5 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] ; SSE-NEXT: mulpd %xmm3, %xmm0 -; SSE-NEXT: addpd %xmm10, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: mulsd %xmm3, %xmm9 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0] @@ -589,7 +589,7 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; SSE-NEXT: addsd %xmm2, %xmm5 ; SSE-NEXT: addsd %xmm9, %xmm5 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm7[0] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm12[0] ; SSE-NEXT: movsd %xmm5, 64(%rdi) ; SSE-NEXT: movapd %xmm3, 48(%rdi) ; SSE-NEXT: movapd %xmm4, (%rdi) @@ -713,117 +713,61 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test_mul3x3_f64: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm0, %xmm9, %xmm10 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0] -; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm3, %xmm1, %xmm4 -; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm7, %xmm6, %xmm10 -; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 -; AVX512F-NEXT: vmulsd %xmm2, %xmm9, %xmm9 -; AVX512F-NEXT: vmulsd %xmm3, %xmm5, %xmm3 -; AVX512F-NEXT: vaddsd %xmm3, %xmm9, %xmm3 -; AVX512F-NEXT: vmulsd %xmm7, %xmm8, %xmm7 -; AVX512F-NEXT: vaddsd %xmm7, %xmm3, %xmm3 -; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm4, %xmm0, %xmm7 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm10 -; AVX512F-NEXT: vaddpd %xmm7, %xmm10, %xmm7 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm6, %xmm10, %xmm11 -; AVX512F-NEXT: vaddpd %xmm7, %xmm11, %xmm7 -; AVX512F-NEXT: vmulsd %xmm4, %xmm2, %xmm4 -; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm9 -; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512F-NEXT: vmulsd %xmm10, %xmm8, %xmm9 -; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512F-NEXT: vinsertf128 $1, 
%xmm4, %ymm7, %ymm4 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm7, %xmm0, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm1 -; AVX512F-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm6, %xmm6 -; AVX512F-NEXT: vaddpd %xmm6, %xmm0, %xmm0 -; AVX512F-NEXT: vmulsd %xmm7, %xmm2, %xmm2 -; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm5 -; AVX512F-NEXT: vaddsd %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vmulsd %xmm1, %xmm8, %xmm1 -; AVX512F-NEXT: vaddsd %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512F-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovsd %xmm1, 64(%rdi) -; AVX512F-NEXT: vmovapd %zmm3, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: test_mul3x3_f64: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: movq %rdi, %rax -; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero -; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm1, %xmm0, %xmm9 -; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm4, %xmm3, %xmm10 -; AVX512VL-NEXT: vaddpd %xmm10, %xmm9, %xmm9 -; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm7, %xmm6, %xmm10 -; AVX512VL-NEXT: vaddpd %xmm10, %xmm9, %xmm9 -; AVX512VL-NEXT: vmulsd %xmm1, %xmm2, %xmm1 -; AVX512VL-NEXT: vmulsd %xmm4, %xmm5, %xmm4 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmulsd %xmm7, %xmm8, %xmm4 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm4, %xmm0, %xmm7 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm10 -; AVX512VL-NEXT: vaddpd %xmm7, %xmm10, %xmm7 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm6, %xmm10, %xmm11 -; AVX512VL-NEXT: vaddpd %xmm7, %xmm11, %xmm7 -; AVX512VL-NEXT: vmulsd %xmm4, %xmm2, %xmm4 -; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm9 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512VL-NEXT: vmulsd %xmm10, %xmm8, %xmm9 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm7, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm3 -; AVX512VL-NEXT: vaddpd %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm3, %xmm6, %xmm6 -; AVX512VL-NEXT: vaddpd %xmm6, %xmm0, %xmm0 -; AVX512VL-NEXT: vmulsd %xmm7, %xmm2, %xmm2 -; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm5 -; AVX512VL-NEXT: vaddsd %xmm5, %xmm2, %xmm2 -; AVX512VL-NEXT: vmulsd %xmm3, %xmm8, %xmm3 -; AVX512VL-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] -; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm3 -; AVX512VL-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX512VL-NEXT: vmovapd %zmm3, (%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_mul3x3_f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: 
movq %rdi, %rax +; AVX512-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm9 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512-NEXT: vmulpd %xmm4, %xmm3, %xmm10 +; AVX512-NEXT: vaddpd %xmm10, %xmm9, %xmm9 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; AVX512-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX512-NEXT: vmulpd %xmm7, %xmm6, %xmm10 +; AVX512-NEXT: vaddpd %xmm10, %xmm9, %xmm9 +; AVX512-NEXT: vmulsd %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vmulsd %xmm4, %xmm5, %xmm4 +; AVX512-NEXT: vaddsd %xmm4, %xmm1, %xmm1 +; AVX512-NEXT: vmulsd %xmm7, %xmm8, %xmm4 +; AVX512-NEXT: vaddsd %xmm4, %xmm1, %xmm1 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX512-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512-NEXT: vmulpd %xmm4, %xmm0, %xmm7 +; AVX512-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX512-NEXT: vmulpd %xmm3, %xmm9, %xmm10 +; AVX512-NEXT: vaddpd %xmm7, %xmm10, %xmm7 +; AVX512-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512-NEXT: vmulpd %xmm6, %xmm10, %xmm11 +; AVX512-NEXT: vaddpd %xmm7, %xmm11, %xmm7 +; AVX512-NEXT: vmulsd %xmm4, %xmm2, %xmm4 +; AVX512-NEXT: vmulsd %xmm5, %xmm9, %xmm9 +; AVX512-NEXT: vaddsd %xmm4, %xmm9, %xmm4 +; AVX512-NEXT: vmulsd %xmm10, %xmm8, %xmm9 +; AVX512-NEXT: vaddsd %xmm4, %xmm9, %xmm4 +; AVX512-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX512-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX512-NEXT: vmulpd %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX512-NEXT: vmulpd %xmm3, %xmm9, %xmm3 +; AVX512-NEXT: vaddpd %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX512-NEXT: vmulpd %xmm3, %xmm6, %xmm6 +; AVX512-NEXT: vaddpd %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmulsd %xmm7, %xmm2, %xmm2 +; AVX512-NEXT: vmulsd %xmm5, %xmm9, %xmm5 +; AVX512-NEXT: vaddsd %xmm5, %xmm2, %xmm2 +; AVX512-NEXT: vmulsd %xmm3, %xmm8, %xmm3 +; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] +; AVX512-NEXT: vpermi2pd %zmm0, %zmm1, %zmm3 +; AVX512-NEXT: vmovsd %xmm2, 64(%rdi) +; AVX512-NEXT: vmovapd %zmm3, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %block = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> %splat.splat = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> zeroinitializer @@ -929,24 +873,24 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm6, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm6[0,0] ; SSE-NEXT: mulps %xmm9, %xmm4 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm6[1,1] -; SSE-NEXT: mulps %xmm1, %xmm10 -; SSE-NEXT: addps %xmm4, %xmm10 ; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm6[2,2] -; SSE-NEXT: mulps %xmm2, %xmm5 -; SSE-NEXT: addps %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[1,1] +; SSE-NEXT: mulps %xmm1, %xmm5 +; SSE-NEXT: addps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm6[2,2] +; SSE-NEXT: mulps %xmm2, %xmm4 +; SSE-NEXT: addps %xmm5, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3,3,3] ; SSE-NEXT: mulps %xmm3, %xmm6 -; SSE-NEXT: addps %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[0,0],xmm7[0,0] -; SSE-NEXT: mulps %xmm9, %xmm4 +; SSE-NEXT: addps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm7[0,0] +; SSE-NEXT: mulps %xmm9, %xmm5 ; SSE-NEXT: movaps %xmm7, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm7[1,1] ; SSE-NEXT: mulps %xmm1, %xmm6 -; SSE-NEXT: addps %xmm4, %xmm6 +; SSE-NEXT: addps %xmm5, %xmm6 ; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[2,2] ; SSE-NEXT: mulps %xmm2, %xmm1 @@ -955,7 +899,7 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin ; SSE-NEXT: mulps %xmm7, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm2 ; SSE-NEXT: retq ; ; AVX1-LABEL: test_mul4x4_f32: @@ -1256,7 +1200,6 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; SSE-NEXT: mulpd %xmm10, %xmm15 ; SSE-NEXT: addpd %xmm14, %xmm15 ; SSE-NEXT: mulpd %xmm2, %xmm10 -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: addpd %xmm13, %xmm10 ; SSE-NEXT: movapd %xmm8, %xmm13 ; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0] @@ -1264,6 +1207,7 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; SSE-NEXT: mulpd %xmm13, %xmm14 ; SSE-NEXT: addpd %xmm10, %xmm14 ; SSE-NEXT: movapd %xmm6, %xmm4 +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: mulpd %xmm6, %xmm13 ; SSE-NEXT: addpd %xmm15, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] @@ -1290,7 +1234,6 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; SSE-NEXT: mulpd %xmm13, %xmm14 ; SSE-NEXT: addpd %xmm12, %xmm14 ; SSE-NEXT: mulpd %xmm4, %xmm13 -; SSE-NEXT: movapd %xmm4, %xmm2 ; SSE-NEXT: addpd %xmm15, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1,1] ; SSE-NEXT: movapd %xmm7, %xmm12 @@ -1308,15 +1251,15 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; SSE-NEXT: addpd %xmm13, %xmm15 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: mulpd %xmm0, %xmm14 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: mulpd %xmm0, %xmm11 +; SSE-NEXT: mulpd %xmm2, %xmm11 +; SSE-NEXT: movapd %xmm2, %xmm6 ; SSE-NEXT: addpd %xmm14, %xmm11 ; SSE-NEXT: movapd %xmm13, %xmm14 ; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm13[0] ; SSE-NEXT: movapd %xmm5, %xmm4 ; SSE-NEXT: mulpd %xmm14, %xmm4 ; SSE-NEXT: addpd %xmm11, %xmm4 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: mulpd %xmm2, %xmm14 ; SSE-NEXT: addpd %xmm15, %xmm14 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1,1] @@ -1330,11 +1273,11 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; SSE-NEXT: movapd %xmm14, %xmm4 ; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm14[0] ; SSE-NEXT: mulpd %xmm4, %xmm1 -; SSE-NEXT: mulpd %xmm6, %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1,1] ; SSE-NEXT: mulpd %xmm14, %xmm3 ; SSE-NEXT: addpd %xmm1, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm14 +; SSE-NEXT: mulpd %xmm6, %xmm14 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addpd %xmm4, %xmm14 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -1637,115 +1580,117 @@ entry: define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwind { ; SSE-LABEL: test_mul8x8_f32: ; SSE: # %bb.0: # %entry -; SSE-NEXT: subq $120, %rsp -; SSE-NEXT: movaps %xmm5, 
%xmm11 +; SSE-NEXT: subq $88, %rsp +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm12 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm14[0,0] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: mulps %xmm15, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: mulps %xmm15, %xmm11 ; SSE-NEXT: mulps %xmm0, %xmm15 -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: mulps %xmm0, %xmm10 -; SSE-NEXT: addps %xmm5, %xmm10 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: addps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm14[1,1] +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: mulps %xmm3, %xmm13 +; SSE-NEXT: addps %xmm11, %xmm13 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: addps %xmm15, %xmm3 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[2,2] ; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm4, %xmm15 ; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulps %xmm11, %xmm1 -; SSE-NEXT: addps %xmm10, %xmm1 +; SSE-NEXT: addps %xmm3, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: addps %xmm13, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3,3,3] ; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm13 ; SSE-NEXT: mulps %xmm14, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: mulps %xmm6, %xmm14 +; SSE-NEXT: movaps %xmm6, %xmm15 ; SSE-NEXT: addps %xmm2, %xmm14 -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm5[0,0] -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm11[0,0] +; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm14, %xmm2 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulps %xmm9, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm10 ; SSE-NEXT: addps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] -; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm11[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,2] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[2,2] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: addps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3,3,3] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: addps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulps %xmm11, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulps %xmm9, %xmm11 +; SSE-NEXT: addps %xmm2, %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: mulps %xmm13, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: mulps %xmm7, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: mulps %xmm11, %xmm1 +; SSE-NEXT: mulps %xmm5, %xmm1 ; SSE-NEXT: addps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm14 +; SSE-NEXT: mulps %xmm15, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulps %xmm10, %xmm1 ; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 @@ -1756,58 +1701,57 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm1 ; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulps %xmm4, %xmm0 ; SSE-NEXT: addps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulps %xmm9, %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm10 -; SSE-NEXT: mulps %xmm13, %xmm3 +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: mulps %xmm7, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: movaps %xmm11, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm3, %xmm2 -; SSE-NEXT: mulps %xmm8, %xmm1 +; SSE-NEXT: mulps %xmm5, %xmm1 ; SSE-NEXT: addps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: mulps %xmm14, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] -; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulps %xmm14, %xmm1 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: mulps %xmm0, %xmm3 -; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm10 +; SSE-NEXT: addps %xmm1, %xmm10 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm4, %xmm1 @@ -1815,323 +1759,322 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: mulps %xmm11, %xmm1 -; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm10, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulps %xmm4, %xmm0 ; SSE-NEXT: addps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulps %xmm0, %xmm4 +; SSE-NEXT: mulps %xmm9, %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm8 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: movaps %xmm10, %xmm15 
+; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: mulps %xmm3, %xmm6 +; SSE-NEXT: addps %xmm8, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: mulps %xmm10, %xmm1 -; SSE-NEXT: mulps %xmm13, %xmm3 +; SSE-NEXT: mulps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm11 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm4 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm3, %xmm2 -; SSE-NEXT: mulps %xmm8, %xmm1 -; SSE-NEXT: addps %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: mulps %xmm0, %xmm3 -; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: mulps %xmm6, %xmm0 -; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm10[0,0] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm14, %xmm1 -; SSE-NEXT: addps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulps %xmm0, %xmm3 -; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[2,2] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm11, %xmm1 -; SSE-NEXT: addps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3,3,3] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: mulps %xmm10, %xmm0 -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: addps %xmm2, %xmm10 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE-NEXT: movaps %xmm12, %xmm7 -; SSE-NEXT: mulps %xmm3, %xmm7 -; SSE-NEXT: addps %xmm2, %xmm7 -; SSE-NEXT: mulps %xmm15, %xmm1 -; SSE-NEXT: mulps %xmm13, %xmm3 -; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: mulps %xmm8, %xmm1 -; SSE-NEXT: addps %xmm7, %xmm1 +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: addps %xmm6, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: mulps %xmm0, %xmm7 -; SSE-NEXT: addps %xmm1, %xmm7 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: mulps %xmm0, %xmm6 +; SSE-NEXT: addps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm14, %xmm8 +; SSE-NEXT: mulps %xmm14, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 -; 
SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm9[0,0] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm1 +; SSE-NEXT: addps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulps %xmm0, %xmm7 -; SSE-NEXT: addps %xmm1, %xmm7 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: mulps %xmm0, %xmm6 +; SSE-NEXT: addps %xmm1, %xmm6 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm9[2,2] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addps %xmm7, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: addps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: mulps %xmm9, %xmm0 ; SSE-NEXT: addps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: addps %xmm2, %xmm4 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm9 +; SSE-NEXT: addps %xmm2, %xmm9 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm14, %xmm6 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1] -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: mulps %xmm14, %xmm15 -; SSE-NEXT: addps %xmm2, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: mulps %xmm8, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: mulps %xmm7, %xmm14 -; SSE-NEXT: addps %xmm1, %xmm14 +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: mulps %xmm6, %xmm14 +; SSE-NEXT: addps %xmm2, %xmm14 +; SSE-NEXT: mulps %xmm10, %xmm1 +; SSE-NEXT: mulps %xmm11, %xmm6 +; SSE-NEXT: addps %xmm1, %xmm6 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm14, %xmm2 -; SSE-NEXT: mulps %xmm9, %xmm1 -; SSE-NEXT: movaps %xmm9, %xmm11 -; 
SSE-NEXT: addps %xmm15, %xmm1 +; SSE-NEXT: addps %xmm6, %xmm2 +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: addps %xmm14, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: mulps %xmm0, %xmm14 -; SSE-NEXT: addps %xmm1, %xmm14 -; SSE-NEXT: mulps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm12 +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm6 +; SSE-NEXT: addps %xmm1, %xmm6 +; SSE-NEXT: mulps %xmm8, %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm13 ; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulps %xmm1, %xmm15 -; SSE-NEXT: addps %xmm0, %xmm15 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: addps %xmm14, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[0,0] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulps %xmm0, %xmm14 -; SSE-NEXT: addps %xmm1, %xmm14 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addps %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[2,2] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulps %xmm1, %xmm15 -; SSE-NEXT: addps %xmm0, %xmm15 +; SSE-NEXT: mulps %xmm1, %xmm14 +; SSE-NEXT: addps %xmm0, %xmm14 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addps %xmm14, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE-NEXT: addps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: mulps %xmm0, %xmm6 +; SSE-NEXT: addps %xmm1, %xmm6 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: addps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[2,2] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulps %xmm3, %xmm14 -; SSE-NEXT: addps %xmm1, %xmm14 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: addps %xmm15, %xmm3 +; SSE-NEXT: mulps %xmm1, %xmm14 +; SSE-NEXT: addps %xmm0, %xmm14 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: movaps %xmm15, %xmm6 +; SSE-NEXT: mulps %xmm2, %xmm6 +; SSE-NEXT: addps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addps %xmm14, %xmm2 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: mulps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: mulps %xmm1, %xmm14 ; SSE-NEXT: movaps %xmm0, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[1,1] -; SSE-NEXT: mulps %xmm15, %xmm13 -; SSE-NEXT: addps %xmm6, %xmm13 -; SSE-NEXT: mulps %xmm8, %xmm1 -; SSE-NEXT: mulps %xmm7, %xmm15 +; SSE-NEXT: movaps %xmm12, %xmm8 +; SSE-NEXT: mulps %xmm15, %xmm8 +; SSE-NEXT: addps %xmm14, %xmm8 +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: mulps %xmm10, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; 
SSE-NEXT: mulps %xmm3, %xmm15 ; SSE-NEXT: addps %xmm1, %xmm15 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm15, %xmm2 -; SSE-NEXT: mulps %xmm9, %xmm1 -; SSE-NEXT: addps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: mulps %xmm1, %xmm14 +; SSE-NEXT: addps %xmm15, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: mulps %xmm4, %xmm1 +; SSE-NEXT: addps %xmm8, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: mulps %xmm0, %xmm9 -; SSE-NEXT: addps %xmm1, %xmm9 -; SSE-NEXT: mulps %xmm12, %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: mulps %xmm0, %xmm8 +; SSE-NEXT: addps %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: mulps %xmm13, %xmm0 +; SSE-NEXT: addps %xmm14, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0],xmm1[0,0] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulps %xmm2, %xmm15 +; SSE-NEXT: mulps %xmm14, %xmm15 ; SSE-NEXT: addps %xmm0, %xmm15 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulps %xmm0, %xmm2 -; SSE-NEXT: addps %xmm9, %xmm2 +; SSE-NEXT: mulps %xmm0, %xmm14 +; SSE-NEXT: addps %xmm8, %xmm14 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulps %xmm0, %xmm9 -; SSE-NEXT: addps %xmm2, %xmm9 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulps %xmm0, %xmm8 +; SSE-NEXT: addps %xmm14, %xmm8 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addps %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2] +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,2],xmm1[2,2] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: mulps %xmm2, %xmm13 +; SSE-NEXT: mulps %xmm14, %xmm13 ; SSE-NEXT: addps %xmm0, %xmm13 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: addps %xmm9, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm14 +; SSE-NEXT: addps %xmm8, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 ; SSE-NEXT: mulps %xmm1, %xmm15 -; SSE-NEXT: addps %xmm2, %xmm15 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addps %xmm13, %xmm1 +; SSE-NEXT: addps %xmm14, %xmm15 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: mulps %xmm2, %xmm13 -; SSE-NEXT: mulps %xmm8, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: mulps %xmm9, %xmm8 +; SSE-NEXT: mulps %xmm0, %xmm1 +; SSE-NEXT: addps %xmm13, %xmm1 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm8[0,0] +; SSE-NEXT: mulps %xmm0, %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm8[1,1] +; SSE-NEXT: movaps 
%xmm12, %xmm10 +; SSE-NEXT: mulps %xmm14, %xmm10 +; SSE-NEXT: addps %xmm11, %xmm10 +; SSE-NEXT: mulps %xmm7, %xmm0 +; SSE-NEXT: mulps %xmm3, %xmm14 +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: addps %xmm0, %xmm14 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm8[2,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm13 +; SSE-NEXT: mulps %xmm0, %xmm13 +; SSE-NEXT: addps %xmm14, %xmm13 +; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: addps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3,3,3] +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: mulps %xmm8, %xmm10 +; SSE-NEXT: addps %xmm0, %xmm10 +; SSE-NEXT: mulps %xmm6, %xmm8 ; SSE-NEXT: addps %xmm13, %xmm8 -; SSE-NEXT: mulps %xmm7, %xmm9 -; SSE-NEXT: addps %xmm2, %xmm9 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm0[2,2] -; SSE-NEXT: mulps %xmm2, %xmm6 -; SSE-NEXT: addps %xmm9, %xmm6 -; SSE-NEXT: mulps %xmm11, %xmm2 -; SSE-NEXT: addps %xmm8, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm0[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulps %xmm13, %xmm14 +; SSE-NEXT: addps %xmm8, %xmm14 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulps %xmm7, %xmm13 +; SSE-NEXT: addps %xmm10, %xmm13 +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulps %xmm8, %xmm10 +; SSE-NEXT: addps %xmm13, %xmm10 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: addps %xmm14, %xmm8 +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,2],xmm0[2,2] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulps %xmm13, %xmm7 +; SSE-NEXT: addps %xmm8, %xmm7 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulps %xmm8, %xmm13 +; SSE-NEXT: addps %xmm10, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: mulps %xmm0, %xmm9 -; SSE-NEXT: addps %xmm2, %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: addps %xmm6, %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm9[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulps %xmm0, %xmm14 +; SSE-NEXT: addps %xmm13, %xmm14 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulps %xmm8, %xmm0 +; SSE-NEXT: addps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: mulps %xmm8, %xmm13 +; SSE-NEXT: mulps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm7[1,1] +; SSE-NEXT: mulps %xmm10, %xmm12 +; SSE-NEXT: addps %xmm13, %xmm12 +; SSE-NEXT: mulps %xmm11, %xmm10 +; SSE-NEXT: addps %xmm8, %xmm10 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm7[2,2] +; SSE-NEXT: mulps %xmm8, %xmm3 +; SSE-NEXT: addps %xmm10, %xmm3 +; SSE-NEXT: mulps %xmm4, %xmm8 +; SSE-NEXT: addps %xmm12, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3,3,3] +; SSE-NEXT: mulps %xmm7, %xmm5 +; SSE-NEXT: addps %xmm8, %xmm5 +; SSE-NEXT: mulps %xmm6, %xmm7 +; SSE-NEXT: addps %xmm3, %xmm7 +; 
SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,0],xmm8[0,0] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: mulps %xmm2, %xmm13 -; SSE-NEXT: addps %xmm0, %xmm13 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: addps %xmm12, %xmm2 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: mulps %xmm10, %xmm13 +; SSE-NEXT: addps %xmm7, %xmm13 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: addps %xmm5, %xmm10 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[1,1] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: mulps %xmm0, %xmm12 -; SSE-NEXT: addps %xmm2, %xmm12 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addps %xmm13, %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm9[2,2] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulps %xmm2, %xmm5 -; SSE-NEXT: addps %xmm0, %xmm5 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: addps %xmm12, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3,3,3] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulps %xmm9, %xmm0 -; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: addps %xmm5, %xmm9 -; SSE-NEXT: movaps %xmm0, 240(%rdi) -; SSE-NEXT: movaps %xmm9, 224(%rdi) -; SSE-NEXT: movaps %xmm15, 208(%rdi) -; SSE-NEXT: movaps %xmm1, 192(%rdi) -; SSE-NEXT: movaps %xmm14, 176(%rdi) -; SSE-NEXT: movaps %xmm3, 160(%rdi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rdi) -; SSE-NEXT: movaps %xmm4, 128(%rdi) +; SSE-NEXT: mulps %xmm7, %xmm12 +; SSE-NEXT: addps %xmm10, %xmm12 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: addps %xmm13, %xmm7 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,2],xmm8[2,2] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulps %xmm10, %xmm3 +; SSE-NEXT: addps %xmm7, %xmm3 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: addps %xmm12, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulps %xmm8, %xmm4 +; SSE-NEXT: addps %xmm10, %xmm4 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: addps %xmm3, %xmm8 +; SSE-NEXT: movaps %xmm4, 240(%rdi) +; SSE-NEXT: movaps %xmm8, 224(%rdi) +; SSE-NEXT: movaps %xmm14, 208(%rdi) +; SSE-NEXT: movaps %xmm0, 192(%rdi) +; SSE-NEXT: movaps %xmm15, 176(%rdi) +; SSE-NEXT: movaps %xmm1, 160(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rdi) +; SSE-NEXT: movaps %xmm2, 128(%rdi) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdi) -; SSE-NEXT: movaps %xmm10, 96(%rdi) +; SSE-NEXT: movaps %xmm9, 96(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2144,7 +2087,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm0, 16(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdi) -; SSE-NEXT: addq $120, %rsp +; SSE-NEXT: addq $88, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: test_mul8x8_f32: @@ -2557,6 +2500,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; ; AVX512F-LABEL: test_mul8x8_f32: ; AVX512F: # %bb.0: # %entry +; 
AVX512F-NEXT: pushq %rax ; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm11 ; AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm10 ; AVX512F-NEXT: vextractf64x4 $1, %zmm2, %ymm9 @@ -2624,7 +2568,6 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] ; AVX512F-NEXT: vmulps %ymm13, %ymm8, %ymm13 ; AVX512F-NEXT: vaddps %ymm4, %ymm13, %ymm4 -; AVX512F-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vbroadcastss %xmm5, %ymm13 ; AVX512F-NEXT: vmulps %ymm0, %ymm13, %ymm13 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3] @@ -2654,7 +2597,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vshufps {{.*#+}} ymm14 = ymm5[3,3,3,3,7,7,7,7] ; AVX512F-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] ; AVX512F-NEXT: vmulps %ymm14, %ymm8, %ymm14 -; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm12 +; AVX512F-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vextractf64x4 $1, %zmm5, %ymm14 ; AVX512F-NEXT: vextractf32x4 $2, %zmm5, %xmm15 ; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 @@ -2721,66 +2665,67 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12 ; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 ; AVX512F-NEXT: vextractf64x4 $1, %zmm6, %ymm15 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm15[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm4, %ymm4 -; AVX512F-NEXT: vmulps %ymm4, %ymm11, %ymm4 -; AVX512F-NEXT: vaddps %ymm4, %ymm12, %ymm4 -; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm15[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX512F-NEXT: vmulps %ymm1, %ymm12, %ymm12 -; AVX512F-NEXT: vaddps %ymm4, %ymm12, %ymm4 -; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm15[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX512F-NEXT: vmulps %ymm12, %ymm10, %ymm12 -; AVX512F-NEXT: vaddps %ymm4, %ymm12, %ymm4 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm13 = xmm15[1,1,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512F-NEXT: vmulps %ymm13, %ymm11, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512F-NEXT: vmulps %ymm1, %ymm13, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm15[3,3,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512F-NEXT: vmulps %ymm13, %ymm10, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 ; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm6 ; AVX512F-NEXT: vbroadcastss %xmm6, %ymm6 ; AVX512F-NEXT: vmulps %ymm6, %ymm2, %ymm6 -; AVX512F-NEXT: vaddps %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vmovshdup {{.*#+}} ymm6 = ymm15[1,1,3,3,5,5,7,7] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512F-NEXT: vmulps %ymm6, %ymm9, %ymm6 -; AVX512F-NEXT: vaddps %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vshufps {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512F-NEXT: vmulps %ymm6, %ymm3, %ymm6 -; AVX512F-NEXT: vaddps %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vshufps {{.*#+}} ymm6 = ymm15[3,3,3,3,7,7,7,7] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512F-NEXT: vmulps %ymm6, %ymm8, %ymm6 -; AVX512F-NEXT: vaddps %ymm6, %ymm4, %ymm6 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4, %zmm4 # 32-byte Folded Reload +; AVX512F-NEXT: vaddps %ymm6, %ymm12, %ymm6 +; AVX512F-NEXT: vmovshdup {{.*#+}} ymm12 = ymm15[1,1,3,3,5,5,7,7] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512F-NEXT: vmulps %ymm12, %ymm9, %ymm12 +; AVX512F-NEXT: vaddps %ymm6, %ymm12, %ymm6 +; AVX512F-NEXT: vshufps {{.*#+}} ymm12 = ymm15[2,2,2,2,6,6,6,6] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512F-NEXT: vmulps %ymm3, %ymm12, %ymm12 +; AVX512F-NEXT: vaddps %ymm6, %ymm12, %ymm6 +; AVX512F-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,3,3,3,7,7,7,7] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512F-NEXT: vmulps %ymm12, %ymm8, %ymm12 +; AVX512F-NEXT: vaddps %ymm6, %ymm12, %ymm6 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm12, %zmm4 ; AVX512F-NEXT: vbroadcastss %xmm7, %ymm12 ; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm7[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 -; AVX512F-NEXT: vmulps %ymm15, %ymm11, %ymm15 -; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 -; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 -; AVX512F-NEXT: vmulps %ymm1, %ymm15, %ymm15 -; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 -; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm7[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 -; AVX512F-NEXT: vmulps %ymm15, %ymm10, %ymm15 -; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 -; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm15 -; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 -; AVX512F-NEXT: vmulps %ymm2, %ymm15, %ymm15 -; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 -; AVX512F-NEXT: vmovshdup {{.*#+}} ymm15 = ymm7[1,1,3,3,5,5,7,7] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512F-NEXT: vmulps %ymm15, %ymm9, %ymm15 -; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 -; AVX512F-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512F-NEXT: vmulps %ymm3, %ymm15, %ymm15 -; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 -; AVX512F-NEXT: vshufps {{.*#+}} ymm15 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512F-NEXT: vmulps %ymm15, %ymm8, %ymm15 -; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm13 = xmm7[1,1,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512F-NEXT: vmulps %ymm13, %ymm11, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm7[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512F-NEXT: vmulps %ymm1, %ymm13, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm7[3,3,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512F-NEXT: vmulps %ymm13, %ymm10, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm13 +; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 +; AVX512F-NEXT: vmulps %ymm2, %ymm13, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX512F-NEXT: vmovshdup {{.*#+}} ymm13 = ymm7[1,1,3,3,5,5,7,7] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512F-NEXT: vmulps %ymm13, %ymm9, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX512F-NEXT: vshufps {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512F-NEXT: vmulps %ymm3, %ymm13, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, 
%ymm12 +; AVX512F-NEXT: vshufps {{.*#+}} ymm13 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512F-NEXT: vmulps %ymm13, %ymm8, %ymm13 +; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm13, %zmm5 ; AVX512F-NEXT: vextractf64x4 $1, %zmm7, %ymm13 ; AVX512F-NEXT: vextractf32x4 $2, %zmm7, %xmm15 @@ -2818,6 +2763,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm12, %zmm3 ; AVX512F-NEXT: vmovaps %zmm4, %zmm0 ; AVX512F-NEXT: vmovaps %zmm5, %zmm1 +; AVX512F-NEXT: popq %rax ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_mul8x8_f32: @@ -3286,7 +3232,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-LABEL: test_mul8x8_f64: ; SSE: # %bb.0: # %entry ; SSE-NEXT: subq $328, %rsp # imm = 0x148 -; SSE-NEXT: movapd %xmm7, %xmm15 +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3294,912 +3240,904 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: movapd %xmm13, %xmm12 -; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm13[0] -; SSE-NEXT: movapd %xmm3, %xmm10 -; SSE-NEXT: mulpd %xmm12, %xmm10 -; SSE-NEXT: movapd %xmm2, %xmm8 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: movapd %xmm15, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm15[0] +; SSE-NEXT: movapd %xmm3, %xmm13 +; SSE-NEXT: mulpd %xmm12, %xmm13 +; SSE-NEXT: movapd %xmm2, %xmm11 +; SSE-NEXT: mulpd %xmm12, %xmm11 +; SSE-NEXT: movapd %xmm1, %xmm8 ; SSE-NEXT: mulpd %xmm12, %xmm8 -; SSE-NEXT: movapd %xmm1, %xmm9 -; SSE-NEXT: mulpd %xmm12, %xmm9 ; SSE-NEXT: mulpd %xmm0, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1,1] -; SSE-NEXT: movapd %xmm7, %xmm2 -; SSE-NEXT: mulpd %xmm13, %xmm2 -; SSE-NEXT: addpd %xmm10, %xmm2 -; SSE-NEXT: movapd %xmm6, %xmm7 -; SSE-NEXT: movapd %xmm6, %xmm10 -; SSE-NEXT: mulpd %xmm13, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1,1] +; SSE-NEXT: movapd %xmm7, %xmm10 +; SSE-NEXT: mulpd %xmm15, %xmm10 +; SSE-NEXT: addpd %xmm13, %xmm10 +; SSE-NEXT: movapd %xmm6, %xmm3 +; SSE-NEXT: movapd %xmm6, %xmm13 +; SSE-NEXT: mulpd %xmm15, %xmm3 +; SSE-NEXT: addpd %xmm11, %xmm3 +; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: mulpd %xmm15, %xmm7 ; SSE-NEXT: addpd %xmm8, %xmm7 -; SSE-NEXT: movapd %xmm5, %xmm8 -; SSE-NEXT: mulpd %xmm13, %xmm8 -; SSE-NEXT: addpd %xmm9, %xmm8 -; SSE-NEXT: mulpd %xmm4, %xmm13 -; SSE-NEXT: addpd %xmm12, %xmm13 -; SSE-NEXT: movapd %xmm11, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0] -; SSE-NEXT: movapd %xmm14, %xmm1 -; SSE-NEXT: mulpd %xmm6, %xmm1 -; SSE-NEXT: addpd %xmm13, %xmm1 +; SSE-NEXT: mulpd %xmm4, %xmm15 +; SSE-NEXT: addpd %xmm12, %xmm15 +; SSE-NEXT: movapd %xmm14, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm14[0] +; SSE-NEXT: movapd %xmm9, %xmm5 +; SSE-NEXT: mulpd %xmm2, %xmm5 +; 
SSE-NEXT: addpd %xmm15, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm2, %xmm1 +; SSE-NEXT: addpd %xmm7, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm2, %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm4 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm10, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm3 -; SSE-NEXT: addpd %xmm8, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm6, %xmm5 -; SSE-NEXT: addpd %xmm7, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] +; SSE-NEXT: mulpd %xmm14, %xmm3 +; SSE-NEXT: addpd %xmm2, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm11, %xmm2 -; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: mulpd %xmm14, %xmm2 +; SSE-NEXT: addpd %xmm4, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm11, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm11, %xmm5 -; SSE-NEXT: addpd %xmm3, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: addpd %xmm1, %xmm11 +; SSE-NEXT: mulpd %xmm14, %xmm4 +; SSE-NEXT: addpd %xmm1, %xmm4 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: addpd %xmm5, %xmm14 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm3 -; SSE-NEXT: addpd %xmm11, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm6, %xmm7 -; SSE-NEXT: addpd %xmm5, %xmm7 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm6, %xmm5 -; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm1, %xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm5, %xmm6 +; SSE-NEXT: addpd %xmm14, %xmm6 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: mulpd %xmm5, %xmm0 +; SSE-NEXT: addpd %xmm4, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm5, %xmm4 +; SSE-NEXT: addpd %xmm2, %xmm4 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: addpd %xmm3, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm5, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm3 +; SSE-NEXT: addpd %xmm4, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm1, %xmm5 -; SSE-NEXT: addpd %xmm7, %xmm5 +; SSE-NEXT: addpd %xmm0, %xmm4 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm6, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: movapd %xmm6, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm3, %xmm2 -; SSE-NEXT: addpd %xmm1, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd %xmm6, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = 
xmm0[0],xmm6[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm3, %xmm5 -; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm4, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm4 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm2, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm6, %xmm2 +; SSE-NEXT: addpd %xmm0, %xmm2 +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm6, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm6, %xmm0 -; SSE-NEXT: addpd %xmm5, %xmm0 +; SSE-NEXT: addpd %xmm4, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm6, %xmm0 ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: addpd %xmm5, %xmm6 ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movapd %xmm7, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movapd %xmm11, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm15, %xmm8 -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm2 +; SSE-NEXT: addpd %xmm2, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movapd %xmm8, %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm2 -; SSE-NEXT: addpd %xmm3, %xmm2 +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm13, %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm2, %xmm4 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movapd %xmm9, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm10, %xmm15 -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm10, %xmm4 -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movapd %xmm13, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movapd %xmm10, %xmm5 -; SSE-NEXT: mulpd %xmm1, %xmm5 -; SSE-NEXT: addpd %xmm3, %xmm5 +; SSE-NEXT: movapd %xmm9, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: mulpd %xmm12, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: mulpd %xmm14, %xmm1 -; SSE-NEXT: 
addpd %xmm0, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm3 -; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: movapd %xmm12, %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: addpd %xmm2, %xmm5 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: mulpd %xmm10, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: mulpd %xmm15, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm6, %xmm1 -; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movapd %xmm14, %xmm6 +; SSE-NEXT: mulpd %xmm2, %xmm6 +; SSE-NEXT: addpd %xmm0, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm2, %xmm0 +; SSE-NEXT: addpd %xmm5, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: mulpd %xmm2, %xmm5 ; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm3 +; SSE-NEXT: addpd %xmm2, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm0, %xmm2 -; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm5, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm0, %xmm5 -; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm0, %xmm4 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm6, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm5, %xmm6 +; SSE-NEXT: addpd %xmm1, %xmm6 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm5, %xmm1 +; SSE-NEXT: addpd %xmm4, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm5, %xmm4 +; SSE-NEXT: addpd %xmm2, %xmm4 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: addpd %xmm3, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm0, %xmm2 +; SSE-NEXT: addpd %xmm5, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: addpd %xmm4, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm1, %xmm4 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm0, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: addpd %xmm4, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm4 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), 
%xmm1 +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm6, %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm3 -; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm6, %xmm7 -; SSE-NEXT: addpd %xmm5, %xmm7 +; SSE-NEXT: mulpd %xmm6, %xmm1 +; SSE-NEXT: addpd %xmm4, %xmm1 +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm6, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm5, %xmm6 +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd %xmm7, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd %xmm11, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: addpd %xmm2, %xmm3 +; SSE-NEXT: movapd %xmm8, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: movapd %xmm13, %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm2, %xmm4 +; SSE-NEXT: movapd %xmm9, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: movapd %xmm12, %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: addpd %xmm2, %xmm5 +; SSE-NEXT: mulpd %xmm10, %xmm1 +; SSE-NEXT: mulpd %xmm15, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movapd %xmm14, %xmm6 +; SSE-NEXT: mulpd %xmm2, %xmm6 +; SSE-NEXT: addpd %xmm0, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm2, %xmm0 +; SSE-NEXT: addpd %xmm5, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: mulpd %xmm2, %xmm5 ; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm4, %xmm2 +; SSE-NEXT: addpd %xmm3, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm3 +; SSE-NEXT: addpd %xmm2, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm5, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm0, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm14, %xmm1 +; SSE-NEXT: addpd %xmm6, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm5, %xmm6 +; SSE-NEXT: addpd %xmm1, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm5, %xmm1 +; SSE-NEXT: addpd %xmm4, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm5, %xmm4 +; SSE-NEXT: addpd %xmm2, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm2, %xmm5 +; SSE-NEXT: addpd %xmm3, %xmm5 +; SSE-NEXT: unpckhpd 
{{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm0, %xmm2 +; SSE-NEXT: addpd %xmm5, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: addpd %xmm4, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: addpd %xmm6, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm1, %xmm6 +; SSE-NEXT: addpd %xmm0, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm5, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm1, %xmm5 -; SSE-NEXT: addpd %xmm7, %xmm5 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: movapd %xmm6, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: addpd %xmm3, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: mulpd %xmm4, %xmm2 ; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: mulpd %xmm4, %xmm1 ; SSE-NEXT: addpd %xmm5, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm3, %xmm5 -; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm6, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm4, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm6, %xmm0 -; SSE-NEXT: addpd %xmm5, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm6, %xmm4 +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm6, %xmm0 -; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd %xmm7, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd %xmm11, %xmm4 ; SSE-NEXT: movapd %xmm11, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: addpd %xmm2, %xmm3 ; SSE-NEXT: movapd %xmm8, 
%xmm2 +; SSE-NEXT: movapd %xmm8, %xmm14 ; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movapd %xmm12, %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: addpd %xmm2, %xmm5 +; SSE-NEXT: movapd %xmm9, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movapd %xmm13, %xmm6 +; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: mulpd %xmm10, %xmm1 +; SSE-NEXT: mulpd %xmm15, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulpd %xmm2, %xmm8 +; SSE-NEXT: addpd %xmm0, %xmm8 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm2, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm2, %xmm6 +; SSE-NEXT: addpd %xmm5, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: movapd %xmm9, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm15, %xmm4 -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm13, %xmm8 -; SSE-NEXT: movapd %xmm13, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm10, %xmm5 -; SSE-NEXT: movapd %xmm10, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm3 +; SSE-NEXT: addpd %xmm2, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm1, %xmm5 -; SSE-NEXT: addpd %xmm3, %xmm5 -; SSE-NEXT: movapd %xmm12, %xmm10 -; SSE-NEXT: mulpd %xmm12, %xmm0 -; SSE-NEXT: movapd %xmm14, %xmm9 -; SSE-NEXT: mulpd %xmm14, %xmm1 -; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm8, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm6 ; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm3 -; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulpd %xmm6, %xmm8 +; SSE-NEXT: addpd %xmm1, %xmm8 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm6, %xmm1 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm6, %xmm5 -; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm4, %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm3, %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm0, %xmm2 ; SSE-NEXT: addpd %xmm6, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: addpd %xmm5, %xmm6 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm0, %xmm5 ; SSE-NEXT: addpd %xmm1, %xmm5 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: addpd %xmm8, 
%xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm3 -; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm6, %xmm7 -; SSE-NEXT: addpd %xmm5, %xmm7 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm6, %xmm5 -; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm3, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulpd %xmm1, %xmm8 +; SSE-NEXT: addpd %xmm0, %xmm8 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm1, %xmm5 -; SSE-NEXT: addpd %xmm7, %xmm5 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movapd %xmm7, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; SSE-NEXT: addpd %xmm6, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm3, %xmm2 ; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: addpd %xmm5, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm3, %xmm5 -; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm4, %xmm3 -; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm7, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm7, %xmm0 -; SSE-NEXT: addpd %xmm5, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm7, %xmm0 -; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm0, %xmm7 -; SSE-NEXT: addpd %xmm2, %xmm7 -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd %xmm11, %xmm3 -; SSE-NEXT: movapd %xmm11, %xmm12 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movapd %xmm6, %xmm2 -; SSE-NEXT: mulpd %xmm1, %xmm2 -; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movapd %xmm11, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movapd %xmm13, %xmm4 -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm8, %xmm3 -; SSE-NEXT: movapd %xmm8, %xmm14 -; SSE-NEXT: mulpd %xmm0, %xmm3 
-; SSE-NEXT: movapd %xmm15, %xmm8 -; SSE-NEXT: movapd %xmm15, %xmm5 -; SSE-NEXT: mulpd %xmm1, %xmm5 -; SSE-NEXT: addpd %xmm3, %xmm5 -; SSE-NEXT: mulpd %xmm10, %xmm0 -; SSE-NEXT: mulpd %xmm9, %xmm1 -; SSE-NEXT: movapd %xmm9, %xmm10 +; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm8, %xmm3 +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm7, %xmm3 -; SSE-NEXT: addpd %xmm1, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm7, %xmm1 -; SSE-NEXT: addpd %xmm5, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm7, %xmm5 -; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: addpd %xmm2, %xmm7 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: mulpd %xmm1, %xmm7 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm0, %xmm2 -; SSE-NEXT: addpd %xmm7, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movapd %xmm4, %xmm5 +; SSE-NEXT: movapd %xmm4, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm5 -; SSE-NEXT: addpd %xmm1, %xmm5 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm5 +; SSE-NEXT: movapd %xmm14, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: movapd %xmm12, %xmm4 +; SSE-NEXT: movapd %xmm12, %xmm6 +; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: movapd %xmm9, %xmm2 +; SSE-NEXT: movapd %xmm9, %xmm12 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: movapd %xmm13, %xmm7 +; SSE-NEXT: movapd %xmm13, %xmm8 +; SSE-NEXT: mulpd %xmm0, %xmm8 +; SSE-NEXT: addpd %xmm2, %xmm8 +; SSE-NEXT: mulpd %xmm10, %xmm1 +; SSE-NEXT: movapd %xmm10, %xmm11 +; SSE-NEXT: movapd %xmm15, %xmm13 +; SSE-NEXT: mulpd %xmm15, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm7, %xmm3 -; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm7, %xmm9 -; SSE-NEXT: addpd %xmm5, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm7, %xmm5 -; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: mulpd %xmm2, %xmm9 +; SSE-NEXT: addpd %xmm0, %xmm9 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm0, %xmm7 -; SSE-NEXT: addpd %xmm2, %xmm7 +; SSE-NEXT: mulpd %xmm2, %xmm0 +; SSE-NEXT: addpd %xmm8, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulpd %xmm2, %xmm8 +; SSE-NEXT: addpd %xmm6, %xmm8 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm5, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm7, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 -; 
SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm1, %xmm7 -; SSE-NEXT: addpd %xmm9, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm2, %xmm5 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm3, %xmm2 -; SSE-NEXT: addpd %xmm1, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm7, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm3, %xmm7 -; SSE-NEXT: addpd %xmm4, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm15, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm15, %xmm0 -; SSE-NEXT: addpd %xmm7, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm15, %xmm0 -; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: addpd %xmm2, %xmm15 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd %xmm12, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm6, %xmm2 -; SSE-NEXT: movapd %xmm6, %xmm12 ; SSE-NEXT: mulpd %xmm1, %xmm2 -; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: mulpd %xmm0, %xmm11 -; SSE-NEXT: movapd %xmm13, %xmm6 -; SSE-NEXT: movapd %xmm13, %xmm4 -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm11, %xmm4 -; SSE-NEXT: mulpd %xmm0, %xmm14 -; SSE-NEXT: movapd %xmm8, %xmm7 -; SSE-NEXT: mulpd %xmm1, %xmm7 -; SSE-NEXT: addpd %xmm14, %xmm7 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: movapd %xmm10, %xmm5 -; SSE-NEXT: mulpd %xmm10, %xmm1 -; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm8, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm1, %xmm6 +; SSE-NEXT: addpd %xmm0, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm9, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm9 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm9, %xmm3 -; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: movapd %xmm0, %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm8, %xmm9 +; SSE-NEXT: addpd %xmm1, %xmm9 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm9, %xmm1 -; SSE-NEXT: addpd %xmm7, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm9, %xmm7 -; SSE-NEXT: addpd %xmm4, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: addpd %xmm2, %xmm9 +; SSE-NEXT: mulpd %xmm8, %xmm1 +; SSE-NEXT: addpd %xmm6, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm8, %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: addpd %xmm5, %xmm8 ; SSE-NEXT: 
unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm0, %xmm2 -; SSE-NEXT: addpd %xmm9, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: addpd %xmm7, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm0, %xmm7 -; SSE-NEXT: addpd %xmm1, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: addpd %xmm8, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulpd %xmm0, %xmm8 +; SSE-NEXT: addpd %xmm6, %xmm8 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: addpd %xmm1, %xmm6 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm9 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm9, %xmm3 -; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm9, %xmm10 -; SSE-NEXT: addpd %xmm7, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm9, %xmm7 -; SSE-NEXT: addpd %xmm4, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: addpd %xmm2, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: addpd %xmm9, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: movapd %xmm15, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm15[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: mulpd %xmm1, %xmm9 -; SSE-NEXT: addpd %xmm7, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm1, %xmm7 -; SSE-NEXT: addpd %xmm10, %xmm7 +; SSE-NEXT: addpd %xmm0, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm1, %xmm10 +; SSE-NEXT: addpd %xmm8, %xmm10 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movapd %xmm11, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm11[0] +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: mulpd %xmm15, %xmm2 ; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm7, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm3, %xmm7 -; SSE-NEXT: addpd %xmm9, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm11, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm11, %xmm0 -; SSE-NEXT: addpd %xmm7, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: addpd %xmm10, %xmm1 +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: addpd %xmm9, %xmm15 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm11, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movapd %xmm5, %xmm8 +; SSE-NEXT: mulpd %xmm1, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd %xmm3, %xmm9 +; SSE-NEXT: mulpd %xmm0, %xmm9 +; SSE-NEXT: addpd %xmm8, %xmm9 +; SSE-NEXT: movapd %xmm14, %xmm2 +; SSE-NEXT: movapd %xmm14, %xmm8 +; SSE-NEXT: mulpd %xmm1, %xmm8 +; SSE-NEXT: movapd %xmm4, %xmm10 +; SSE-NEXT: mulpd %xmm0, %xmm10 +; SSE-NEXT: addpd %xmm8, %xmm10 +; SSE-NEXT: movapd %xmm12, %xmm6 +; SSE-NEXT: movapd %xmm12, %xmm8 +; SSE-NEXT: mulpd %xmm1, %xmm8 +; SSE-NEXT: movapd %xmm7, %xmm12 +; SSE-NEXT: mulpd %xmm0, %xmm12 +; SSE-NEXT: addpd %xmm8, %xmm12 +; SSE-NEXT: mulpd %xmm11, %xmm1 +; SSE-NEXT: movapd %xmm13, %xmm7 +; SSE-NEXT: mulpd %xmm13, %xmm0 ; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: addpd %xmm2, %xmm11 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movapd %xmm13, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm12, %xmm2 -; SSE-NEXT: mulpd %xmm1, %xmm2 -; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movapd %xmm14, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm6, %xmm7 -; SSE-NEXT: mulpd %xmm1, %xmm7 -; SSE-NEXT: addpd %xmm3, %xmm7 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movapd %xmm6, %xmm9 -; SSE-NEXT: mulpd %xmm1, %xmm9 -; SSE-NEXT: addpd %xmm3, %xmm9 -; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: mulpd %xmm5, %xmm1 -; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm8, %xmm13 +; SSE-NEXT: addpd %xmm0, %xmm13 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm10 -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm10, %xmm3 -; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: mulpd %xmm8, %xmm0 +; SSE-NEXT: addpd %xmm12, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: mulpd %xmm10, %xmm12 -; SSE-NEXT: addpd %xmm9, %xmm12 +; SSE-NEXT: mulpd %xmm8, %xmm12 +; SSE-NEXT: addpd %xmm10, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm4, %xmm8 +; SSE-NEXT: addpd %xmm9, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm10, %xmm9 -; SSE-NEXT: addpd %xmm7, %xmm9 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: addpd %xmm2, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm1, %xmm9 +; SSE-NEXT: addpd %xmm8, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulpd %xmm1, %xmm8 +; SSE-NEXT: addpd %xmm12, %xmm8 +; SSE-NEXT: movapd 
{{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm1, %xmm10 +; SSE-NEXT: addpd %xmm0, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm13, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm12, %xmm13 +; SSE-NEXT: addpd %xmm1, %xmm13 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm12, %xmm1 ; SSE-NEXT: addpd %xmm10, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm0, %xmm10 -; SSE-NEXT: addpd %xmm9, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: addpd %xmm12, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm2, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movapd %xmm7, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm3, %xmm2 -; SSE-NEXT: addpd %xmm0, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: mulpd %xmm3, %xmm12 +; SSE-NEXT: mulpd %xmm12, %xmm10 +; SSE-NEXT: addpd %xmm8, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm4, %xmm12 ; SSE-NEXT: addpd %xmm9, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: mulpd %xmm0, %xmm8 +; SSE-NEXT: addpd %xmm12, %xmm8 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm3, %xmm9 +; SSE-NEXT: mulpd %xmm0, %xmm9 ; SSE-NEXT: addpd %xmm10, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: addpd %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm7, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm7, %xmm10 -; SSE-NEXT: addpd %xmm9, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm7, %xmm9 -; SSE-NEXT: addpd %xmm12, %xmm9 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: addpd %xmm2, %xmm7 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movapd %xmm8, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm8[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: addpd %xmm7, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: mulpd %xmm2, %xmm12 -; SSE-NEXT: addpd %xmm9, %xmm12 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm2, %xmm7 -; SSE-NEXT: addpd %xmm10, %xmm7 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm3, %xmm2 -; SSE-NEXT: addpd %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: addpd %xmm2, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: addpd %xmm7, %xmm0 -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm8, %xmm9 -; SSE-NEXT: addpd %xmm12, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm0, %xmm8 -; SSE-NEXT: addpd %xmm1, %xmm8 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd %xmm13, 
%xmm12 -; SSE-NEXT: mulpd %xmm0, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: mulpd %xmm1, %xmm3 -; SSE-NEXT: addpd %xmm12, %xmm3 -; SSE-NEXT: movapd %xmm14, %xmm12 -; SSE-NEXT: movapd %xmm14, %xmm5 -; SSE-NEXT: mulpd %xmm0, %xmm12 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: mulpd %xmm1, %xmm13 -; SSE-NEXT: addpd %xmm12, %xmm13 -; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: movapd %xmm6, %xmm14 -; SSE-NEXT: mulpd %xmm1, %xmm14 -; SSE-NEXT: addpd %xmm4, %xmm14 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: mulpd %xmm6, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: mulpd %xmm10, %xmm1 -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: mulpd %xmm0, %xmm12 -; SSE-NEXT: addpd %xmm1, %xmm12 +; SSE-NEXT: mulpd %xmm0, %xmm10 +; SSE-NEXT: addpd %xmm1, %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: addpd %xmm14, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulpd %xmm0, %xmm14 -; SSE-NEXT: addpd %xmm13, %xmm14 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm13, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm11[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: mulpd %xmm2, %xmm13 +; SSE-NEXT: mulpd %xmm12, %xmm13 ; SSE-NEXT: addpd %xmm0, %xmm13 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm2, %xmm0 -; SSE-NEXT: addpd %xmm14, %xmm0 +; SSE-NEXT: mulpd %xmm12, %xmm0 +; SSE-NEXT: addpd %xmm10, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulpd %xmm2, %xmm14 -; SSE-NEXT: addpd %xmm1, %xmm14 +; SSE-NEXT: mulpd %xmm12, %xmm14 +; SSE-NEXT: addpd %xmm9, %xmm14 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: addpd %xmm8, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm1, %xmm2 -; SSE-NEXT: addpd %xmm12, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: movapd %xmm12, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm12[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm1, %xmm3 -; SSE-NEXT: addpd %xmm2, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm1, %xmm2 -; SSE-NEXT: addpd %xmm14, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulpd %xmm1, %xmm14 -; SSE-NEXT: addpd %xmm0, %xmm14 +; SSE-NEXT: mulpd %xmm11, %xmm1 +; SSE-NEXT: addpd %xmm12, %xmm1 +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm11, %xmm10 +; SSE-NEXT: addpd %xmm14, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm11, %xmm9 +; SSE-NEXT: addpd %xmm0, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm11 +; SSE-NEXT: addpd %xmm13, %xmm11 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: mulpd %xmm12, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: 
movapd %xmm3, %xmm14 +; SSE-NEXT: mulpd %xmm0, %xmm14 +; SSE-NEXT: addpd %xmm5, %xmm14 +; SSE-NEXT: movapd %xmm2, %xmm13 +; SSE-NEXT: movapd %xmm2, %xmm4 +; SSE-NEXT: mulpd %xmm12, %xmm13 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: mulpd %xmm0, %xmm2 +; SSE-NEXT: addpd %xmm13, %xmm2 +; SSE-NEXT: movapd %xmm6, %xmm13 +; SSE-NEXT: movapd %xmm6, %xmm8 +; SSE-NEXT: mulpd %xmm12, %xmm13 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: mulpd %xmm0, %xmm1 ; SSE-NEXT: addpd %xmm13, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: mulpd %xmm5, %xmm12 +; SSE-NEXT: mulpd %xmm7, %xmm0 +; SSE-NEXT: addpd %xmm12, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: movapd %xmm12, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm13, %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm13, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm13, %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm2, %xmm13 +; SSE-NEXT: addpd %xmm14, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm12, %xmm4 -; SSE-NEXT: addpd %xmm1, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm12, %xmm2 +; SSE-NEXT: addpd %xmm13, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: mulpd %xmm12, %xmm13 -; SSE-NEXT: addpd %xmm14, %xmm13 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulpd %xmm12, %xmm14 -; SSE-NEXT: addpd %xmm2, %xmm14 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: addpd %xmm1, %xmm13 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm12, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm12 ; SSE-NEXT: addpd %xmm3, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm0, %xmm14 +; SSE-NEXT: addpd %xmm12, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm0, %xmm12 +; SSE-NEXT: addpd %xmm1, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm13, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm2, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: addpd %xmm0, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm3, %xmm13 +; SSE-NEXT: addpd %xmm1, %xmm13 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: addpd %xmm12, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: mulpd %xmm3, %xmm12 -; SSE-NEXT: addpd %xmm14, %xmm12 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm3, %xmm0 -; SSE-NEXT: addpd %xmm13, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm7, %xmm3 -; SSE-NEXT: addpd %xmm4, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: addpd %xmm14, %xmm3 
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm0[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulpd %xmm2, %xmm14 +; SSE-NEXT: mulpd %xmm12, %xmm14 ; SSE-NEXT: addpd %xmm3, %xmm14 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: mulpd %xmm2, %xmm13 -; SSE-NEXT: addpd %xmm0, %xmm13 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm12, %xmm3 +; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm12, %xmm1 +; SSE-NEXT: addpd %xmm13, %xmm1 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: addpd %xmm2, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm2, %xmm7 +; SSE-NEXT: mulpd %xmm0, %xmm7 ; SSE-NEXT: addpd %xmm12, %xmm7 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm0, %xmm2 -; SSE-NEXT: addpd %xmm1, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm0, %xmm13 +; SSE-NEXT: addpd %xmm1, %xmm13 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 ; SSE-NEXT: mulpd %xmm0, %xmm12 -; SSE-NEXT: mulpd %xmm0, %xmm5 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm12, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm12 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm5 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: mulpd %xmm10, %xmm1 -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm4 -; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm4, %xmm10 -; SSE-NEXT: addpd %xmm1, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm4, %xmm1 -; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm4, %xmm1 -; SSE-NEXT: addpd %xmm5, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: addpd %xmm12, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: addpd %xmm3, %xmm12 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: addpd %xmm4, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm12 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm0, %xmm6 -; SSE-NEXT: addpd %xmm5, %xmm6 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm14, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm3 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addpd %xmm10, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm4 -; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd 
%xmm4, %xmm5 -; SSE-NEXT: addpd %xmm0, %xmm5 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm4, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm4, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: addpd %xmm12, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulpd %xmm3, %xmm6 +; SSE-NEXT: mulpd %xmm3, %xmm4 +; SSE-NEXT: movapd %xmm8, %xmm2 +; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: mulpd %xmm5, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm4, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm10, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm10 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addpd %xmm5, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm4 -; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm4, %xmm5 -; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: mulpd %xmm1, %xmm8 +; SSE-NEXT: addpd %xmm6, %xmm8 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulpd %xmm1, %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm2 +; SSE-NEXT: mulpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm3[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm14, %xmm6 +; SSE-NEXT: addpd %xmm1, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm14, %xmm4 +; SSE-NEXT: addpd %xmm2, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm14, %xmm2 +; SSE-NEXT: addpd %xmm5, %xmm2 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: addpd %xmm8, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm4, %xmm1 -; SSE-NEXT: addpd %xmm10, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm4, %xmm10 -; SSE-NEXT: addpd %xmm6, %xmm10 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm14, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm8 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm4, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm4 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm6, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm1[0] +; SSE-NEXT: movapd 
{{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm14, %xmm2 +; SSE-NEXT: addpd %xmm3, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: mulpd %xmm14, %xmm3 ; SSE-NEXT: addpd %xmm4, %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm14, %xmm3 +; SSE-NEXT: addpd %xmm8, %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm8 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: addpd %xmm5, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm3 +; SSE-NEXT: addpd %xmm14, %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm3 +; SSE-NEXT: addpd %xmm8, %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm3 +; SSE-NEXT: addpd %xmm6, %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm3[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm14, %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm14, %xmm1 +; SSE-NEXT: addpd %xmm6, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm14, %xmm6 +; SSE-NEXT: addpd %xmm5, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: addpd %xmm4, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: addpd %xmm10, %xmm4 +; SSE-NEXT: mulpd %xmm3, %xmm4 +; SSE-NEXT: addpd %xmm14, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm3, %xmm5 +; SSE-NEXT: addpd %xmm6, %xmm5 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: mulpd %xmm3, %xmm6 ; SSE-NEXT: addpd %xmm1, %xmm6 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addpd %xmm5, %xmm0 -; SSE-NEXT: movapd %xmm3, 496(%rdi) -; SSE-NEXT: movapd %xmm4, 480(%rdi) +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm2, %xmm3 +; SSE-NEXT: movapd %xmm4, 496(%rdi) +; SSE-NEXT: movapd %xmm5, 480(%rdi) ; SSE-NEXT: movapd %xmm6, 464(%rdi) -; SSE-NEXT: movapd %xmm0, 448(%rdi) -; SSE-NEXT: movapd %xmm14, 432(%rdi) +; SSE-NEXT: movapd %xmm3, 448(%rdi) +; SSE-NEXT: movapd %xmm7, 432(%rdi) ; SSE-NEXT: movapd %xmm13, 416(%rdi) -; SSE-NEXT: movapd %xmm7, 400(%rdi) -; SSE-NEXT: movapd %xmm2, 384(%rdi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 368(%rdi) +; SSE-NEXT: movapd %xmm12, 400(%rdi) +; SSE-NEXT: movapd %xmm0, 384(%rdi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rdi) +; SSE-NEXT: movaps %xmm0, 368(%rdi) +; SSE-NEXT: movapd %xmm10, 352(%rdi) ; SSE-NEXT: movapd %xmm9, 336(%rdi) -; SSE-NEXT: movapd %xmm8, 320(%rdi) +; SSE-NEXT: movapd %xmm11, 320(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rdi) -; SSE-NEXT: movapd %xmm11, 256(%rdi) +; SSE-NEXT: movapd %xmm15, 256(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps 
%xmm0, 240(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdi) -; SSE-NEXT: movapd %xmm15, 192(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4232,376 +4170,386 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1-NEXT: pushq %rbp ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $448, %rsp # imm = 0x1C0 +; AVX1-NEXT: subq $416, %rsp # imm = 0x1A0 +; AVX1-NEXT: vmovapd %ymm5, %ymm14 ; AVX1-NEXT: vmovapd %ymm2, %ymm12 -; AVX1-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill ; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: vmovapd 144(%rbp), %ymm2 -; AVX1-NEXT: vmovapd 112(%rbp), %ymm13 -; AVX1-NEXT: vbroadcastsd 272(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm8 -; AVX1-NEXT: vmovapd %ymm1, %ymm9 -; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm15 +; AVX1-NEXT: vbroadcastsd 272(%rbp), %ymm8 +; AVX1-NEXT: vmulpd %ymm1, %ymm8, %ymm5 +; AVX1-NEXT: vmovapd %ymm1, %ymm13 +; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm9 +; AVX1-NEXT: vmovapd %ymm0, %ymm8 ; AVX1-NEXT: vbroadcastsd 280(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm11, %ymm8, %ymm1 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmovapd %ymm3, %ymm1 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm10, %ymm9, %ymm2 ; AVX1-NEXT: vbroadcastsd 288(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmovapd %ymm14, %ymm3 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 296(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 ; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 304(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 ; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 312(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; AVX1-NEXT: vmovapd %ymm13, %ymm14 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 ; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 320(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm2, %ymm13 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 ; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, 
%ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 328(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vbroadcastsd 336(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 +; AVX1-NEXT: vbroadcastsd 336(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm13, %ymm5 +; AVX1-NEXT: vmovapd %ymm13, %ymm0 ; AVX1-NEXT: vbroadcastsd 344(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm3, %ymm8 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload -; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vmovapd %ymm1, %ymm9 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm8, %ymm2 +; AVX1-NEXT: vmovapd %ymm12, %ymm13 ; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 352(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1-NEXT: vmovapd %ymm5, %ymm3 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 360(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 ; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmovapd %ymm6, %ymm12 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 368(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 16(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmovapd 16(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmovapd 48(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 376(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd 80(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 384(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 176(%rbp), %ymm14 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 392(%rbp), %ymm10 -; AVX1-NEXT: 
vmovapd 240(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vbroadcastsd 400(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmovapd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vmovapd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vbroadcastsd 400(%rbp), %ymm2 +; AVX1-NEXT: vmovapd %ymm0, %ymm1 +; AVX1-NEXT: vmulpd %ymm2, %ymm0, %ymm5 ; AVX1-NEXT: vbroadcastsd 408(%rbp), %ymm10 -; AVX1-NEXT: vmovapd %ymm8, %ymm5 -; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm8, %ymm2 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX1-NEXT: vmovapd %ymm13, %ymm6 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 416(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 ; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 -; AVX1-NEXT: vmovapd %ymm3, %ymm2 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 424(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 432(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 ; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 440(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 ; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 448(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 456(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-NEXT: vmovapd 208(%rbp), %ymm15 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vbroadcastsd 464(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 -; AVX1-NEXT: vmovapd %ymm9, %ymm13 +; AVX1-NEXT: vbroadcastsd 464(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm1, %ymm5 +; AVX1-NEXT: vmovapd %ymm1, %ymm13 ; AVX1-NEXT: vbroadcastsd 472(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 -; AVX1-NEXT: vmovapd %ymm15, %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX1-NEXT: vmovapd %ymm9, %ymm14 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm8, %ymm2 +; AVX1-NEXT: vmovapd %ymm8, %ymm9 +; AVX1-NEXT: vmovapd %ymm6, %ymm8 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 480(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm4, %ymm3 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd %ymm2, %ymm15 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmovapd %ymm4, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd %ymm3, %ymm6 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 488(%rbp), %ymm10 -; AVX1-NEXT: vmovapd %ymm7, %ymm8 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd %ymm6, %ymm7 -; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 496(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 48(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmovapd 16(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmovapd 48(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 504(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 112(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd 80(%rbp), %ymm14 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 512(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 176(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 520(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: 
vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm4 +; AVX1-NEXT: vmovapd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm1 ; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vbroadcastsd 528(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm13, %ymm1 +; AVX1-NEXT: vbroadcastsd 528(%rbp), %ymm2 +; AVX1-NEXT: vmovapd %ymm13, %ymm1 +; AVX1-NEXT: vmulpd %ymm2, %ymm13, %ymm5 ; AVX1-NEXT: vbroadcastsd 536(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm5, %ymm6 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX1-NEXT: vmovapd %ymm12, %ymm5 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmovapd %ymm14, %ymm4 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmovapd %ymm9, %ymm3 +; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm2 +; AVX1-NEXT: vmovapd %ymm8, %ymm9 +; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 544(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm3, %ymm12 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmovapd %ymm0, %ymm8 +; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 552(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX1-NEXT: vmovapd %ymm12, %ymm13 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 560(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX1-NEXT: vmovapd %ymm4, %ymm3 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 568(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 576(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 144(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 584(%rbp), %ymm10 -; 
AVX1-NEXT: vmovapd 240(%rbp), %ymm14 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovapd 208(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vbroadcastsd 592(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm13, %ymm1 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vbroadcastsd 592(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm1, %ymm5 ; AVX1-NEXT: vbroadcastsd 600(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm0 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 608(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX1-NEXT: vbroadcastsd 616(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 +; AVX1-NEXT: vbroadcastsd 616(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm7, %ymm12 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmovapd %ymm13, %ymm7 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 624(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmovapd 16(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmovapd 48(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: vbroadcastsd 632(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 112(%rbp), %ymm3 -; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd 80(%rbp), %ymm3 -; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX1-NEXT: vbroadcastsd 640(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 176(%rbp), %ymm3 -; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm13 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX1-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX1-NEXT: 
vbroadcastsd 648(%rbp), %ymm10 -; AVX1-NEXT: vmovapd %ymm14, %ymm4 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm5, %ymm11, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vbroadcastsd 656(%rbp), %ymm1 -; AVX1-NEXT: vmovapd %ymm13, %ymm3 -; AVX1-NEXT: vmulpd %ymm1, %ymm13, %ymm2 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vbroadcastsd 656(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm1, %ymm5 +; AVX1-NEXT: vmovapd %ymm1, %ymm15 ; AVX1-NEXT: vbroadcastsd 664(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm14 -; AVX1-NEXT: vmovapd %ymm6, %ymm10 -; AVX1-NEXT: vaddpd %ymm2, %ymm14, %ymm2 -; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm1 -; AVX1-NEXT: vmulpd %ymm0, %ymm5, %ymm0 -; AVX1-NEXT: vmovapd %ymm5, %ymm6 -; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vbroadcastsd 672(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm12, %ymm14 -; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 -; AVX1-NEXT: vmulpd %ymm1, %ymm15, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vbroadcastsd 680(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm14 +; AVX1-NEXT: vaddpd %ymm5, %ymm14, %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm0 +; AVX1-NEXT: vmovapd %ymm9, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vbroadcastsd 672(%rbp), %ymm2 ; AVX1-NEXT: vmulpd %ymm2, %ymm8, %ymm14 -; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX1-NEXT: vmulpd %ymm2, %ymm7, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 688(%rbp), %ymm2 -; AVX1-NEXT: vmovapd 16(%rbp), %ymm11 -; AVX1-NEXT: vmulpd %ymm2, %ymm11, %ymm14 ; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 -; AVX1-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd 696(%rbp), %ymm2 -; AVX1-NEXT: vmovapd 112(%rbp), %ymm5 -; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm14 -; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX1-NEXT: vmovapd 80(%rbp), %ymm5 -; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 704(%rbp), %ymm2 -; AVX1-NEXT: vmulpd 144(%rbp), %ymm2, %ymm14 +; AVX1-NEXT: vmovapd %ymm6, %ymm9 +; AVX1-NEXT: vmulpd %ymm2, %ymm6, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vbroadcastsd 680(%rbp), %ymm5 +; AVX1-NEXT: vmovapd %ymm12, %ymm10 +; AVX1-NEXT: vmulpd %ymm5, %ymm12, %ymm14 +; AVX1-NEXT: vaddpd %ymm2, %ymm14, %ymm2 +; AVX1-NEXT: vmulpd %ymm5, %ymm7, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 688(%rbp), %ymm5 +; AVX1-NEXT: vmulpd 16(%rbp), %ymm5, %ymm14 ; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 -; AVX1-NEXT: vmovapd 176(%rbp), %ymm13 -; AVX1-NEXT: vmulpd %ymm2, %ymm13, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd 712(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm4, %ymm14 -; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX1-NEXT: vmovapd 208(%rbp), %ymm14 -; AVX1-NEXT: vmulpd %ymm2, %ymm14, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 720(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm3, %ymm3 -; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm2 -; AVX1-NEXT: 
vbroadcastsd 728(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX1-NEXT: vmulpd %ymm4, %ymm6, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastsd 736(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm12, %ymm5 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm5, %ymm5 ; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 -; AVX1-NEXT: vmulpd %ymm4, %ymm15, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vbroadcastsd 744(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm8, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX1-NEXT: vmulpd %ymm4, %ymm7, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastsd 752(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm11, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 -; AVX1-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vbroadcastsd 760(%rbp), %ymm4 -; AVX1-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX1-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastsd 768(%rbp), %ymm4 -; AVX1-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 +; AVX1-NEXT: vbroadcastsd 696(%rbp), %ymm5 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm6 +; AVX1-NEXT: vmulpd %ymm5, %ymm6, %ymm14 +; AVX1-NEXT: vaddpd %ymm2, %ymm14, %ymm2 +; AVX1-NEXT: vmulpd 80(%rbp), %ymm5, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 704(%rbp), %ymm5 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm12 +; AVX1-NEXT: vmulpd %ymm5, %ymm12, %ymm14 +; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vmulpd %ymm5, %ymm13, %ymm5 ; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 -; AVX1-NEXT: vmulpd %ymm4, %ymm13, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vbroadcastsd 776(%rbp), %ymm4 -; AVX1-NEXT: vmulpd 240(%rbp), %ymm4, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX1-NEXT: vmulpd %ymm4, %ymm14, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovapd %ymm3, 480(%rdi) -; AVX1-NEXT: vmovapd %ymm2, 448(%rdi) -; AVX1-NEXT: vmovapd %ymm1, 416(%rdi) -; AVX1-NEXT: vmovapd %ymm0, 384(%rdi) +; AVX1-NEXT: vbroadcastsd 712(%rbp), %ymm14 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm13 +; AVX1-NEXT: vmulpd %ymm14, %ymm13, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm5 +; AVX1-NEXT: vmovapd 208(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm14, %ymm2 +; AVX1-NEXT: vmovapd %ymm1, %ymm14 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm2 +; AVX1-NEXT: vbroadcastsd 720(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vbroadcastsd 728(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm4, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 736(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm8, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd %ymm3, %ymm9, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 744(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd %ymm3, %ymm7, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 752(%rbp), %ymm3 +; AVX1-NEXT: vmulpd 16(%rbp), %ymm3, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm3, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 760(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm6, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; 
AVX1-NEXT: vmulpd 80(%rbp), %ymm3, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 768(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm12, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd 176(%rbp), %ymm3, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 776(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm13, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd %ymm3, %ymm14, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd %ymm1, 480(%rdi) +; AVX1-NEXT: vmovapd %ymm0, 448(%rdi) +; AVX1-NEXT: vmovapd %ymm5, 416(%rdi) +; AVX1-NEXT: vmovapd %ymm2, 384(%rdi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 352(%rdi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 320(%rdi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 288(%rdi) @@ -4633,376 +4581,386 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $448, %rsp # imm = 0x1C0 +; AVX2-NEXT: subq $416, %rsp # imm = 0x1A0 +; AVX2-NEXT: vmovapd %ymm5, %ymm14 ; AVX2-NEXT: vmovapd %ymm2, %ymm12 -; AVX2-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: vmovapd 144(%rbp), %ymm2 -; AVX2-NEXT: vmovapd 112(%rbp), %ymm13 -; AVX2-NEXT: vbroadcastsd 272(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm8 -; AVX2-NEXT: vmovapd %ymm1, %ymm9 -; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmovapd 144(%rbp), %ymm15 +; AVX2-NEXT: vbroadcastsd 272(%rbp), %ymm8 +; AVX2-NEXT: vmulpd %ymm1, %ymm8, %ymm5 +; AVX2-NEXT: vmovapd %ymm1, %ymm13 +; AVX2-NEXT: vmulpd %ymm0, %ymm8, %ymm9 +; AVX2-NEXT: vmovapd %ymm0, %ymm8 ; AVX2-NEXT: vbroadcastsd 280(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm11, %ymm8, %ymm1 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmovapd %ymm3, %ymm1 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm10, %ymm9, %ymm2 ; AVX2-NEXT: vbroadcastsd 288(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmovapd %ymm14, %ymm3 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 296(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 ; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 304(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 ; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 312(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; AVX2-NEXT: vmovapd %ymm13, %ymm14 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm14 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; 
AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 ; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 320(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX2-NEXT: vmovapd %ymm2, %ymm13 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 ; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 328(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 -; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 336(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 +; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 336(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm13, %ymm5 +; AVX2-NEXT: vmovapd %ymm13, %ymm0 ; AVX2-NEXT: vbroadcastsd 344(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX2-NEXT: vmovapd %ymm3, %ymm8 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX2-NEXT: vmovapd %ymm1, %ymm9 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmulpd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vmovapd %ymm12, %ymm13 ; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 352(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX2-NEXT: vmovapd %ymm5, %ymm3 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 360(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 ; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmovapd %ymm6, %ymm12 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 368(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 16(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmovapd 16(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmovapd 48(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 376(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd 80(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmovapd 80(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 
384(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovapd 176(%rbp), %ymm14 -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmovapd 176(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 392(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 240(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 -; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 400(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 +; AVX2-NEXT: vmovapd 240(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmovapd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vmovapd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 400(%rbp), %ymm2 +; AVX2-NEXT: vmovapd %ymm0, %ymm1 +; AVX2-NEXT: vmulpd %ymm2, %ymm0, %ymm5 ; AVX2-NEXT: vbroadcastsd 408(%rbp), %ymm10 -; AVX2-NEXT: vmovapd %ymm8, %ymm5 -; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmulpd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX2-NEXT: vmovapd %ymm13, %ymm6 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 416(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 ; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 -; AVX2-NEXT: vmovapd %ymm3, %ymm2 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 424(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 432(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 ; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 440(%rbp), %ymm10 -; AVX2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 ; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 448(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 
+; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 456(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 -; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 464(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 -; AVX2-NEXT: vmovapd %ymm9, %ymm13 +; AVX2-NEXT: vmovapd 208(%rbp), %ymm15 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 464(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm5 +; AVX2-NEXT: vmovapd %ymm1, %ymm13 ; AVX2-NEXT: vbroadcastsd 472(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 -; AVX2-NEXT: vmovapd %ymm15, %ymm9 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX2-NEXT: vmovapd %ymm9, %ymm14 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmulpd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vmovapd %ymm8, %ymm9 +; AVX2-NEXT: vmovapd %ymm6, %ymm8 +; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 480(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX2-NEXT: vmovapd %ymm4, %ymm3 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovapd %ymm2, %ymm15 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmovapd %ymm4, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX2-NEXT: vmovapd %ymm3, %ymm6 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 488(%rbp), %ymm10 -; AVX2-NEXT: vmovapd %ymm7, %ymm8 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd %ymm6, %ymm7 -; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 496(%rbp), %ymm10 -; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovapd 48(%rbp), %ymm4 -; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmovapd 16(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmovapd 48(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 504(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 112(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd 80(%rbp), %ymm14 -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmovapd 80(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 512(%rbp), %ymm10 -; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, 
%ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovapd 176(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmovapd 144(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 520(%rbp), %ymm10 -; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd 240(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm4 +; AVX2-NEXT: vmovapd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm1 ; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 -; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 528(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm13, %ymm1 +; AVX2-NEXT: vbroadcastsd 528(%rbp), %ymm2 +; AVX2-NEXT: vmovapd %ymm13, %ymm1 +; AVX2-NEXT: vmulpd %ymm2, %ymm13, %ymm5 ; AVX2-NEXT: vbroadcastsd 536(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 -; AVX2-NEXT: vmovapd %ymm5, %ymm6 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX2-NEXT: vmovapd %ymm12, %ymm5 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmovapd %ymm14, %ymm4 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmovapd %ymm9, %ymm3 +; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm2 +; AVX2-NEXT: vmovapd %ymm8, %ymm9 +; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 544(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX2-NEXT: vmovapd %ymm3, %ymm12 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmovapd %ymm0, %ymm8 +; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 552(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX2-NEXT: vmovapd %ymm12, %ymm13 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 560(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX2-NEXT: vmovapd %ymm4, %ymm3 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 568(%rbp), %ymm10 -; AVX2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm14 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, 
%ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 576(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 144(%rbp), %ymm4 -; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 584(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 240(%rbp), %ymm14 -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovapd 208(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm1 -; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 592(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm13, %ymm1 +; AVX2-NEXT: vbroadcastsd 592(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm5 ; AVX2-NEXT: vbroadcastsd 600(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 -; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 608(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX2-NEXT: vbroadcastsd 616(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 +; AVX2-NEXT: vbroadcastsd 616(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX2-NEXT: vmovapd %ymm7, %ymm12 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmovapd %ymm13, %ymm7 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 624(%rbp), %ymm10 -; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmovapd 16(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmovapd 48(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 632(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 112(%rbp), %ymm3 -; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd 80(%rbp), %ymm3 -; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm5 +; AVX2-NEXT: vmovapd 80(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, 
%ymm10, %ymm2 ; AVX2-NEXT: vbroadcastsd 640(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovapd 176(%rbp), %ymm3 -; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vmovapd 176(%rbp), %ymm13 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX2-NEXT: vaddpd %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vbroadcastsd 648(%rbp), %ymm10 -; AVX2-NEXT: vmovapd %ymm14, %ymm4 -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm1 -; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd 240(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm5, %ymm11, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 656(%rbp), %ymm1 -; AVX2-NEXT: vmovapd %ymm13, %ymm3 -; AVX2-NEXT: vmulpd %ymm1, %ymm13, %ymm2 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 656(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm5 +; AVX2-NEXT: vmovapd %ymm1, %ymm15 ; AVX2-NEXT: vbroadcastsd 664(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm14 -; AVX2-NEXT: vmovapd %ymm6, %ymm10 -; AVX2-NEXT: vaddpd %ymm2, %ymm14, %ymm2 -; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm1 -; AVX2-NEXT: vmulpd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovapd %ymm5, %ymm6 -; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vbroadcastsd 672(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm12, %ymm14 -; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 -; AVX2-NEXT: vmulpd %ymm1, %ymm15, %ymm1 -; AVX2-NEXT: vaddpd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vbroadcastsd 680(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm0, %ymm4, %ymm14 +; AVX2-NEXT: vaddpd %ymm5, %ymm14, %ymm5 +; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vmovapd %ymm9, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vbroadcastsd 672(%rbp), %ymm2 ; AVX2-NEXT: vmulpd %ymm2, %ymm8, %ymm14 -; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX2-NEXT: vmulpd %ymm2, %ymm7, %ymm2 -; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd 688(%rbp), %ymm2 -; AVX2-NEXT: vmovapd 16(%rbp), %ymm11 -; AVX2-NEXT: vmulpd %ymm2, %ymm11, %ymm14 ; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 -; AVX2-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2 -; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastsd 696(%rbp), %ymm2 -; AVX2-NEXT: vmovapd 112(%rbp), %ymm5 -; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm14 -; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX2-NEXT: vmovapd 80(%rbp), %ymm5 -; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd 704(%rbp), %ymm2 -; AVX2-NEXT: vmulpd 144(%rbp), %ymm2, %ymm14 +; AVX2-NEXT: vmovapd %ymm6, %ymm9 +; AVX2-NEXT: vmulpd %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vaddpd %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vbroadcastsd 680(%rbp), %ymm5 +; AVX2-NEXT: vmovapd %ymm12, %ymm10 +; AVX2-NEXT: vmulpd %ymm5, %ymm12, %ymm14 +; AVX2-NEXT: vaddpd %ymm2, %ymm14, %ymm2 +; AVX2-NEXT: vmulpd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 688(%rbp), %ymm5 +; AVX2-NEXT: vmulpd 16(%rbp), %ymm5, %ymm14 ; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 -; AVX2-NEXT: vmovapd 
176(%rbp), %ymm13 -; AVX2-NEXT: vmulpd %ymm2, %ymm13, %ymm2 -; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastsd 712(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm4, %ymm14 -; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX2-NEXT: vmovapd 208(%rbp), %ymm14 -; AVX2-NEXT: vmulpd %ymm2, %ymm14, %ymm2 -; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd 720(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm2 -; AVX2-NEXT: vbroadcastsd 728(%rbp), %ymm4 -; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vmulpd %ymm4, %ymm6, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vbroadcastsd 736(%rbp), %ymm4 -; AVX2-NEXT: vmulpd %ymm4, %ymm12, %ymm5 +; AVX2-NEXT: vmulpd 48(%rbp), %ymm5, %ymm5 ; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vmulpd %ymm4, %ymm15, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vbroadcastsd 744(%rbp), %ymm4 -; AVX2-NEXT: vmulpd %ymm4, %ymm8, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vmulpd %ymm4, %ymm7, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vbroadcastsd 752(%rbp), %ymm4 -; AVX2-NEXT: vmulpd %ymm4, %ymm11, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vbroadcastsd 760(%rbp), %ymm4 -; AVX2-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vbroadcastsd 768(%rbp), %ymm4 -; AVX2-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 +; AVX2-NEXT: vbroadcastsd 696(%rbp), %ymm5 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm6 +; AVX2-NEXT: vmulpd %ymm5, %ymm6, %ymm14 +; AVX2-NEXT: vaddpd %ymm2, %ymm14, %ymm2 +; AVX2-NEXT: vmulpd 80(%rbp), %ymm5, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 704(%rbp), %ymm5 +; AVX2-NEXT: vmovapd 144(%rbp), %ymm12 +; AVX2-NEXT: vmulpd %ymm5, %ymm12, %ymm14 +; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX2-NEXT: vmulpd %ymm5, %ymm13, %ymm5 ; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vmulpd %ymm4, %ymm13, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vbroadcastsd 776(%rbp), %ymm4 -; AVX2-NEXT: vmulpd 240(%rbp), %ymm4, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vmulpd %ymm4, %ymm14, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vmovapd %ymm3, 480(%rdi) -; AVX2-NEXT: vmovapd %ymm2, 448(%rdi) -; AVX2-NEXT: vmovapd %ymm1, 416(%rdi) -; AVX2-NEXT: vmovapd %ymm0, 384(%rdi) +; AVX2-NEXT: vbroadcastsd 712(%rbp), %ymm14 +; AVX2-NEXT: vmovapd 240(%rbp), %ymm13 +; AVX2-NEXT: vmulpd %ymm14, %ymm13, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm5 +; AVX2-NEXT: vmovapd 208(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm14, %ymm2 +; AVX2-NEXT: vmovapd %ymm1, %ymm14 +; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vbroadcastsd 720(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm1 +; AVX2-NEXT: vmulpd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vbroadcastsd 728(%rbp), %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vmulpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 736(%rbp), %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm8, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vmulpd %ymm3, %ymm9, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 744(%rbp), %ymm3 +; AVX2-NEXT: vmulpd 
%ymm3, %ymm10, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vmulpd %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 752(%rbp), %ymm3 +; AVX2-NEXT: vmulpd 16(%rbp), %ymm3, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vmulpd 48(%rbp), %ymm3, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 760(%rbp), %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm6, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vmulpd 80(%rbp), %ymm3, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 768(%rbp), %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm12, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vmulpd 176(%rbp), %ymm3, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 776(%rbp), %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm13, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vmulpd %ymm3, %ymm14, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm1, 480(%rdi) +; AVX2-NEXT: vmovapd %ymm0, 448(%rdi) +; AVX2-NEXT: vmovapd %ymm5, 416(%rdi) +; AVX2-NEXT: vmovapd %ymm2, 384(%rdi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 352(%rdi) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 320(%rdi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 288(%rdi) diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll index c0f8f86e6e8b10..682d0a800bac32 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -635,20 +635,20 @@ define i1 @length13_eq(ptr %X, ptr %Y) nounwind { ; X86-LABEL: length13_eq: ; X86: # %bb.0: ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: xorl (%ecx), %esi -; X86-NEXT: xorl 4(%ecx), %eax +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movzbl 12(%ecx), %ecx +; X86-NEXT: xorb 12(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: xorl 8(%ecx), %esi -; X86-NEXT: movzbl 12(%edx), %edx -; X86-NEXT: xorb 12(%ecx), %dl -; X86-NEXT: movzbl %dl, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -661,20 +661,20 @@ define i1 @length14_eq(ptr %X, ptr %Y) nounwind { ; X86-LABEL: length14_eq: ; X86: # %bb.0: ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: xorl (%ecx), %esi -; X86-NEXT: xorl 4(%ecx), %eax +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movzwl 12(%ecx), %ecx +; X86-NEXT: xorw 12(%eax), %cx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: orl 
%edx, %eax ; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: xorl 8(%ecx), %esi -; X86-NEXT: movzwl 12(%edx), %edx -; X86-NEXT: xorw 12(%ecx), %dx -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -687,19 +687,19 @@ define i1 @length15_eq(ptr %X, ptr %Y) nounwind { ; X86-LABEL: length15_eq: ; X86: # %bb.0: ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: xorl (%ecx), %esi -; X86-NEXT: xorl 4(%ecx), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: xorl 8(%ecx), %esi -; X86-NEXT: movl 11(%edx), %edx -; X86-NEXT: xorl 11(%ecx), %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movl 11(%ecx), %ecx +; X86-NEXT: xorl 11(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: sete %al ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -715,38 +715,38 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: movl (%eax), %esi ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: jne .LBB33_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: movl 4(%eax), %esi ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: jne .LBB33_4 ; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %ecx -; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: movl 8(%edx), %ecx +; X86-NEXT: movl 8(%eax), %esi ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: jne .LBB33_4 ; X86-NEXT: # %bb.3: # %loadbb3 -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: movl 12(%edx), %ecx +; X86-NEXT: movl 12(%eax), %esi ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: je .LBB33_5 ; X86-NEXT: .LBB33_4: # %res_block ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: orl $1, %eax ; X86-NEXT: .LBB33_5: # %endblock @@ -760,19 +760,19 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind { ; X86-NOSSE-LABEL: length16_eq: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl (%edx), %esi -; X86-NOSSE-NEXT: movl 4(%edx), %eax -; X86-NOSSE-NEXT: xorl (%ecx), %esi -; X86-NOSSE-NEXT: xorl 4(%ecx), %eax -; X86-NOSSE-NEXT: orl %esi, %eax -; X86-NOSSE-NEXT: movl 8(%edx), %esi -; X86-NOSSE-NEXT: 
xorl 8(%ecx), %esi -; X86-NOSSE-NEXT: movl 12(%edx), %edx -; X86-NOSSE-NEXT: xorl 12(%ecx), %edx -; X86-NOSSE-NEXT: orl %esi, %edx -; X86-NOSSE-NEXT: orl %eax, %edx +; X86-NOSSE-NEXT: movl (%ecx), %edx +; X86-NOSSE-NEXT: movl 4(%ecx), %esi +; X86-NOSSE-NEXT: xorl (%eax), %edx +; X86-NOSSE-NEXT: xorl 4(%eax), %esi +; X86-NOSSE-NEXT: orl %edx, %esi +; X86-NOSSE-NEXT: movl 8(%ecx), %edx +; X86-NOSSE-NEXT: xorl 8(%eax), %edx +; X86-NOSSE-NEXT: movl 12(%ecx), %ecx +; X86-NOSSE-NEXT: xorl 12(%eax), %ecx +; X86-NOSSE-NEXT: orl %edx, %ecx +; X86-NOSSE-NEXT: orl %esi, %ecx ; X86-NOSSE-NEXT: setne %al ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: retl @@ -780,19 +780,19 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind { ; X86-SSE1-LABEL: length16_eq: ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE1-NEXT: movl (%edx), %esi -; X86-SSE1-NEXT: movl 4(%edx), %eax -; X86-SSE1-NEXT: xorl (%ecx), %esi -; X86-SSE1-NEXT: xorl 4(%ecx), %eax -; X86-SSE1-NEXT: orl %esi, %eax -; X86-SSE1-NEXT: movl 8(%edx), %esi -; X86-SSE1-NEXT: xorl 8(%ecx), %esi -; X86-SSE1-NEXT: movl 12(%edx), %edx -; X86-SSE1-NEXT: xorl 12(%ecx), %edx -; X86-SSE1-NEXT: orl %esi, %edx -; X86-SSE1-NEXT: orl %eax, %edx +; X86-SSE1-NEXT: movl (%ecx), %edx +; X86-SSE1-NEXT: movl 4(%ecx), %esi +; X86-SSE1-NEXT: xorl (%eax), %edx +; X86-SSE1-NEXT: xorl 4(%eax), %esi +; X86-SSE1-NEXT: orl %edx, %esi +; X86-SSE1-NEXT: movl 8(%ecx), %edx +; X86-SSE1-NEXT: xorl 8(%eax), %edx +; X86-SSE1-NEXT: movl 12(%ecx), %ecx +; X86-SSE1-NEXT: xorl 12(%eax), %ecx +; X86-SSE1-NEXT: orl %edx, %ecx +; X86-SSE1-NEXT: orl %esi, %ecx ; X86-SSE1-NEXT: setne %al ; X86-SSE1-NEXT: popl %esi ; X86-SSE1-NEXT: retl @@ -829,38 +829,38 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: movl (%eax), %esi ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: jne .LBB35_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: movl 4(%eax), %esi ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: jne .LBB35_4 ; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %ecx -; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: movl 8(%edx), %ecx +; X86-NEXT: movl 8(%eax), %esi ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: jne .LBB35_4 ; X86-NEXT: # %bb.3: # %loadbb3 -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: movl 12(%edx), %ecx +; X86-NEXT: movl 12(%eax), %esi ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: je .LBB35_5 ; X86-NEXT: .LBB35_4: # %res_block ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: orl $1, %eax ; X86-NEXT: .LBB35_5: # %endblock @@ -877,43 +877,43 @@ define i1 
@length16_gt(ptr %x, ptr %y) nounwind { ; X86-LABEL: length16_gt: ; X86: # %bb.0: ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: movl (%edx), %eax +; X86-NEXT: movl (%ecx), %esi ; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: jne .LBB36_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: movl 4(%edx), %eax +; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: jne .LBB36_4 ; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %eax -; X86-NEXT: movl 8(%edx), %ecx +; X86-NEXT: movl 8(%edx), %eax +; X86-NEXT: movl 8(%ecx), %esi ; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: jne .LBB36_4 ; X86-NEXT: # %bb.3: # %loadbb3 -; X86-NEXT: movl 12(%esi), %eax -; X86-NEXT: movl 12(%edx), %ecx +; X86-NEXT: movl 12(%edx), %eax +; X86-NEXT: movl 12(%ecx), %esi ; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: bswapl %esi +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: je .LBB36_5 ; X86-NEXT: .LBB36_4: # %res_block -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: orl $1, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: orl $1, %ecx ; X86-NEXT: .LBB36_5: # %endblock -; X86-NEXT: testl %edx, %edx +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setg %al ; X86-NEXT: popl %esi ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index acf4d900745d3d..be5ffe54579f2f 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2385,31 +2385,31 @@ define <8 x i16> @vec128_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind { ; SSE2-LABEL: vec128_i8_signed_reg_reg: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psubb %xmm1, %xmm4 ; SSE2-NEXT: psubb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: psrlw $1, %xmm3 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 
; SSE2-NEXT: pmullw %xmm1, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm4, %xmm2 -; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: packuswb %xmm4, %xmm3 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i8_signed_reg_reg: @@ -2611,56 +2611,56 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind { ; SSE2-LABEL: vec128_i8_unsigned_reg_reg: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pminub %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pminub %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpeqb %xmm3, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: psubb %xmm3, %xmm1 +; SSE2-NEXT: psubb %xmm2, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm4, %xmm2 -; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: packuswb %xmm3, %xmm4 +; SSE2-NEXT: paddb %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i8_unsigned_reg_reg: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pminub %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pminub %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpeqb %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE41-NEXT: pmaxub %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm3, %xmm1 +; SSE41-NEXT: psubb %xmm2, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm1, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm1, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm1, %xmm4 +; SSE41-NEXT: pmullw %xmm3, %xmm2 ; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pmullw %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: packuswb %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm3, %xmm0 +; SSE41-NEXT: packuswb %xmm4, %xmm2 +; SSE41-NEXT: paddb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_unsigned_reg_reg: @@ -2846,33 +2846,33 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind { ; SSE2-LABEL: vec128_i8_signed_mem_reg: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movdqa (%rdi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psubb %xmm1, %xmm4 -; SSE2-NEXT: psubb %xmm2, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm4 +; SSE2-NEXT: psubb %xmm2, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: psrlw $1, %xmm3 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pmullw %xmm0, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; 
SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i8_signed_mem_reg: @@ -3085,56 +3085,56 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind { ; SSE2-LABEL: vec128_i8_signed_reg_mem: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psubb %xmm3, %xmm4 -; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: psubb %xmm1, %xmm4 +; SSE2-NEXT: psubb %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: psrlw $1, %xmm2 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm1, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm1, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: packuswb %xmm4, %xmm3 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i8_signed_reg_mem: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pminsb %xmm2, %xmm3 -; SSE41-NEXT: pmaxsb %xmm0, %xmm2 -; SSE41-NEXT: psubb %xmm3, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pminsb %xmm1, %xmm3 +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 +; SSE41-NEXT: psubb %xmm3, %xmm1 +; SSE41-NEXT: psrlw $1, %xmm1 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: packuswb %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: packuswb %xmm2, %xmm3 ; SSE41-NEXT: paddb %xmm3, %xmm0 ; SSE41-NEXT: retq ; @@ -3323,30 +3323,30 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE2-LABEL: vec128_i8_signed_mem_mem: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psubb %xmm3, %xmm4 -; SSE2-NEXT: psubb %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psubb %xmm2, %xmm4 +; SSE2-NEXT: psubb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrlw $1, %xmm3 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pmullw %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: packuswb %xmm4, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -3354,26 +3354,26 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-LABEL: vec128_i8_signed_mem_mem: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: movdqa (%rsi), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtb %xmm3, %xmm2 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: movdqa (%rsi), %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtb 
%xmm2, %xmm3 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pminsb %xmm3, %xmm0 -; SSE41-NEXT: pmaxsb %xmm1, %xmm3 -; SSE41-NEXT: psubb %xmm0, %xmm3 -; SSE41-NEXT: psrlw $1, %xmm3 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: pminsb %xmm2, %xmm0 +; SSE41-NEXT: pmaxsb %xmm1, %xmm2 +; SSE41-NEXT: psubb %xmm0, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: pmullw %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 4c605b10f66b6b..2663c949b15be3 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -553,45 +553,45 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwind { ; AVX1-LABEL: vec256_i64_unsigned_reg_reg: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6 ; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm6 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6 +; 
AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] ; AVX1-NEXT: # xmm8 = mem[0,0] -; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm9 ; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 ; AVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpsrlq $33, %xmm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm7 +; AVX1-NEXT: vpmuludq %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpsrlq $33, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7 ; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -2347,19 +2347,19 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; ; XOP-LABEL: vec256_i8_signed_mem_reg: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOP-NEXT: vmovdqa (%rdi), %xmm1 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %xmm2 ; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtb %xmm0, %xmm1, %xmm5 -; XOP-NEXT: vpminsb %xmm0, %xmm1, %xmm6 -; XOP-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4 +; XOP-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5 +; XOP-NEXT: vpminsb %xmm0, %xmm2, %xmm6 +; XOP-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0 ; XOP-NEXT: vpsubb %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpminsb %xmm2, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpminsb %xmm1, %xmm3, %xmm6 +; XOP-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] @@ -2371,17 +2371,17 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-NEXT: vpor %xmm7, %xmm4, 
%xmm4 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-NEXT: vpmullw %xmm7, %xmm6, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; XOP-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; XOP-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 +; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i8_signed_mem_reg: diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index 601166d67f6f27..e3c8f62a02018a 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -303,41 +303,42 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: setl %dl ; X86-NEXT: movzbl %dl, %ebx ; X86-NEXT: jl .LBB5_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: jmp .LBB5_3 ; X86-NEXT: .LBB5_1: ; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB5_3: ; X86-NEXT: negl %ebx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: orl $1, %ebp -; X86-NEXT: subl %esi, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: orl $1, %ecx +; X86-NEXT: subl %ebp, %eax ; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: shrdl $1, %edi, %eax ; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %ebp +; X86-NEXT: mull %ecx ; X86-NEXT: addl %ebx, %edx ; X86-NEXT: shrl %edi -; X86-NEXT: imull %ebp, %edi +; X86-NEXT: imull %ecx, %edi ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -376,42 +377,43 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %ecx, %ecx +; 
X86-NEXT: cmpl %ebx, %eax ; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: setb %dl -; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: testb %dl, %dl ; X86-NEXT: jne .LBB6_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: jmp .LBB6_3 ; X86-NEXT: .LBB6_1: ; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: .LBB6_3: -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: orl $1, %ebp -; X86-NEXT: subl %esi, %eax +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: orl $1, %ebx +; X86-NEXT: subl %ebp, %eax ; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: shrdl $1, %edi, %eax -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: mull %ebx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: shrl %edi -; X86-NEXT: imull %ebp, %edi +; X86-NEXT: imull %ebx, %edi ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -456,24 +458,24 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: movl 4(%edx), %esi +; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: setl %dl ; X86-NEXT: movzbl %dl, %ebx ; X86-NEXT: jl .LBB7_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: jmp .LBB7_3 ; X86-NEXT: .LBB7_1: ; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB7_3: ; X86-NEXT: negl %ebx ; X86-NEXT: movl %ebx, %ebp @@ -487,8 +489,8 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind { ; X86-NEXT: shrl %edi ; X86-NEXT: imull %ebp, %edi ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -530,42 +532,43 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl (%edx), %eax ; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: setl %dl ; X86-NEXT: movzbl %dl, %ebx ; 
X86-NEXT: jl .LBB8_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: jmp .LBB8_3 ; X86-NEXT: .LBB8_1: ; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB8_3: ; X86-NEXT: negl %ebx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: orl $1, %ebp -; X86-NEXT: subl %esi, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: orl $1, %ecx +; X86-NEXT: subl %ebp, %eax ; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: shrdl $1, %edi, %eax ; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %ebp +; X86-NEXT: mull %ecx ; X86-NEXT: addl %ebx, %edx ; X86-NEXT: shrl %edi -; X86-NEXT: imull %ebp, %edi +; X86-NEXT: imull %ecx, %edi ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -610,25 +613,25 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %esi -; X86-NEXT: movl 4(%eax), %ecx +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %esi ; X86-NEXT: movl (%edx), %eax ; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: setl %dl ; X86-NEXT: movzbl %dl, %ebx ; X86-NEXT: jl .LBB9_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: jmp .LBB9_3 ; X86-NEXT: .LBB9_1: ; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB9_3: ; X86-NEXT: negl %ebx ; X86-NEXT: movl %ebx, %ebp @@ -642,8 +645,8 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: shrl %edi ; X86-NEXT: imull %ebp, %edi ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 7dd4af76897929..dc00cafee3e223 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -1413,444 +1413,444 @@ define dso_local void @v64i1_shuffle(<64 x i8>* %x, <64 x i8>* %y) "min-legal-ve ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 -; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k0 -; CHECK-NEXT: kshiftrd $1, %k0, %k1 -; CHECK-NEXT: kshiftlq $63, %k0, %k2 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 +; CHECK-NEXT: kshiftrd $1, %k1, %k0 +; CHECK-NEXT: kshiftlq $63, %k1, %k2 ; CHECK-NEXT: kshiftrq $62, %k2, %k2 -; CHECK-NEXT: kshiftlq $63, %k1, %k1 -; CHECK-NEXT: kshiftrq $63, %k1, %k1 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: 
kshiftlq $63, %k0, %k0 +; CHECK-NEXT: kshiftrq $63, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-5, %rax ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $3, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $3, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $61, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-9, %rax ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $2, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $2, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $60, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-17, %rax ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $5, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $5, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $59, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-33, %rax ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $4, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $4, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $58, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-65, %rax ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $7, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $7, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $57, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-129, %rax ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $6, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $6, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $56, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-257, %rax # imm = 0xFEFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $9, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $9, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $55, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-513, %rax # imm = 0xFDFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $8, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $8, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $54, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-1025, %rax # imm = 0xFBFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $11, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $11, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $53, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-2049, %rax # imm = 0xF7FF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $10, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $10, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $52, %k2, 
%k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-4097, %rax # imm = 0xEFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $13, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $13, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $51, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-8193, %rax # imm = 0xDFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $12, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $12, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $50, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-16385, %rax # imm = 0xBFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $15, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $15, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $49, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-32769, %rax # imm = 0xFFFF7FFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $14, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $14, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $48, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-65537, %rax # imm = 0xFFFEFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $17, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $17, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $47, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-131073, %rax # imm = 0xFFFDFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $16, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $16, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $46, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-262145, %rax # imm = 0xFFFBFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $19, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $19, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $45, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-524289, %rax # imm = 0xFFF7FFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $18, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $18, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $44, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-1048577, %rax # imm = 0xFFEFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $21, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $21, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $43, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-2097153, %rax # imm = 0xFFDFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $20, %k0, %k2 +; 
CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $20, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $42, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-4194305, %rax # imm = 0xFFBFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $23, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $23, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $41, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-8388609, %rax # imm = 0xFF7FFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $22, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $22, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $40, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-16777217, %rax # imm = 0xFEFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $25, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $25, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $39, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-33554433, %rax # imm = 0xFDFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $24, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $24, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $38, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-67108865, %rax # imm = 0xFBFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $27, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $27, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $37, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-134217729, %rax # imm = 0xF7FFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $26, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $26, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $36, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-268435457, %rax # imm = 0xEFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $29, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $29, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $35, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-536870913, %rax # imm = 0xDFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $28, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $28, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $34, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k2, %k0, %k0 ; CHECK-NEXT: movq $-1073741825, %rax # imm = 0xBFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k1 -; CHECK-NEXT: kshiftrd $31, %k0, %k2 +; CHECK-NEXT: kandq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $31, %k1, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $33, %k2, %k2 -; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: 
korq %k2, %k0, %k0 ; CHECK-NEXT: movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k1, %k2 -; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 -; CHECK-NEXT: kshiftrd $30, %k0, %k0 -; CHECK-NEXT: kshiftlq $63, %k0, %k0 -; CHECK-NEXT: kshiftrq $32, %k0, %k0 -; CHECK-NEXT: korq %k0, %k2, %k0 +; CHECK-NEXT: kandq %k2, %k0, %k2 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k0 +; CHECK-NEXT: kshiftrd $30, %k1, %k1 +; CHECK-NEXT: kshiftlq $63, %k1, %k1 +; CHECK-NEXT: kshiftrq $32, %k1, %k1 +; CHECK-NEXT: korq %k1, %k2, %k1 ; CHECK-NEXT: movabsq $-4294967297, %rax # imm = 0xFFFFFFFEFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $1, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $1, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $31, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-8589934593, %rax # imm = 0xFFFFFFFDFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftlq $63, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $63, %k0, %k2 ; CHECK-NEXT: kshiftrq $30, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-17179869185, %rax # imm = 0xFFFFFFFBFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $3, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $3, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $29, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-34359738369, %rax # imm = 0xFFFFFFF7FFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $2, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $2, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $28, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-68719476737, %rax # imm = 0xFFFFFFEFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $5, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $5, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $27, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-137438953473, %rax # imm = 0xFFFFFFDFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $4, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $4, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $26, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-274877906945, %rax # imm = 0xFFFFFFBFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $7, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $7, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $25, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-549755813889, %rax # imm = 0xFFFFFF7FFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $6, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $6, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $24, %k2, %k2 -; 
CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-1099511627777, %rax # imm = 0xFFFFFEFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $9, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $9, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $23, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-2199023255553, %rax # imm = 0xFFFFFDFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $8, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $8, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $22, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-4398046511105, %rax # imm = 0xFFFFFBFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $11, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $11, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $21, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-8796093022209, %rax # imm = 0xFFFFF7FFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $10, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $10, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $20, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-17592186044417, %rax # imm = 0xFFFFEFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $13, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $13, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $19, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-35184372088833, %rax # imm = 0xFFFFDFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $12, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $12, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $18, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-70368744177665, %rax # imm = 0xFFFFBFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $15, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $15, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $17, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-140737488355329, %rax # imm = 0xFFFF7FFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $14, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $14, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $16, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-281474976710657, %rax # imm = 0xFFFEFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $17, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $17, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $15, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq 
%k2, %k1, %k1 ; CHECK-NEXT: movabsq $-562949953421313, %rax # imm = 0xFFFDFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $16, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $16, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $14, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-1125899906842625, %rax # imm = 0xFFFBFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $19, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $19, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $13, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-2251799813685249, %rax # imm = 0xFFF7FFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $18, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $18, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $12, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-4503599627370497, %rax # imm = 0xFFEFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $21, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $21, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $11, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-9007199254740993, %rax # imm = 0xFFDFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $20, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $20, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $10, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-18014398509481985, %rax # imm = 0xFFBFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $23, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $23, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $9, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-36028797018963969, %rax # imm = 0xFF7FFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $22, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $22, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $8, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-72057594037927937, %rax # imm = 0xFEFFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $25, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $25, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $7, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-144115188075855873, %rax # imm = 0xFDFFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $24, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $24, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $6, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; 
CHECK-NEXT: movabsq $-288230376151711745, %rax # imm = 0xFBFFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $27, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $27, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $5, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-576460752303423489, %rax # imm = 0xF7FFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $26, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $26, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $4, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-1152921504606846977, %rax # imm = 0xEFFFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $29, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $29, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $3, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-2305843009213693953, %rax # imm = 0xDFFFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $28, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $28, %k0, %k2 ; CHECK-NEXT: kshiftlq $63, %k2, %k2 ; CHECK-NEXT: kshiftrq $2, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movabsq $-4611686018427387905, %rax # imm = 0xBFFFFFFFFFFFFFFF ; CHECK-NEXT: kmovq %rax, %k2 -; CHECK-NEXT: kandq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $31, %k1, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $31, %k0, %k2 ; CHECK-NEXT: kshiftlq $62, %k2, %k2 -; CHECK-NEXT: korq %k2, %k0, %k0 -; CHECK-NEXT: kshiftrd $30, %k1, %k1 -; CHECK-NEXT: kshiftlq $1, %k0, %k0 -; CHECK-NEXT: kshiftrq $1, %k0, %k0 -; CHECK-NEXT: kshiftlq $63, %k1, %k1 -; CHECK-NEXT: korq %k1, %k0, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftrd $30, %k0, %k0 +; CHECK-NEXT: kshiftlq $1, %k1, %k1 +; CHECK-NEXT: kshiftrq $1, %k1, %k1 +; CHECK-NEXT: kshiftlq $63, %k0, %k0 +; CHECK-NEXT: korq %k0, %k1, %k1 ; CHECK-NEXT: vmovdqu8 %ymm1, (%rsi) {%k1} ; CHECK-NEXT: kshiftrq $32, %k1, %k1 ; CHECK-NEXT: vmovdqu8 %ymm0, 32(%rsi) {%k1} diff --git a/llvm/test/CodeGen/X86/misched-matmul.ll b/llvm/test/CodeGen/X86/misched-matmul.ll index a6c489dcb3dad8..0b8fe55c5f8fce 100644 --- a/llvm/test/CodeGen/X86/misched-matmul.ll +++ b/llvm/test/CodeGen/X86/misched-matmul.ll @@ -10,7 +10,7 @@ ; more complex cases. 
; ; CHECK: @wrap_mul4 -; CHECK: 24 regalloc - Number of spills inserted +; CHECK: 20 regalloc - Number of spills inserted define void @wrap_mul4(ptr nocapture %Out, ptr nocapture %A, ptr nocapture %B) #0 { entry: diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 27a9acf181ea2a..a6116175226ea1 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -144,28 +144,28 @@ define void @test1(ptr %A, ptr %B) { ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: paddd %xmm1, %xmm0 -; X32-NEXT: movq %xmm0, (%eax) -; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X32-NEXT: pmuludq %xmm1, %xmm0 -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X32-NEXT: pmuludq %xmm1, %xmm2 -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: pand %xmm0, %xmm1 +; X32-NEXT: paddd %xmm0, %xmm1 +; X32-NEXT: movq %xmm1, (%eax) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X32-NEXT: pmuludq %xmm0, %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X32-NEXT: pmuludq %xmm0, %xmm2 +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X32-NEXT: movq %xmm1, (%eax) ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: por %xmm1, %xmm0 +; X32-NEXT: pand %xmm1, %xmm0 ; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: pxor %xmm0, %xmm1 +; X32-NEXT: por %xmm0, %xmm1 ; X32-NEXT: movq %xmm1, (%eax) +; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pxor %xmm1, %xmm0 +; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: emms ; X32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/mmx-fold-zero.ll b/llvm/test/CodeGen/X86/mmx-fold-zero.ll index d40146453cff06..7e2e1a3d8850f8 100644 --- a/llvm/test/CodeGen/X86/mmx-fold-zero.ll +++ b/llvm/test/CodeGen/X86/mmx-fold-zero.ll @@ -10,34 +10,33 @@ define double @mmx_zero(double, double, double, double) nounwind { ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $16, %esp ; X86-NEXT: movq 8(%ebp), %mm0 -; X86-NEXT: movq 16(%ebp), %mm5 -; X86-NEXT: movq %mm5, (%esp) # 8-byte Spill -; X86-NEXT: movq %mm0, %mm3 -; X86-NEXT: paddd %mm5, %mm3 -; X86-NEXT: pxor %mm1, %mm1 -; X86-NEXT: movq %mm3, %mm6 -; X86-NEXT: pmuludq %mm1, %mm6 -; X86-NEXT: movq 24(%ebp), %mm4 -; X86-NEXT: movq %mm6, %mm2 +; X86-NEXT: movq 16(%ebp), %mm4 +; X86-NEXT: movq %mm4, (%esp) # 8-byte Spill +; X86-NEXT: movq %mm0, %mm2 ; X86-NEXT: paddd %mm4, %mm2 -; X86-NEXT: paddw %mm2, %mm0 -; X86-NEXT: movq %mm5, %mm1 -; X86-NEXT: paddw %mm0, %mm1 -; X86-NEXT: movq 32(%ebp), %mm5 -; X86-NEXT: movq %mm1, %mm7 -; X86-NEXT: pmuludq %mm5, %mm7 -; X86-NEXT: paddw %mm4, %mm7 -; X86-NEXT: paddw %mm7, %mm5 -; X86-NEXT: paddw %mm5, %mm2 -; X86-NEXT: paddw %mm2, %mm0 -; X86-NEXT: paddw %mm6, %mm0 -; X86-NEXT: pmuludq %mm3, %mm0 +; X86-NEXT: pxor %mm3, %mm3 +; X86-NEXT: movq %mm2, %mm5 +; X86-NEXT: pmuludq %mm3, %mm5 +; X86-NEXT: movq 24(%ebp), %mm6 +; X86-NEXT: movq %mm5, %mm3 +; X86-NEXT: paddd %mm6, %mm3 +; X86-NEXT: paddw %mm3, %mm0 +; X86-NEXT: 
paddw %mm0, %mm4 +; X86-NEXT: movq 32(%ebp), %mm1 +; X86-NEXT: movq %mm4, %mm7 +; X86-NEXT: pmuludq %mm1, %mm7 +; X86-NEXT: paddw %mm6, %mm7 +; X86-NEXT: paddw %mm7, %mm1 +; X86-NEXT: paddw %mm1, %mm3 +; X86-NEXT: paddw %mm3, %mm0 +; X86-NEXT: paddw %mm5, %mm0 +; X86-NEXT: pmuludq %mm2, %mm0 ; X86-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 -; X86-NEXT: paddw %mm1, %mm0 +; X86-NEXT: paddw %mm4, %mm0 ; X86-NEXT: pmuludq %mm7, %mm0 ; X86-NEXT: pmuludq (%esp), %mm0 # 8-byte Folded Reload -; X86-NEXT: paddw %mm5, %mm0 -; X86-NEXT: paddw %mm2, %mm0 +; X86-NEXT: paddw %mm1, %mm0 +; X86-NEXT: paddw %mm3, %mm0 ; X86-NEXT: movq2dq %mm0, %xmm0 ; X86-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: fldl {{[0-9]+}}(%esp) @@ -48,34 +47,33 @@ define double @mmx_zero(double, double, double, double) nounwind { ; X64-LABEL: mmx_zero: ; X64: # %bb.0: ; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: movdq2q %xmm1, %mm5 -; X64-NEXT: movq %mm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %mm0, %mm3 -; X64-NEXT: paddd %mm5, %mm3 -; X64-NEXT: pxor %mm1, %mm1 -; X64-NEXT: movq %mm3, %mm6 -; X64-NEXT: pmuludq %mm1, %mm6 -; X64-NEXT: movdq2q %xmm2, %mm4 -; X64-NEXT: movq %mm6, %mm2 +; X64-NEXT: movdq2q %xmm1, %mm4 +; X64-NEXT: movq %mm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %mm0, %mm2 ; X64-NEXT: paddd %mm4, %mm2 -; X64-NEXT: paddw %mm2, %mm0 -; X64-NEXT: movq %mm5, %mm1 -; X64-NEXT: paddw %mm0, %mm1 -; X64-NEXT: movdq2q %xmm3, %mm5 -; X64-NEXT: movq %mm1, %mm7 -; X64-NEXT: pmuludq %mm5, %mm7 -; X64-NEXT: paddw %mm4, %mm7 -; X64-NEXT: paddw %mm7, %mm5 -; X64-NEXT: paddw %mm5, %mm2 -; X64-NEXT: paddw %mm2, %mm0 -; X64-NEXT: paddw %mm6, %mm0 -; X64-NEXT: pmuludq %mm3, %mm0 +; X64-NEXT: pxor %mm3, %mm3 +; X64-NEXT: movq %mm2, %mm5 +; X64-NEXT: pmuludq %mm3, %mm5 +; X64-NEXT: movdq2q %xmm2, %mm6 +; X64-NEXT: movq %mm5, %mm3 +; X64-NEXT: paddd %mm6, %mm3 +; X64-NEXT: paddw %mm3, %mm0 +; X64-NEXT: paddw %mm0, %mm4 +; X64-NEXT: movdq2q %xmm3, %mm1 +; X64-NEXT: movq %mm4, %mm7 +; X64-NEXT: pmuludq %mm1, %mm7 +; X64-NEXT: paddw %mm6, %mm7 +; X64-NEXT: paddw %mm7, %mm1 +; X64-NEXT: paddw %mm1, %mm3 +; X64-NEXT: paddw %mm3, %mm0 +; X64-NEXT: paddw %mm5, %mm0 +; X64-NEXT: pmuludq %mm2, %mm0 ; X64-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0 -; X64-NEXT: paddw %mm1, %mm0 +; X64-NEXT: paddw %mm4, %mm0 ; X64-NEXT: pmuludq %mm7, %mm0 ; X64-NEXT: pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; X64-NEXT: paddw %mm5, %mm0 -; X64-NEXT: paddw %mm2, %mm0 +; X64-NEXT: paddw %mm1, %mm0 +; X64-NEXT: paddw %mm3, %mm0 ; X64-NEXT: movq2dq %mm0, %xmm0 ; X64-NEXT: retq %5 = bitcast double %0 to x86_mmx diff --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll index beb2dba05e85ac..14eced1f144a3c 100644 --- a/llvm/test/CodeGen/X86/mul-constant-result.ll +++ b/llvm/test/CodeGen/X86/mul-constant-result.ll @@ -612,8 +612,8 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $8, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $8, %esi ; X86-NEXT: pushl $4 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $9 @@ -621,9 +621,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl $9, %esi -; X86-NEXT: orl %ebx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $9, %ebx +; X86-NEXT: orl %esi, 
%ebx ; X86-NEXT: pushl $5 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $10 @@ -631,9 +631,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $10, %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: xorl $10, %ebp +; X86-NEXT: orl %ebx, %ebp ; X86-NEXT: pushl $5 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $11 @@ -643,7 +643,7 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl $11, %esi -; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %ebp, %esi ; X86-NEXT: orl %edi, %esi ; X86-NEXT: pushl $6 ; X86-NEXT: .cfi_adjust_cfa_offset 4 @@ -652,8 +652,8 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $12, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $12, %edi ; X86-NEXT: pushl $6 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $13 @@ -661,9 +661,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $13, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $13, %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $7 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $14 @@ -671,9 +671,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $14, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $14, %edi +; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $7 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $15 @@ -681,9 +681,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: xorl $15, %ebp -; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $15, %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $8 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $16 @@ -693,7 +693,7 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi ; X86-NEXT: xorl $16, %edi -; X86-NEXT: orl %ebp, %edi +; X86-NEXT: orl %ebx, %edi ; X86-NEXT: orl %esi, %edi ; X86-NEXT: pushl $8 ; X86-NEXT: .cfi_adjust_cfa_offset 4 @@ -702,8 +702,8 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $17, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $17, %esi ; X86-NEXT: pushl $9 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $18 @@ -711,9 +711,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl $18, %esi -; X86-NEXT: orl %ebx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $18, %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: pushl $9 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $19 @@ -721,9 +721,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: 
movl %eax, %ebx -; X86-NEXT: xorl $19, %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $19, %esi +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: pushl $10 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $20 @@ -731,9 +731,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl $20, %esi -; X86-NEXT: orl %ebx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $20, %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: pushl $10 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $21 @@ -741,9 +741,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $21, %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: xorl $21, %ebp +; X86-NEXT: orl %ebx, %ebp ; X86-NEXT: pushl $11 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $22 @@ -753,7 +753,7 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl $22, %esi -; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %ebp, %esi ; X86-NEXT: orl %edi, %esi ; X86-NEXT: pushl $11 ; X86-NEXT: .cfi_adjust_cfa_offset 4 @@ -762,8 +762,8 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $23, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $23, %edi ; X86-NEXT: pushl $12 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $24 @@ -771,9 +771,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $24, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $24, %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $12 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $25 @@ -781,9 +781,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $25, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $25, %edi +; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $13 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $26 @@ -791,9 +791,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $26, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $26, %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $13 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $27 @@ -801,9 +801,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $27, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $27, %edi +; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $14 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $28 @@ -811,9 +811,9 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: xorl $28, %ebp -; X86-NEXT: orl %ebx, %ebp +; 
X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $28, %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $14 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $29 @@ -823,7 +823,7 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi ; X86-NEXT: xorl $29, %edi -; X86-NEXT: orl %ebp, %edi +; X86-NEXT: orl %ebx, %edi ; X86-NEXT: orl %esi, %edi ; X86-NEXT: pushl $15 ; X86-NEXT: .cfi_adjust_cfa_offset 4 @@ -832,8 +832,8 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $30, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $30, %esi ; X86-NEXT: pushl $15 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $31 @@ -841,10 +841,10 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl $31, %esi -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $31, %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $16 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $32 @@ -854,7 +854,7 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: xorl $32, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: setne %cl ; X86-NEXT: negl %ecx ; X86-NEXT: movl %ecx, %eax @@ -898,33 +898,33 @@ define i32 @foo() local_unnamed_addr #0 { ; X64-HSW-NEXT: movl $3, %edi ; X64-HSW-NEXT: movl $1, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %r14d -; X64-HSW-NEXT: xorl $3, %r14d +; X64-HSW-NEXT: movl %eax, %ebx +; X64-HSW-NEXT: xorl $3, %ebx ; X64-HSW-NEXT: movl $4, %edi ; X64-HSW-NEXT: movl $2, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $4, %ebx -; X64-HSW-NEXT: orl %r14d, %ebx -; X64-HSW-NEXT: orl %ebp, %ebx +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $4, %r14d +; X64-HSW-NEXT: orl %ebx, %r14d +; X64-HSW-NEXT: orl %ebp, %r14d ; X64-HSW-NEXT: movl $5, %edi ; X64-HSW-NEXT: movl $2, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebp -; X64-HSW-NEXT: xorl $5, %ebp +; X64-HSW-NEXT: movl %eax, %ebx +; X64-HSW-NEXT: xorl $5, %ebx ; X64-HSW-NEXT: movl $6, %edi ; X64-HSW-NEXT: movl $3, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %r14d -; X64-HSW-NEXT: xorl $6, %r14d -; X64-HSW-NEXT: orl %ebp, %r14d +; X64-HSW-NEXT: movl %eax, %r15d +; X64-HSW-NEXT: xorl $6, %r15d +; X64-HSW-NEXT: orl %ebx, %r15d ; X64-HSW-NEXT: movl $7, %edi ; X64-HSW-NEXT: movl $3, %esi ; X64-HSW-NEXT: callq mult@PLT ; X64-HSW-NEXT: movl %eax, %ebp ; X64-HSW-NEXT: xorl $7, %ebp +; X64-HSW-NEXT: orl %r15d, %ebp ; X64-HSW-NEXT: orl %r14d, %ebp -; X64-HSW-NEXT: orl %ebx, %ebp ; X64-HSW-NEXT: movl $8, %edi ; X64-HSW-NEXT: movl $4, %esi ; X64-HSW-NEXT: callq mult@PLT diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll index 6829356bf107eb..2f26a56212f3b7 100644 --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -10,410 +10,482 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: subl $400, %esp # imm = 0x190 -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: 
movl 60(%eax), %ebp +; X32-NEXT: movl 60(%eax), %ebx ; X32-NEXT: movl 56(%eax), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%edx), %esi +; X32-NEXT: movl (%ecx), %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl 4(%ebx), %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 4(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 48(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 52(%ecx), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 48(%esi), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl 52(%esi), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; 
X32-NEXT: setb %bl +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 8(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl 8(%eax), %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ecx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: movl 12(%eax), %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; 
X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 40(%ecx), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 40(%edi), %ebx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl 44(%edi), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: 
movl 44(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 32(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 32(%edi), %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 36(%ecx), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 36(%edi), %eax +; X32-NEXT: movl %eax, %edi ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, 
%esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X32-NEXT: adcl %edi, %eax -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload +; X32-NEXT: adcl %ebx, %eax +; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 16(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 16(%ecx), %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 20(%eax), %ecx -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl 20(%ecx), %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull 
%ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 24(%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl 24(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 28(%eax), %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %bl, %edi +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: 
adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edi, %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl $0, %eax +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -423,378 +495,428 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, %ebx ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: addl %ebp, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, 
%eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %eax +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 24(%edi), %ebx ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl 28(%edi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl 
%ecx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 16(%esi), %edi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl 20(%esi), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ecx +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ebx +; 
X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 24(%ecx), %ebx +; X32-NEXT: movl 8(%ecx), %ebx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl 12(%ecx), %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 28(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl (%ebx), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
-; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 20(%ecx), %eax +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 4(%ebx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %ebp +; X32-NEXT: 
movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 8(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 12(%ecx), %ebp +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; 
X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: adcl %edi, %eax +; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl (%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 4(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; 
X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebx +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload @@ -804,367 +926,218 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %ebp, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %esi -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: 
movl %edx, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movl %ebx, %edx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movzbl %bl, %edi +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X32-NEXT: adcl %esi, %eax -; X32-NEXT: adcl $0, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: 
mull %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edi, %esi ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; 
X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: addl %ebp, %edx +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edx, %eax -; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; 
X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp ; X32-NEXT: movl %ebp, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: adcl %ebx, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: adcl %eax, %edi +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb (%esp) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1175,14 +1148,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl $0, %eax ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1191,15 +1163,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1208,573 +1180,568 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 32(%edi), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 32(%ecx), %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 36(%eax), %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %edi +; X32-NEXT: movl %esi, %ecx ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; 
X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb %cl ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 40(%eax), %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 44(%eax), %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edx ; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 44(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp ; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx +; X32-NEXT: addl %ecx, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; 
X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; 
X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %edi -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded 
Reload ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movzbl %bl, %edi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movl %ebx, %edx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: adcl %ebp, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X32-NEXT: adcl %edi, %eax ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 48(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl 48(%eax), %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 52(%eax), %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, %ebx -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl 52(%eax), %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebx ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 56(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl 56(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: 
mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl 60(%esi), %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl 60(%ebp), %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %edx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: adcl %edi, %edx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 
1-byte Folded Reload +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: adcl %esi, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %esi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edi, %esi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %eax, %edi +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %ebp, %edi +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl %bl, %ecx +; 
X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %ecx, %ebp -; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1785,13 +1752,12 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1803,7 +1769,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1814,11 +1780,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1828,279 +1793,274 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), 
%ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb %cl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %esi -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %edi, %ebp -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb (%esp) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 
4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: 
adcl %edi, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: mull %ecx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebp, %esi ; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx 
# 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: adcl %ebp, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movzbl %bl, %edi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: adcl %edi, %esi -; X32-NEXT: movl (%esp), %edx # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl %ebp, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X32-NEXT: adcl %edi, %eax ; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; 
X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -2111,289 +2071,280 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb 
{{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edi, %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %ebp, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx ; 
X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %bl, %edi +; X32-NEXT: adcl %edi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: addl %ebp, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: adcl $0, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edi, %esi ; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, %esi -; X32-NEXT: mull %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %bl, %esi +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %esi, %ebp -; X32-NEXT: adcl %eax, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload @@ -2407,9 +2358,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -2421,7 +2370,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -2443,283 +2392,276 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 64(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl 64(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 68(%eax), %edi +; X32-NEXT: movl 68(%eax), %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 72(%eax), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl 72(%eax), %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 76(%eax), %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl 
%cl, %eax +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb %cl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %edi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl (%esp), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull (%esp) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %ebp, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X32-NEXT: adcl %edi, %eax -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl $0, %edx ; X32-NEXT: 
addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -2727,8 +2669,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 80(%eax), %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -2736,281 +2678,276 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 84(%eax), %ecx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ecx, %ebx ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx -; 
X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 88(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 92(%eax), %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 88(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 92(%eax), %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: addl %ebp, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edx, %eax -; X32-NEXT: adcl $0, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl 
%edi, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edi, %ecx ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %eax +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebx, %ebp ; X32-NEXT: mull %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; 
X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %ecx ; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; 
X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %ebp, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi +; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -3023,100 +2960,101 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) 
nounwind { ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, 
%eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %ebp, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %esi, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ebp -; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %edi @@ -3124,243 +3062,243 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx +; 
X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: imull %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %eax -; X32-NEXT: addl %edx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: imull %ecx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %edi, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: imull %ebx, %eax +; X32-NEXT: addl %edx, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %ebp, %esi ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: imull %eax, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %ecx +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: imull %esi, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %ebx, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %edx +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: addl %edx, %ebp -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %edx +; 
X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %edi, %ecx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb %cl +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: mull %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %bl, %edi +; X32-NEXT: adcl %edi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 104(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; 
X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 108(%ecx), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 104(%edi), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl 108(%edi), %ebx ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %ebx +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 96(%esi), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl 96(%esi), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 100(%esi), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl 100(%esi), %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ecx @@ -3375,53 +3313,54 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; 
X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 112(%esi), %edi -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: imull %edi, %ecx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 112(%ecx), %ebx +; X32-NEXT: imull %ebx, %edi +; X32-NEXT: movl %edi, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: movl 116(%esi), %eax +; X32-NEXT: addl %esi, %edx +; X32-NEXT: movl 116(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull %eax, %ebx -; X32-NEXT: addl %edx, %ebx -; X32-NEXT: movl 120(%esi), %eax +; X32-NEXT: imull %eax, %edi +; X32-NEXT: addl %edx, %edi +; X32-NEXT: movl 120(%ecx), %eax ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: imull %esi, %ecx @@ -3434,99 +3373,98 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %ebp, %edi +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: mull %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %esi ; X32-NEXT: adcl %esi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: addl %edx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %edi +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %esi +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %edx +; X32-NEXT: imull (%esp), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %edx -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %ebx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %eax, %ecx +; X32-NEXT: imull %edi, %ecx ; X32-NEXT: addl %edx, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ecx ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: adcl %ebp, %edi -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %bl, %edi -; X32-NEXT: adcl %edi, %edx +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx ; 
X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, %ebx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -3536,11 +3474,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -3549,73 +3487,74 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 88(%ecx), %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 92(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 88(%esi), %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 92(%esi), %ebp ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 80(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 80(%esi), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 84(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl 84(%esi), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: adcl %ebx, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -3623,346 +3562,341 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %ebp, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 72(%ecx), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 72(%esi), %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 76(%esi), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl 64(%ebx), %esi +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl 76(%ecx), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; 
X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl 68(%ebx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %edi ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 64(%esi), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl 68(%esi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: addl %esi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %ebp, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: addl %edi, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %eax, %esi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb %cl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %edi, %eax -; X32-NEXT: 
mull %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %edi, %esi +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi ; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %edi -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: adcl %ebx, %edx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %bl, %ebx +; X32-NEXT: adcl %ebx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx @@ -3974,15 +3908,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %esi, %ecx +; X32-NEXT: movl %ebx, %ecx ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -3990,503 +3924,495 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx +; X32-NEXT: addl %ebx, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: adcl $0, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb (%esp) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, 
%eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edi +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 96(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 96(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 100(%eax), %esi +; X32-NEXT: movl 100(%eax), %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ebp 
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 104(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; 
X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 108(%eax), %ecx +; X32-NEXT: movl 108(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl %eax, %ebp ; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: addl %edx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: imull %eax, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %edi, %esi +; X32-NEXT: imull %ecx, %esi ; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl 120(%ebx), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 120(%edi), %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl 124(%ebx), %eax +; X32-NEXT: movl 124(%edi), %eax ; X32-NEXT: imull %ecx, %eax ; X32-NEXT: addl %eax, %esi -; X32-NEXT: movl 112(%ebx), %edi -; X32-NEXT: movl 116(%ebx), %ebp +; X32-NEXT: movl 112(%edi), %ecx +; X32-NEXT: movl 116(%edi), %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: imull %ebp, %ebx ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: addl %ebx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %edi, %ecx -; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: imull %ecx, %edi +; X32-NEXT: addl %edx, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi +; X32-NEXT: addl %ebx, %edi ; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, (%esp) # 4-byte Spill +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb %cl +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, 
%edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4497,100 +4423,98 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %edi, %esi -; X32-NEXT: imull %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: addl %edx, %eax +; X32-NEXT: imull %eax, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl %eax, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %edx +; X32-NEXT: imull %ebx, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %edi, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; 
X32-NEXT: imull %esi, %edi +; X32-NEXT: addl %edx, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %esi, %ecx -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: imull %eax, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: addl %edx, %ebx +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl %edx, %esi +; X32-NEXT: movl %esi, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %edi, %ecx +; X32-NEXT: imull %edi, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %edx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: imull %ebp, %ecx ; X32-NEXT: addl %edx, %ecx @@ -4599,62 +4523,62 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl %edi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %edx -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %eax, %ebp ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload @@ -4667,27 +4591,60 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4706,109 +4663,75 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, (%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 4(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 8(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 12(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 16(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 20(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 24(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 28(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 32(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 
36(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 40(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 44(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 48(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 52(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 56(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 60(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 64(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 68(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 72(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 76(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 80(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 84(%ecx) -; X32-NEXT: movl %ebp, 88(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 92(%ecx) -; X32-NEXT: movl %ebx, 96(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 100(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 104(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 108(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 112(%ecx) -; X32-NEXT: movl %edi, 116(%ecx) -; X32-NEXT: movl %edx, 120(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 124(%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, (%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 4(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 8(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 12(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 16(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 20(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 24(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 28(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 32(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 36(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 40(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 44(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 48(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 52(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 56(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, 60(%esi) +; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 64(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 68(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 72(%esi) +; X32-NEXT: movl %ebx, 76(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 80(%esi) +; X32-NEXT: movl %ebp, 84(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 88(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 92(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 96(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 100(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 104(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 108(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 112(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, 116(%esi) +; X32-NEXT: movl %edx, 120(%esi) +; X32-NEXT: movl %eax, 124(%esi) ; X32-NEXT: addl $400, %esp # imm = 0x190 ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi @@ -4824,44 +4747,44 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: subq $240, %rsp +; X64-NEXT: subq $232, %rsp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq 40(%rdi), %rbx -; X64-NEXT: movq 32(%rdi), %r12 -; X64-NEXT: movq 56(%rdi), %r15 +; X64-NEXT: movq 32(%rdi), %rbp +; X64-NEXT: movq 56(%rdi), %r14 ; X64-NEXT: movq 48(%rdi), %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rsi), %r11 -; X64-NEXT: movq 8(%rsi), %r14 -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq 8(%rsi), %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r9, %r8 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r9, %rcx ; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %r9d -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r10, %rsi -; X64-NEXT: adcq %r9, %rcx -; X64-NEXT: movq %r12, %rax +; X64-NEXT: adcq %r9, %r13 +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r9 @@ -4872,539 +4795,542 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) 
nounwind { ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %r9, %r11 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: adcq %r10, %r14 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %rbx, %r11 ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r9, %r15 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r14, %rbx ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: addq %rdi, %r15 -; X64-NEXT: adcq %r8, %rbx +; X64-NEXT: adcq %rax, %r9 +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r13), %r8 -; X64-NEXT: movq %r12, %r10 +; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq 16(%r12), %r8 +; X64-NEXT: movq %rbp, %r10 +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %rbp ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rdi, %r12 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq 24(%r13), %rbp +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rdi, %r14 +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq 24(%r12), %r15 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: addq %r12, %rax -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: adcq %r9, %r13 +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: adcq %r11, %r12 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r13, %r9 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r12, %r11 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %r15, %r14 +; X64-NEXT: addq %rbx, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r9, %r14 ; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %r12 -; X64-NEXT: movq %r12, (%rsp) # 8-byte Spill -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r11 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq %rcx, %rdi +; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: adcq %r13, %rdi ; X64-NEXT: setb %r10b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; 
X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %rbp ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, %r9 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %r9, %r11 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rdi, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %sil, %ecx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %r11, %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rdi, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %r10b, %ecx +; X64-NEXT: adcq %rcx, %rax +; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq 16(%r14), %r11 -; X64-NEXT: movq %r11, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 24(%r14), %r8 +; X64-NEXT: movq 16(%r10), %r8 ; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 24(%r10), %r9 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %rcx, %rdi ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rbx -; X64-NEXT: adcq %rsi, %r15 -; X64-NEXT: setb %sil ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rbp, %r13 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rdi, %r11 +; X64-NEXT: adcq %rsi, %rbx +; X64-NEXT: setb %sil +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r15, %rdi +; X64-NEXT: addq %rbx, %rdi ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: movq (%r14), %rbp -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: movq (%r10), %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq 8(%r14), 
%r14 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rsi, %r12 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq 8(%r10), %r12 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rsi, %r14 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %r12, %rax +; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rsi +; X64-NEXT: adcq %rbx, %rsi ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r14, %r15 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rsi, %r13 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rsi, %r14 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %r9, %r13 -; X64-NEXT: adcq %rbx, %r12 +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq %r15, %r14 +; X64-NEXT: adcq %r11, %rbx ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rbp, %r8 +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r12, %rbp +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rsi, %r11 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %r9, %rbp -; X64-NEXT: setb %r9b ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: addq %r11, %rax +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: adcq %r9, %r12 +; X64-NEXT: setb %r9b +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rbp, %rsi +; X64-NEXT: addq %r12, %rsi ; X64-NEXT: movzbl %r9b, %eax -; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %r13, %r10 +; X64-NEXT: adcq %rax, %r11 +; X64-NEXT: addq %r14, %r10 ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r15 +; X64-NEXT: adcq $0, %r11 ; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: adcq %rcx, %r15 +; X64-NEXT: adcq %rcx, %r11 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r8, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; 
X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rcx, %r12 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, %rbp -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r12, %r11 -; X64-NEXT: adcq %rdi, %r13 -; X64-NEXT: setb %dil -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r9 -; X64-NEXT: mulq %r14 -; X64-NEXT: addq %r13, %rax -; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r14, %rbx +; X64-NEXT: adcq %rdi, %rcx +; X64-NEXT: setb %bpl +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r12, %r8 +; X64-NEXT: mulq %r15 +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movzbl %bpl, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: adcq %r15, %r11 +; X64-NEXT: addq %rsi, %r9 +; X64-NEXT: adcq %r11, %rbx ; X64-NEXT: movzbl %r10b, %ecx ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload ; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq (%rsp), %rdx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; X64-NEXT: adcq $0, (%rsp) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq 32(%r8), %rcx -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq 32(%rsi), %rdi +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 40(%r8), %rsi -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %r8, %r10 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq 40(%rsi), %r9 +; X64-NEXT: movq %rsi, %r15 
+; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r11, %rsi -; X64-NEXT: adcq %rdi, %r15 -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: addq %r14, %rsi +; X64-NEXT: adcq %r11, %r13 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r15, %r11 -; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: addq %r13, %r11 +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r15, %rbp +; X64-NEXT: addq %r14, %rbp ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %rbx, %rcx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %rdi +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill -; X64-NEXT: adcq %r13, %r10 -; X64-NEXT: setb %bl -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r13, %r8 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %r13 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r10, %rbp -; X64-NEXT: movzbl %bl, %eax -; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: addq %r8, %rbp +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %r14 ; X64-NEXT: addq %r12, %rbp -; X64-NEXT: adcq %rsi, %r15 +; X64-NEXT: adcq %rsi, %r14 ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 48(%r8), %rcx -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq 48(%r15), %r12 +; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, %r12 -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rsi, %r13 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq 56(%r8), %rsi -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r13, %r9 -; X64-NEXT: adcq %r10, %r14 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r13, %r9 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rsi, %r10 +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq 
56(%r15), %rdi +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: addq %r10, %rax +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: adcq %r13, %r15 ; X64-NEXT: setb %r8b -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r14, %r13 +; X64-NEXT: addq %r15, %r13 ; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rsi ; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: adcq %r15, %r9 +; X64-NEXT: movq %rbx, %r8 +; X64-NEXT: adcq %r14, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: addq %r11, %r13 -; X64-NEXT: adcq %rdi, %rsi -; X64-NEXT: setb %r11b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rdi, %r14 +; X64-NEXT: addq %rcx, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r14, %rbp -; X64-NEXT: adcq %r10, %r8 -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r8, %rdi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r14, %rcx +; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r15, %r14 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %r13, %r12 -; X64-NEXT: adcq %rsi, %rbp -; X64-NEXT: movzbl %r11b, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: adcq $0, %r15 +; X64-NEXT: adcq %rax, %rbp +; X64-NEXT: addq %r13, %r9 +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: adcq %rax, (%rsp) # 8-byte Folded Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r12 +; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; X64-NEXT: adcq 
{{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq (%rsp), %r14 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r14 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rsi, %r10 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r8, %rbx -; X64-NEXT: adcq %rsi, %r10 +; X64-NEXT: addq %r10, %rbx +; X64-NEXT: adcq %rdi, %r15 ; X64-NEXT: setb %r8b -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rcx, %r11 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r10, %rsi +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r15, %rdi ; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: adcq %rax, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r8, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r11, %r13 +; X64-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %r10, %r12 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %r12, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r10, %r8 -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r12, %r11 -; X64-NEXT: mulq %r13 +; X64-NEXT: adcq %r15, %r10 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r13, %r15 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r8, %r13 -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq %r10, %r13 +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbp, %r13 +; X64-NEXT: addq %r9, %r13 ; X64-NEXT: adcq %rbx, %r12 +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, %rbx +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r8, %r10 -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r10, %r11 -; X64-NEXT: adcq %rbp, %r8 -; X64-NEXT: setb %r10b -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %r9, %rbp ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r8, %rbx -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq %r15, %rbx +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: addq %r13, %r14 -; X64-NEXT: movq %r14, %r13 -; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: addq %r13, %r11 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: adcq %r12, %rcx ; X64-NEXT: adcq $0, %rbx ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: adcq %rcx, %r9 -; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: setb %r13b ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: adcq 
$0, %rsi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r8, %rax -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %sil +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rsi, %r10 +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %r9, %r8 -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: addq %r10, %rax +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: setb %dil +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: addq %rsi, %rax +; X64-NEXT: movzbl %dil, %esi +; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: addq %rbx, %r8 +; X64-NEXT: adcq %r9, %r10 +; X64-NEXT: movzbl %r13b, %esi +; X64-NEXT: adcq %rsi, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq %rdi, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %r11 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; X64-NEXT: adcq %rax, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: adcq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: movq %r11, (%rsp) # 8-byte Spill +; X64-NEXT: adcq %rbp, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; X64-NEXT: adcq %rcx, %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq 64(%r9), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq 64(%r8), %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdi @@ -5415,539 +5341,534 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdi, %r15 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rsi, %r9 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 72(%r9), %rsi -; X64-NEXT: movq %r9, %r13 +; 
X64-NEXT: movq 72(%r8), %rdx +; X64-NEXT: movq %r8, %r13 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: mulq %rdx ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r8, %rbx +; X64-NEXT: addq %r9, %rbx ; X64-NEXT: adcq %rdi, %r10 ; X64-NEXT: setb %r8b ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %r10, %r9 ; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r8, %r14 -; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r12, %rcx +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r10, %r15 +; X64-NEXT: adcq $0, %r14 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %r15, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r10, %r8 -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, %r12 +; X64-NEXT: adcq %r14, %r10 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r8, %rbp -; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: addq %r10, %rbp +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %r14 ; X64-NEXT: addq %r11, %rbp -; X64-NEXT: adcq %rbx, %r15 +; X64-NEXT: adcq %rbx, %r14 ; X64-NEXT: adcq $0, %r9 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 80(%r13), %r14 +; X64-NEXT: movq 80(%r13), %r15 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r10, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq 88(%r13), %r8 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r8, %r11 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq 88(%r13), %rbx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: addq %r11, %rax -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: adcq %r10, %r8 -; X64-NEXT: setb %r10b +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: adcq %r11, %r10 +; X64-NEXT: setb %r11b ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq 
%r8 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r8, %r13 -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq %r10, %r13 +; X64-NEXT: movzbl %r11b, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbp, %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %r11 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rbp, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r14, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: addq %r9, %r13 ; X64-NEXT: adcq %rdi, %r12 ; X64-NEXT: setb %bpl -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rdi, %r10 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, %r15 -; X64-NEXT: mulq %rbx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %r10, %rax -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: adcq %r8, %rdi -; X64-NEXT: setb %r8b -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %rcx, %r9 -; X64-NEXT: mulq %rbx -; X64-NEXT: addq %rdi, %rax ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %r13, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: adcq %r10, %rdi +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rdi, %r14 +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq %r13, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r12, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movzbl %bpl, %eax -; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: adcq $0, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rax, %rbx -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rbx, %rdx +; X64-NEXT: imulq %rax, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r8, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: imulq %rcx, %r14 -; X64-NEXT: addq %rdx, %r14 +; X64-NEXT: imulq %rcx, %r15 +; X64-NEXT: addq %rdx, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; 
X64-NEXT: imulq %rsi, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: mulq %r11 +; X64-NEXT: imulq %rsi, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r10, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: imulq %r11, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: addq %r8, %rdi -; X64-NEXT: adcq %r14, %rbx -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: addq %r8, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %r13, %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: addq %r10, %rdi +; X64-NEXT: adcq %r15, %rax +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r8, %r14 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r8, %r15 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: addq %r14, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r15, %r11 ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r8, %r14 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r8, %rsi ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %rdi, %r14 -; X64-NEXT: adcq %rbx, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 112(%rcx), %r10 +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: addq %rdi, %rsi +; X64-NEXT: adcq %r12, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: movq 112(%rdi), %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: imulq %rcx, %r12 +; X64-NEXT: addq %rdx, %r12 +; X64-NEXT: movq 120(%rdi), %rax +; X64-NEXT: imulq %r9, %rax +; X64-NEXT: addq %rax, %r12 +; X64-NEXT: movq 96(%rdi), %r10 +; X64-NEXT: movq 104(%rdi), %r13 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: imulq %r11, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: movq 120(%rcx), %rax -; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movq %rdi, %r12 -; X64-NEXT: addq %rax, %r10 -; X64-NEXT: movq 96(%rcx), %r13 -; X64-NEXT: movq 104(%rcx), %r8 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, %rbx -; X64-NEXT: imulq %r8, %rbx -; X64-NEXT: mulq %r13 +; X64-NEXT: imulq %r13, %rbp +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rbx, %rdx -; X64-NEXT: imulq %r13, %r9 -; X64-NEXT: addq %rdx, %r9 -; X64-NEXT: addq %rbp, %rdi -; X64-NEXT: adcq %r10, %r9 -; X64-NEXT: movq %r9, %r15 +; X64-NEXT: addq %rbp, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %r10, %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: addq %r8, %rdi +; X64-NEXT: adcq %r12, %rax +; X64-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r10, %r12 +; X64-NEXT: addq %r8, %r12 ; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r12, %r9 +; X64-NEXT: adcq %rbp, %r8 +; X64-NEXT: setb %r10b ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r12, %r13 -; X64-NEXT: adcq %rbp, %r10 -; X64-NEXT: setb %bl -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: addq %r10, %rax -; X64-NEXT: movzbl %bl, %r8d +; X64-NEXT: mulq %rcx +; X64-NEXT: addq %r8, %rax +; X64-NEXT: movzbl %r10b, %r8d ; X64-NEXT: adcq %r8, %rdx ; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: adcq %r11, %r9 +; X64-NEXT: adcq %rsi, %rax ; X64-NEXT: adcq %r15, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: adcq %r14, %rax -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: adcq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; X64-NEXT: adcq %rbx, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq 80(%r10), %r11 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq 80(%rdi), %r10 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 88(%rdi), %r15 -; X64-NEXT: movq %rdi, %r14 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq 88(%r10), %r14 +; X64-NEXT: movq %r10, %rbx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdi, %r10 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %r9, %rdi ; X64-NEXT: adcq %r8, %rcx ; X64-NEXT: setb %r8b -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %r11, %r10 -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rcx, %r12 ; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: movq 64(%r14), %rcx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq 72(%r14), %r8 +; X64-NEXT: adcq %rax, %r9 +; X64-NEXT: movq 64(%rbx), %r8 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: movq %r10, %r11 +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 72(%rbx), %rax +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r11, %r14 +; X64-NEXT: addq %rcx, %r14 ; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %rcx, %r9 -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %r14, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %r11 -; X64-NEXT: setb %cl ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: movq %r8, %r11 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %rcx +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r10, %r14 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r11, %rbp -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rbx ; X64-NEXT: addq %rsi, %rbp ; X64-NEXT: adcq %rdi, %rbx ; X64-NEXT: adcq $0, %r12 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r9, %rcx -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r10 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: addq %r11, %rax -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: adcq %rdi, %r13 -; X64-NEXT: setb %cl +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r13, %rdi -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: addq %rcx, %rdi +; 
X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdi, %r10 +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: setb %r11b +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: movzbl %r11b, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %rbp, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %r11 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rbp, %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: addq %r12, %rdi -; X64-NEXT: adcq %r15, %rsi -; X64-NEXT: setb %cl +; X64-NEXT: adcq %r9, %rsi +; X64-NEXT: setb %r11b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r9, %rbx -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r8, %r9 -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: adcq %r11, %r13 -; X64-NEXT: setb %r8b -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: addq %r13, %rax -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: setb %r9b +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: movzbl %r9b, %eax ; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %rdi, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rsi, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rdi, %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rsi, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %r11b, %eax +; X64-NEXT: adcq %rax, %rbp ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 96(%rcx), %rsi -; X64-NEXT: imulq %rsi, %r9 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq 96(%rdi), %rcx +; X64-NEXT: imulq %rcx, %r13 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %r12, %r9 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r13, %rdx +; X64-NEXT: movq 104(%rdi), %r8 
+; X64-NEXT: imulq %r8, %r9 +; X64-NEXT: addq %rdx, %r9 +; X64-NEXT: movq %r9, %r10 +; X64-NEXT: movq 112(%rdi), %rax +; X64-NEXT: movq %rdi, %r9 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r9, %rdx -; X64-NEXT: movq 104(%rcx), %r9 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: imulq %r9, %rax -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq 112(%rcx), %rax -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: imulq %r10, %rcx +; X64-NEXT: imulq %r15, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: movq 120(%r14), %r13 -; X64-NEXT: imulq %rbx, %r13 -; X64-NEXT: addq %rdx, %r13 -; X64-NEXT: addq %rdi, %r8 -; X64-NEXT: adcq %r11, %r13 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq 120(%r9), %r9 +; X64-NEXT: imulq %rbx, %r9 +; X64-NEXT: addq %rdx, %r9 +; X64-NEXT: addq %rsi, %r13 +; X64-NEXT: adcq %r10, %r9 ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, %rcx -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rbx -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rsi, %rdi +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rbx, %r12 -; X64-NEXT: adcq %r11, %rcx -; X64-NEXT: setb %sil -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: addq %rdi, %r12 +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: setb %cl +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rsi, %rdi +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: addq %r8, %r9 -; X64-NEXT: adcq %r13, %rbx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: imulq %r10, %rdi -; X64-NEXT: movq %r10, %rax +; X64-NEXT: addq %r13, %rdi +; X64-NEXT: adcq %r9, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: imulq %r11, %rsi +; X64-NEXT: movq %r11, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r14, %rax -; X64-NEXT: addq %rdx, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: imulq %r8, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: imulq %rdi, %rbp -; X64-NEXT: addq %rdx, %rbp -; X64-NEXT: addq %rcx, %r11 -; X64-NEXT: adcq %r13, %rbp -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r10 -; 
X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: imulq %r10, %r14 +; X64-NEXT: addq %rdx, %r14 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: imulq %r15, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r8, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %r9, %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: addq %r13, %rcx +; X64-NEXT: adcq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %r14 +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r9, %r8 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rsi, %rcx -; X64-NEXT: adcq %r8, %rdi +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r8, %r11 +; X64-NEXT: adcq %rsi, %r9 ; X64-NEXT: setb %sil ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: addq %rdi, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: addq %r9, %rax ; X64-NEXT: movzbl %sil, %esi ; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq %r11, %rax -; X64-NEXT: adcq %rbp, %rdx +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: adcq %r12, %rcx -; X64-NEXT: adcq %r9, %rax +; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: adcq %rdi, %rax ; X64-NEXT: adcq %rbx, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: adcq %rbp, %rax ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq 
{{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload ; X64-NEXT: movq %rsi, %r8 -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: adcq (%rsp), %r9 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, (%rsi) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 8(%rsi) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 16(%rsi) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 24(%rsi) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 32(%rsi) -; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 40(%rsi) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 48(%rsi) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 56(%rsi) -; X64-NEXT: movq %r8, 64(%rsi) -; X64-NEXT: movq %r9, 72(%rsi) -; X64-NEXT: movq %r10, 80(%rsi) -; X64-NEXT: movq %r11, 88(%rsi) -; X64-NEXT: movq %r13, 96(%rsi) -; X64-NEXT: movq %rcx, 104(%rsi) -; X64-NEXT: movq %rax, 112(%rsi) -; X64-NEXT: movq %rdx, 120(%rsi) -; X64-NEXT: addq $240, %rsp +; X64-NEXT: movq %rsi, (%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, 8(%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, 16(%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, 24(%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, 32(%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, 40(%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, 48(%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, 56(%rcx) +; X64-NEXT: movq %rdi, 64(%rcx) +; X64-NEXT: movq %r8, 72(%rcx) +; X64-NEXT: movq %r9, 80(%rcx) +; X64-NEXT: movq %r10, 88(%rcx) +; X64-NEXT: movq %r13, 96(%rcx) +; X64-NEXT: movq %r11, 104(%rcx) +; X64-NEXT: movq %rax, 112(%rcx) +; X64-NEXT: movq %rdx, 120(%rcx) +; X64-NEXT: addq $232, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll index 6f6dde3aa3cf4d..00740f61ba6f8a 100644 --- a/llvm/test/CodeGen/X86/mul-i256.ll +++ 
b/llvm/test/CodeGen/X86/mul-i256.ll @@ -21,73 +21,74 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: .cfi_offset %edi, -16 ; X32-NEXT: .cfi_offset %ebx, -12 ; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ebx -; X32-NEXT: movl 8(%eax), %ebp -; X32-NEXT: movl (%edx), %edi +; X32-NEXT: movl 12(%eax), %ebp +; X32-NEXT: movl 8(%eax), %ebx +; X32-NEXT: movl (%ecx), %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 4(%eax), %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ebp -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 4(%eax), %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %ebx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl (%esi), %ebx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 4(%esi), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl 4(%esi), %eax ; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, 
%ebx -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, %esi -; X32-NEXT: mull %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %esi, %edi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -95,96 +96,92 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 8(%eax), %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ebp -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: movl 12(%eax), %ecx +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte 
Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebx, %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %esi -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: imull %esi, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl 16(%ecx), %edi +; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl %esi, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: addl %edi, %edx -; X32-NEXT: movl 20(%ecx), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %edi -; X32-NEXT: imull %eax, %edi -; X32-NEXT: addl %edx, %edi +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: movl 20(%ecx), %ebp +; X32-NEXT: imull %ebp, %esi +; X32-NEXT: addl %edx, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill ; X32-NEXT: movl 24(%ecx), %eax ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: imull %esi, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %edx @@ -194,19 +191,18 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax @@ -222,20 +218,19 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 24(%edi), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl 24(%ebx), %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl %edi, %edx -; X32-NEXT: movl 28(%edi), %eax +; X32-NEXT: movl 28(%ebx), %eax ; X32-NEXT: imull %ecx, %eax ; X32-NEXT: addl %eax, %esi -; X32-NEXT: movl 16(%edi), %edi -; X32-NEXT: movl 20(%edx), %ebp +; X32-NEXT: movl 16(%ebx), %edi +; X32-NEXT: movl 20(%ebx), %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: imull %ebp, %ebx @@ -250,32 +245,30 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl 
%ebp, %edi -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %bl, %esi ; X32-NEXT: adcl %esi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload @@ -321,57 +314,57 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq (%rdi), %rbx -; X64-NEXT: movq 8(%rdi), %r11 -; X64-NEXT: movq 16(%rdi), %r10 +; X64-NEXT: movq (%rdi), %r10 +; X64-NEXT: movq 8(%rdi), %r9 +; X64-NEXT: movq 16(%rdi), %rbx ; X64-NEXT: movq 16(%rsi), %r8 -; X64-NEXT: movq (%rsi), %r9 +; X64-NEXT: movq (%rsi), %r11 ; X64-NEXT: movq 8(%rsi), %r14 ; X64-NEXT: movq 24(%rdi), %r15 -; X64-NEXT: imulq %r9, %r15 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: imulq %r11, %r15 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: imulq %r14, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: addq %r15, %r10 +; X64-NEXT: imulq %r14, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: addq %r15, %rbx ; X64-NEXT: movq %r8, %r15 -; X64-NEXT: imulq %r11, %r15 +; X64-NEXT: imulq %r9, %r15 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %r15, %rdx ; X64-NEXT: movq 24(%rsi), %r15 -; X64-NEXT: imulq %rbx, %r15 +; X64-NEXT: imulq %r10, %r15 ; X64-NEXT: addq %rdx, %r15 ; X64-NEXT: addq %rdi, %r8 -; X64-NEXT: adcq %r10, %r15 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: adcq %rbx, %r15 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r9, %rbx -; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rbx, %r10 +; X64-NEXT: adcq %r11, %rsi ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %edi -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movzbl %al, %r11d +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: addq %rsi, %rax -; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: adcq %r11, %rdx ; X64-NEXT: addq %r8, %rax ; X64-NEXT: adcq %r15, %rdx -; X64-NEXT: movq %r10, (%rcx) -; X64-NEXT: movq %rbx, 8(%rcx) +; X64-NEXT: movq %rdi, (%rcx) +; X64-NEXT: movq %r10, 8(%rcx) ; X64-NEXT: movq %rax, 16(%rcx) ; X64-NEXT: movq %rdx, 24(%rcx) ; X64-NEXT: popq %rbx diff --git 
a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll index 4a0f0ad94cef06..709434112f01c5 100644 --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -9,753 +9,739 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $180, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: subl $184, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 28(%eax), %ebx -; X32-NEXT: movl 24(%eax), %ebp -; X32-NEXT: movl (%edx), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl 24(%eax), %eax +; X32-NEXT: movl (%ecx), %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 4(%eax), %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebp +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 4(%eax), %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %ecx +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl 16(%ebx), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 20(%ecx), %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl 20(%ebx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebp, %edi -; X32-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 8(%eax), %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ecx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl 12(%eax), %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded 
Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: addl %esi, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx +; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 8(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 
4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 12(%ecx), %ebx +; X32-NEXT: movl 8(%ecx), %ebx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl (%esi), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 4(%esi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebp, %edi -; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl 12(%ecx), %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: 
mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %ebx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl (%ebx), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 4(%ebx), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X32-NEXT: adcl %ebx, %eax -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; 
X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 16(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebx ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 20(%eax), %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ebp -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx 
-; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl 16(%eax), %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl 20(%eax), %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ecx, %esi ; X32-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %edi +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ecx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 24(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 28(%eax), %edx +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: adcl %ebx, %edx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, %edi +; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %bl, %ebx +; X32-NEXT: adcl %ebx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %eax +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded 
Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: 
adcl $0, %esi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, %esi +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp ; X32-NEXT: mull %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload -; 
X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull (%esp) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull (%esp) # 4-byte Folded Reload ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 32(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl 32(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 36(%eax), %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ebp -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 36(%eax), %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %ebx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, %esi -; X32-NEXT: mull %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 40(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl %eax, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl 40(%eax), %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl 44(%edi), %edi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 44(%eax), %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %esi ; 
X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: addl %edx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: imull %eax, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %ebx, %ecx +; 
X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload @@ -764,377 +750,377 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl (%esp), %esi # 4-byte Reload ; X32-NEXT: imull %ecx, %esi ; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, (%esp) # 4-byte Spill ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 56(%edi), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl 60(%edi), %eax -; X32-NEXT: imull %ebp, %eax -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: movl 48(%edi), %esi -; X32-NEXT: movl 52(%edi), %edi +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 56(%ecx), %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ebx +; X32-NEXT: movl 60(%ecx), %eax +; X32-NEXT: imull %esi, %eax +; X32-NEXT: addl %eax, %ebx +; X32-NEXT: movl 48(%ecx), %esi +; X32-NEXT: 
movl 52(%ecx), %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: imull %edi, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: imull %edi, %ebp ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %esi -; X32-NEXT: addl %ebx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %esi, %ebx -; X32-NEXT: addl %edx, %ebx +; X32-NEXT: addl %ebp, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: imull %esi, %ecx +; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movzbl %bl, %esi +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 40(%ecx), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 40(%esi), %ebx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 44(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl 44(%esi), %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 32(%esi), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl 32(%esi), %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl 36(%esi), %ebp ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebx -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; 
X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: adcl %ebx, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: addl %edi, (%esp) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: 
addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 48(%esi), %edi -; X32-NEXT: imull %edi, %ecx -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 48(%ecx), %ebp +; X32-NEXT: imull %ebp, %edi +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: movl 52(%esi), %eax +; X32-NEXT: addl %edi, %edx +; X32-NEXT: movl 52(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull %eax, %ebx ; X32-NEXT: addl %edx, %ebx -; X32-NEXT: movl 56(%esi), %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 60(%esi), %esi -; X32-NEXT: imull %ecx, %esi -; X32-NEXT: addl %edx, %esi +; X32-NEXT: movl 56(%ecx), %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: imull %edi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 60(%ecx), %ecx +; X32-NEXT: imull %esi, %ecx +; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: adcl 
%ebx, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebp ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movzbl %bl, %esi +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edx +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: imull %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: addl %edx, %edi +; X32-NEXT: addl %esi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: imull %edi, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %edx +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: imull %ebx, %ecx ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; 
X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: movl %ebp, %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl (%esp), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; 
X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %esi, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -1158,14 +1144,14 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %esi, 32(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, 36(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl (%esp), %esi # 4-byte Reload ; X32-NEXT: movl %esi, 40(%ecx) -; X32-NEXT: movl %ebx, 44(%ecx) -; X32-NEXT: movl %ebp, 48(%ecx) -; X32-NEXT: movl %edi, 52(%ecx) +; X32-NEXT: movl %ebp, 44(%ecx) +; X32-NEXT: movl %edi, 48(%ecx) +; X32-NEXT: movl %ebx, 52(%ecx) ; X32-NEXT: movl %eax, 56(%ecx) ; X32-NEXT: movl %edx, 60(%ecx) -; X32-NEXT: addl $180, %esp +; X32-NEXT: addl $184, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -1180,260 +1166,256 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: pushq %rax -; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq (%rdi), %rbx -; X64-NEXT: movq 8(%rdi), %rdi -; X64-NEXT: movq 24(%rax), %r14 -; X64-NEXT: movq 16(%rax), %rax -; X64-NEXT: movq (%rsi), %r8 +; X64-NEXT: movq (%rdi), %r8 +; X64-NEXT: movq 8(%rdi), %r13 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: movq 16(%rdi), %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq (%rsi), %r12 ; X64-NEXT: movq 8(%rsi), %r11 -; X64-NEXT: movq %rsi, %r13 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp ; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r14, %rax ; 
X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r10, %r15 -; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: addq %rbx, %r15 +; X64-NEXT: adcq %r10, %r14 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rbx, %rsi -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %r11, %rdi +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r14, %r10 +; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rbx, %rbp +; X64-NEXT: adcq $0, %r14 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %r12 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r8 -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r14, %rax +; X64-NEXT: addq %rbp, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %rcx -; X64-NEXT: setb %sil -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: adcq %r14, %rbx +; X64-NEXT: setb %cl +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rbx, %rbp +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %r14 -; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: adcq %r15, %r14 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %r12 -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: movq 16(%rsi), %rsi +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r13, %rbx ; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r13), %r10 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r12 -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %rdi, %rcx ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq 24(%rsi), %rsi -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq 
%rdx, %rcx -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rbp, %r11 -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: setb %dil +; X64-NEXT: movq 24(%r8), %rdi +; X64-NEXT: movq %r8, %r13 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: adcq %r15, %r8 +; X64-NEXT: setb %r12b +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: movzbl %r12b, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %r11 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rbp +; X64-NEXT: addq %rbp, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r14, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %rbx ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: addq %r9, %rbp -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; X64-NEXT: setb %dil -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r10, %rbx +; X64-NEXT: adcq %r11, %r15 +; X64-NEXT: setb %bpl ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %r9, %rcx -; X64-NEXT: setb %r8b -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: adcq %r8, %rcx +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 ; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %rbp, %r11 +; X64-NEXT: addq %rbx, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r15, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %bpl, %eax +; X64-NEXT: adcq %rax, %r14 ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq 32(%rdi), %r15 -; X64-NEXT: imulq %r15, %rsi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq 40(%rdi), %rsi -; X64-NEXT: imulq %rsi, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: movq 48(%rdi), 
%rax -; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq 32(%r9), %rcx +; X64-NEXT: imulq %rcx, %rdi +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq 40(%r9), %rbp +; X64-NEXT: imulq %rbp, %rsi +; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: movq 48(%r9), %rax +; X64-NEXT: movq %r9, %r10 ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: imulq %r9, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: mulq %r11 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 56(%r8), %r8 -; X64-NEXT: imulq %r11, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq %r10, %r8 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 56(%r10), %r11 +; X64-NEXT: imulq %r15, %r11 +; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %r15 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r15, %r13 -; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %dil +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rdi, %r8 +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r8, %r10 +; X64-NEXT: adcq %rcx, %r15 +; X64-NEXT: setb %cl ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r15, %rdi +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %r8, %r12 +; X64-NEXT: addq %rbx, %rdi +; X64-NEXT: adcq %r11, %r12 +; X64-NEXT: movq %r13, %r11 +; X64-NEXT: movq 48(%r13), %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq 48(%r8), %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r14, %rsi +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: imulq %r9, %rsi ; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq %r8, %rdx -; X64-NEXT: movq 56(%r8), %rax -; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: movq 56(%r13), %rax +; X64-NEXT: imulq %r8, %rax +; X64-NEXT: movq %r8, %r13 ; X64-NEXT: addq %rax, %rsi -; X64-NEXT: movq 32(%rdx), %rbp -; X64-NEXT: movq 40(%rdx), %r9 +; X64-NEXT: movq 32(%r11), %r8 +; X64-NEXT: movq 40(%r11), %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: imulq %r9, %rdi -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 
8-byte Reload -; X64-NEXT: imulq %rbp, %r11 -; X64-NEXT: addq %rdx, %r11 -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq %rsi, %r11 -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: imulq %r15, %r11 ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r11, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %r8, %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: adcq %rsi, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r11, %r13 +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rdi, %r8 -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: addq %r13, %r8 +; X64-NEXT: adcq %rbp, %rcx +; X64-NEXT: setb %r11b +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: movzbl %r11b, %ecx ; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: addq %rbx, %rax -; X64-NEXT: adcq %r11, %rdx +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq %r13, %r8 -; X64-NEXT: adcq %r10, %rax +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: adcq %rdi, %rax ; X64-NEXT: adcq %r12, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: adcq %r14, %rax ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, (%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload @@ -1446,7 +1428,6 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %r8, 40(%rcx) ; X64-NEXT: movq %rax, 48(%rcx) ; X64-NEXT: movq %rdx, 56(%rcx) -; X64-NEXT: addq $8, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll index fc1cc1f65627a8..fd91720436e399 100644 --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -30,58 +30,57 @@ define i128 @foo(i128 %t, i128 %u) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull %ecx, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %edx, %ebx -; 
X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull %esi, %eax -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ecx, %esi +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull %ebp, %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %cl +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %bl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: adcl %edi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, 4(%ecx) ; X86-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-NEXT: movl %esi, (%ecx) ; X86-NEXT: movl %eax, 8(%ecx) diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll index 8b75c6fb68c78c..394dd75ce0fc5b 100644 --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -13,62 +13,61 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movq %rdx, %r11 -; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: movq %rdx, %r9 ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: movq %rcx, %rdi -; CHECK-NEXT: imulq %rdx, %rdi -; CHECK-NEXT: movq %r11, %rax +; CHECK-NEXT: movq %rcx, %r11 +; CHECK-NEXT: imulq %rdx, %r11 +; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: mulq %rdx -; CHECK-NEXT: movq %rdx, %r9 -; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: addq %rax, %r9 -; CHECK-NEXT: addq %rdi, %r9 +; CHECK-NEXT: movq %rdx, %r8 +; CHECK-NEXT: movq %rax, %r10 +; CHECK-NEXT: addq %rax, %r8 +; CHECK-NEXT: addq %r11, %r8 ; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: sarq $63, %rax ; CHECK-NEXT: movq %rax, %r14 ; 
CHECK-NEXT: imulq %rsi, %r14 -; CHECK-NEXT: mulq %r10 -; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: addq %r14, %rdi -; CHECK-NEXT: addq %rax, %rdi -; CHECK-NEXT: addq %rbx, %r8 -; CHECK-NEXT: adcq %r9, %rdi -; CHECK-NEXT: movq %r10, %rax -; CHECK-NEXT: mulq %r11 +; CHECK-NEXT: mulq %rdi +; CHECK-NEXT: movq %rax, %r11 ; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: movq %rax, %r9 +; CHECK-NEXT: addq %r14, %rbx +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %r10, %r11 +; CHECK-NEXT: adcq %r8, %rbx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: mulq %r9 +; CHECK-NEXT: movq %rdx, %r10 +; CHECK-NEXT: movq %rax, %r8 ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: mulq %r11 -; CHECK-NEXT: movq %rdx, %r11 +; CHECK-NEXT: mulq %r9 +; CHECK-NEXT: movq %rdx, %r9 ; CHECK-NEXT: movq %rax, %r14 -; CHECK-NEXT: addq %rbx, %r14 -; CHECK-NEXT: adcq $0, %r11 -; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: addq %r10, %r14 +; CHECK-NEXT: adcq $0, %r9 +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %rcx -; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: movq %rax, %r10 -; CHECK-NEXT: addq %r14, %r10 -; CHECK-NEXT: adcq %r11, %rbx +; CHECK-NEXT: movq %rdx, %r10 +; CHECK-NEXT: movq %rax, %rdi +; CHECK-NEXT: addq %r14, %rdi +; CHECK-NEXT: adcq %r9, %r10 ; CHECK-NEXT: setb %al -; CHECK-NEXT: movzbl %al, %r11d +; CHECK-NEXT: movzbl %al, %r9d ; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: mulq %rcx -; CHECK-NEXT: addq %rbx, %rax -; CHECK-NEXT: adcq %r11, %rdx -; CHECK-NEXT: addq %r8, %rax -; CHECK-NEXT: adcq %rdi, %rdx -; CHECK-NEXT: movq %r10, %rcx +; CHECK-NEXT: addq %r10, %rax +; CHECK-NEXT: adcq %r9, %rdx +; CHECK-NEXT: addq %r11, %rax +; CHECK-NEXT: adcq %rbx, %rdx +; CHECK-NEXT: movq %rdi, %rcx ; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: xorq %rcx, %rdx ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: orq %rdx, %rcx ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %nooverflow -; CHECK-NEXT: movq %r9, %rax -; CHECK-NEXT: movq %r10, %rdx +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: movq %rdi, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll index ce672a70b1f912..1756154272018e 100644 --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -45,12 +45,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movl %eax, %ebp -; LINUX-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; LINUX-NEXT: movq %r8, %r14 -; LINUX-NEXT: movq %rcx, %r15 -; LINUX-NEXT: movq %rdx, %r12 -; LINUX-NEXT: movq %rsi, %r13 +; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-NEXT: movq %r9, %r14 +; LINUX-NEXT: movq %r8, %r15 +; LINUX-NEXT: movq %rcx, %r12 +; LINUX-NEXT: movq %rdx, %r13 +; LINUX-NEXT: movq %rsi, %rbp ; LINUX-NEXT: movq %rdi, %rbx ; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) @@ -78,12 +78,12 @@ define void @f_thunk(ptr %this, ...) 
{ ; LINUX-NEXT: callq get_f@PLT ; LINUX-NEXT: movq %rax, %r11 ; LINUX-NEXT: movq %rbx, %rdi -; LINUX-NEXT: movq %r13, %rsi -; LINUX-NEXT: movq %r12, %rdx -; LINUX-NEXT: movq %r15, %rcx -; LINUX-NEXT: movq %r14, %r8 -; LINUX-NEXT: movl %ebp, %eax -; LINUX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-NEXT: movq %rbp, %rsi +; LINUX-NEXT: movq %r13, %rdx +; LINUX-NEXT: movq %r12, %rcx +; LINUX-NEXT: movq %r15, %r8 +; LINUX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; LINUX-NEXT: movq %r14, %r9 ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -138,12 +138,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-X32-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movl %eax, %ebp -; LINUX-X32-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill -; LINUX-X32-NEXT: movq %r8, %r14 -; LINUX-X32-NEXT: movq %rcx, %r15 -; LINUX-X32-NEXT: movq %rdx, %r12 -; LINUX-X32-NEXT: movq %rsi, %r13 +; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-NEXT: movq %r9, %r14 +; LINUX-X32-NEXT: movq %r8, %r15 +; LINUX-X32-NEXT: movq %rcx, %r12 +; LINUX-X32-NEXT: movq %rdx, %r13 +; LINUX-X32-NEXT: movq %rsi, %rbp ; LINUX-X32-NEXT: movq %rdi, %rbx ; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) @@ -171,12 +171,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-X32-NEXT: callq get_f@PLT ; LINUX-X32-NEXT: movl %eax, %r11d ; LINUX-X32-NEXT: movq %rbx, %rdi -; LINUX-X32-NEXT: movq %r13, %rsi -; LINUX-X32-NEXT: movq %r12, %rdx -; LINUX-X32-NEXT: movq %r15, %rcx -; LINUX-X32-NEXT: movq %r14, %r8 -; LINUX-X32-NEXT: movl %ebp, %eax -; LINUX-X32-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-NEXT: movq %rbp, %rsi +; LINUX-X32-NEXT: movq %r13, %rdx +; LINUX-X32-NEXT: movq %r12, %rcx +; LINUX-X32-NEXT: movq %r15, %r8 +; LINUX-X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; LINUX-X32-NEXT: movq %r14, %r9 ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll index 8d3d0d0799c269..843303da8f8a42 100644 --- a/llvm/test/CodeGen/X86/nontemporal.ll +++ b/llvm/test/CodeGen/X86/nontemporal.ll @@ -17,31 +17,31 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X86-SSE-NEXT: movdqa 56(%ebp), %xmm4 ; X86-SSE-NEXT: movdqa 40(%ebp), %xmm5 ; X86-SSE-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE-NEXT: movl 8(%ebp), %esi -; X86-SSE-NEXT: movl 80(%ebp), %edx -; X86-SSE-NEXT: movl (%edx), %eax +; X86-SSE-NEXT: movl 8(%ebp), %edx +; X86-SSE-NEXT: movl 80(%ebp), %esi +; X86-SSE-NEXT: movl (%esi), %eax ; X86-SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movntps %xmm0, (%esi) +; X86-SSE-NEXT: movntps %xmm0, (%edx) ; X86-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntdq %xmm2, (%esi) +; X86-SSE-NEXT: addl (%esi), %eax +; X86-SSE-NEXT: movntdq %xmm2, (%edx) ; X86-SSE-NEXT: addpd 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntpd %xmm1, (%esi) +; X86-SSE-NEXT: addl (%esi), %eax +; X86-SSE-NEXT: movntpd %xmm1, (%edx) ; X86-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm6 -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntdq %xmm6, (%esi) +; X86-SSE-NEXT: addl (%esi), %eax +; X86-SSE-NEXT: movntdq %xmm6, (%edx) ; X86-SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntdq %xmm5, (%esi) +; X86-SSE-NEXT: addl (%esi), %eax +; X86-SSE-NEXT: movntdq %xmm5, (%edx) ; X86-SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntdq %xmm4, (%esi) -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntil %ecx, (%esi) -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movsd %xmm3, (%esi) -; X86-SSE-NEXT: addl (%edx), %eax +; X86-SSE-NEXT: addl (%esi), %eax +; X86-SSE-NEXT: movntdq %xmm4, (%edx) +; X86-SSE-NEXT: addl (%esi), %eax +; X86-SSE-NEXT: movntil %ecx, (%edx) +; X86-SSE-NEXT: addl (%esi), %eax +; X86-SSE-NEXT: movsd %xmm3, (%edx) +; X86-SSE-NEXT: addl (%esi), %eax ; X86-SSE-NEXT: leal -4(%ebp), %esp ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %ebp @@ -59,31 +59,31 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X86-AVX-NEXT: vmovdqa 56(%ebp), %xmm4 ; X86-AVX-NEXT: vmovdqa 40(%ebp), %xmm5 ; X86-AVX-NEXT: vmovdqa 24(%ebp), %xmm6 -; X86-AVX-NEXT: movl 8(%ebp), %esi -; X86-AVX-NEXT: movl 80(%ebp), %edx -; X86-AVX-NEXT: movl (%edx), %eax +; X86-AVX-NEXT: movl 8(%ebp), %edx +; X86-AVX-NEXT: movl 80(%ebp), %esi +; X86-AVX-NEXT: movl (%esi), %eax ; X86-AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovntps %xmm0, (%esi) +; X86-AVX-NEXT: vmovntps %xmm0, (%edx) ; X86-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) +; X86-AVX-NEXT: addl (%esi), %eax +; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) ; X86-AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntpd %xmm0, (%esi) +; X86-AVX-NEXT: addl (%esi), %eax +; X86-AVX-NEXT: vmovntpd %xmm0, (%edx) ; X86-AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm6, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) +; X86-AVX-NEXT: addl (%esi), %eax +; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) ; X86-AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) +; X86-AVX-NEXT: addl (%esi), %eax +; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) ; X86-AVX-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: movntil %ecx, (%esi) -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovsd %xmm3, (%esi) -; X86-AVX-NEXT: addl (%edx), %eax +; X86-AVX-NEXT: addl (%esi), %eax +; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X86-AVX-NEXT: addl (%esi), %eax +; X86-AVX-NEXT: movntil %ecx, (%edx) +; X86-AVX-NEXT: addl (%esi), %eax +; X86-AVX-NEXT: vmovsd %xmm3, (%edx) +; X86-AVX-NEXT: addl (%esi), %eax ; X86-AVX-NEXT: leal -4(%ebp), %esp ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index f0fb89496aa1ed..d6d971920a751f 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -911,27 
+911,27 @@ define void @interleave_24i8_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-LABEL: interleave_24i16_out: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu (%rdi), %xmm3 -; SSE2-NEXT: movdqu 16(%rdi), %xmm2 +; SSE2-NEXT: movdqu (%rdi), %xmm1 +; SSE2-NEXT: movdqu 16(%rdi), %xmm3 ; SSE2-NEXT: movdqu 32(%rdi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,0] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: pand %xmm4, %xmm6 ; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,2,3,4,5,6,7] @@ -947,19 +947,19 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-NEXT: movdqa %xmm6, %xmm8 ; SSE2-NEXT: pandn %xmm7, %xmm8 ; SSE2-NEXT: por %xmm5, %xmm8 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] ; SSE2-NEXT: pandn %xmm0, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: movups %xmm1, (%rsi) +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: movups %xmm2, (%rsi) ; SSE2-NEXT: movdqu %xmm8, (%rdx) ; SSE2-NEXT: movdqu %xmm6, (%rcx) ; SSE2-NEXT: retq @@ -1062,25 +1062,25 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu 16(%rdi), %xmm1 -; SSE2-NEXT: movdqu 32(%rdi), %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = 
[65535,0,65535,65535,0,65535,65535,0] +; SSE2-NEXT: movdqu 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0] ; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,5,6,6,7] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,5,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,0,1,2,4,5,6,7] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,0] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: pand %xmm4, %xmm6 ; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,3,2,3,4,5,6,7] @@ -1096,7 +1096,7 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou ; SSE2-NEXT: pandn %xmm7, %xmm8 ; SSE2-NEXT: por %xmm5, %xmm8 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,2,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] @@ -1109,7 +1109,7 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; SSE2-NEXT: pandn %xmm0, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: movups %xmm2, (%rsi) +; SSE2-NEXT: movups %xmm3, (%rsi) ; SSE2-NEXT: movdqu %xmm8, (%rdx) ; SSE2-NEXT: movdqu %xmm6, (%rcx) ; SSE2-NEXT: retq @@ -1212,41 +1212,41 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-LABEL: interleave_24i16_in: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rsi), %xmm0 -; SSE2-NEXT: movdqu (%rdx), %xmm2 -; SSE2-NEXT: movdqu (%rcx), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqu (%rdx), %xmm1 +; SSE2-NEXT: movdqu (%rcx), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = 
xmm3[0,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] ; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[3,3,3,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE2-NEXT: pandn %xmm6, %xmm4 ; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,3,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7] ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: movdqu %xmm4, 32(%rdi) ; SSE2-NEXT: movdqu %xmm5, 16(%rdi) -; SSE2-NEXT: movdqu %xmm1, (%rdi) +; SSE2-NEXT: movdqu %xmm3, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i16_in: @@ -1396,72 +1396,72 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-LABEL: interleave_24i32_out: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu 64(%rdi), %xmm2 -; SSE2-NEXT: movups 80(%rdi), %xmm4 -; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movdqu 64(%rdi), %xmm0 +; SSE2-NEXT: movups 80(%rdi), %xmm1 +; SSE2-NEXT: movdqu (%rdi), %xmm2 ; SSE2-NEXT: movdqu 16(%rdi), %xmm3 -; SSE2-NEXT: movups 32(%rdi), %xmm6 -; SSE2-NEXT: movdqu 48(%rdi), %xmm1 -; SSE2-NEXT: movaps %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[2,0] -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm6[2,0] +; SSE2-NEXT: movups 32(%rdi), %xmm4 +; SSE2-NEXT: movdqu 48(%rdi), %xmm5 ; SSE2-NEXT: movaps %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm4[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[2,0] -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: shufps 
{{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm2[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm3[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[2,0] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm0[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm3[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[2,0] ; SSE2-NEXT: movups %xmm10, 16(%rsi) ; SSE2-NEXT: movups %xmm8, (%rsi) -; SSE2-NEXT: movups %xmm1, 16(%rdx) -; SSE2-NEXT: movups %xmm0, (%rdx) +; SSE2-NEXT: movups %xmm5, 16(%rdx) +; SSE2-NEXT: movups %xmm2, (%rdx) ; SSE2-NEXT: movups %xmm9, 16(%rcx) -; SSE2-NEXT: movups %xmm5, (%rcx) +; SSE2-NEXT: movups %xmm7, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_out: ; SSE42: # %bb.0: ; SSE42-NEXT: movups 80(%rdi), %xmm0 ; SSE42-NEXT: movdqu 64(%rdi), %xmm1 -; SSE42-NEXT: movdqu (%rdi), %xmm4 -; SSE42-NEXT: movdqu 16(%rdi), %xmm2 -; SSE42-NEXT: movups 32(%rdi), %xmm3 +; SSE42-NEXT: movdqu (%rdi), %xmm2 +; SSE42-NEXT: movdqu 16(%rdi), %xmm3 +; SSE42-NEXT: movups 32(%rdi), %xmm4 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5 -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] -; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3] -; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1] +; SSE42-NEXT: movdqa %xmm3, %xmm6 +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm2[2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[1] ; SSE42-NEXT: movdqa %xmm1, %xmm8 ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1] -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3],xmm7[4,5,6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7] ; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3] ; SSE42-NEXT: movups %xmm5, 16(%rsi) -; SSE42-NEXT: movups %xmm4, (%rsi) +; SSE42-NEXT: movups %xmm2, (%rsi) ; SSE42-NEXT: movdqu %xmm10, 16(%rdx) ; SSE42-NEXT: movdqu %xmm6, (%rdx) ; SSE42-NEXT: movups %xmm9, 16(%rcx) @@ -1635,38 +1635,38 @@ define void 
@interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movups (%rsi), %xmm1 ; SSE2-NEXT: movups 16(%rsi), %xmm0 -; SSE2-NEXT: movups (%rdx), %xmm3 -; SSE2-NEXT: movups 16(%rdx), %xmm5 +; SSE2-NEXT: movups (%rdx), %xmm2 +; SSE2-NEXT: movups 16(%rdx), %xmm3 ; SSE2-NEXT: movups (%rcx), %xmm4 -; SSE2-NEXT: movups 16(%rcx), %xmm7 +; SSE2-NEXT: movups 16(%rcx), %xmm5 ; SSE2-NEXT: movaps %xmm4, %xmm6 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[1,3] -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0,2] -; SSE2-NEXT: movaps %xmm0, %xmm8 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] -; SSE2-NEXT: movaps %xmm7, %xmm9 -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[1,3] +; SSE2-NEXT: movaps %xmm1, %xmm7 +; SSE2-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] ; SSE2-NEXT: movaps %xmm0, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm5[1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[0,2] -; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[0,2] -; SSE2-NEXT: movaps %xmm1, %xmm5 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE2-NEXT: movaps %xmm5, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] +; SSE2-NEXT: movaps %xmm0, %xmm9 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm5[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2] +; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[0,2] +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE2-NEXT: movups %xmm4, 16(%rdi) -; SSE2-NEXT: movups %xmm6, 48(%rdi) -; SSE2-NEXT: movups %xmm7, 64(%rdi) -; SSE2-NEXT: movups %xmm2, (%rdi) +; SSE2-NEXT: movups %xmm9, 48(%rdi) +; SSE2-NEXT: movups %xmm5, 64(%rdi) +; SSE2-NEXT: movups %xmm7, (%rdi) ; SSE2-NEXT: movups %xmm1, 32(%rdi) ; SSE2-NEXT: movups %xmm0, 80(%rdi) ; SSE2-NEXT: retq @@ -1674,39 +1674,39 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE42-LABEL: interleave_24i32_in: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqu (%rsi), %xmm0 -; SSE42-NEXT: movdqu 16(%rsi), %xmm2 -; SSE42-NEXT: movdqu (%rdx), %xmm3 -; SSE42-NEXT: movdqu 16(%rdx), %xmm4 -; SSE42-NEXT: movdqu (%rcx), %xmm5 -; SSE42-NEXT: movdqu 16(%rcx), %xmm6 -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,2] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] -; SSE42-NEXT: 
pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm8 -; SSE42-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE42-NEXT: movdqu 16(%rsi), %xmm1 +; SSE42-NEXT: movdqu (%rdx), %xmm2 +; SSE42-NEXT: movdqu 16(%rdx), %xmm3 +; SSE42-NEXT: movdqu (%rcx), %xmm4 +; SSE42-NEXT: movdqu 16(%rcx), %xmm5 +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5],xmm6[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm8 +; SSE42-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3,4,5],xmm4[6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; SSE42-NEXT: movdqu %xmm2, 32(%rdi) -; SSE42-NEXT: movdqu %xmm4, 80(%rdi) +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3,4,5],xmm3[6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; SSE42-NEXT: movdqu %xmm1, 32(%rdi) +; SSE42-NEXT: movdqu %xmm3, 80(%rdi) ; SSE42-NEXT: movdqu %xmm8, 16(%rdi) ; SSE42-NEXT: movdqu %xmm9, 48(%rdi) -; SSE42-NEXT: movdqu %xmm7, 64(%rdi) -; SSE42-NEXT: movdqu %xmm1, (%rdi) +; SSE42-NEXT: movdqu %xmm6, 64(%rdi) +; SSE42-NEXT: movdqu %xmm7, (%rdi) ; SSE42-NEXT: retq ; ; AVX1-LABEL: interleave_24i32_in: diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index 2f557679a15588..f57defc368d5ef 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -157,35 +157,35 @@ define void @PR42833() { ; SSE2: # %bb.0: ; SSE2-NEXT: movl b(%rip), %eax ; SSE2-NEXT: movdqa c+128(%rip), %xmm0 -; SSE2-NEXT: movdqa c+144(%rip), %xmm2 +; SSE2-NEXT: movdqa c+144(%rip), %xmm1 ; SSE2-NEXT: addl c+128(%rip), %eax -; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: movdqa d+144(%rip), %xmm4 -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: paddd %xmm2, %xmm2 
+; SSE2-NEXT: psubd %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] -; SSE2-NEXT: movdqa %xmm2, c+144(%rip) +; SSE2-NEXT: movdqa %xmm1, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) -; SSE2-NEXT: movdqa c+160(%rip), %xmm2 +; SSE2-NEXT: movdqa c+160(%rip), %xmm1 ; SSE2-NEXT: movdqa c+176(%rip), %xmm3 ; SSE2-NEXT: movdqa d+160(%rip), %xmm5 ; SSE2-NEXT: movdqa d+176(%rip), %xmm6 ; SSE2-NEXT: movdqa d+128(%rip), %xmm7 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSE2-NEXT: psubd %xmm0, %xmm7 ; SSE2-NEXT: psubd %xmm3, %xmm6 -; SSE2-NEXT: psubd %xmm2, %xmm5 +; SSE2-NEXT: psubd %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm5, d+160(%rip) ; SSE2-NEXT: movdqa %xmm6, d+176(%rip) ; SSE2-NEXT: movdqa %xmm4, d+144(%rip) ; SSE2-NEXT: movdqa %xmm7, d+128(%rip) ; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, c+160(%rip) +; SSE2-NEXT: paddd %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, c+160(%rip) ; SSE2-NEXT: movdqa %xmm3, c+176(%rip) ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index 4b398095b549d7..b0739b6f47458c 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: imull %ebp, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: imull %esi, %eax ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.1: ## %bb10.preheader -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: sarl $31, %eax -; CHECK-NEXT: shrl $30, %eax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: sarl $2, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl %eax, %ebp +; CHECK-NEXT: sarl $31, %ebp +; CHECK-NEXT: shrl $30, %ebp +; CHECK-NEXT: addl %eax, %ebp +; CHECK-NEXT: sarl $2, %ebp +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.2: ## %bb.nph9 -; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_4: ## %bb6 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx -; CHECK-NEXT: movb %bl, (%edx,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %ebp, %esi +; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx +; CHECK-NEXT: movb %bl, (%edx,%edi) +; CHECK-NEXT: incl %edi +; CHECK-NEXT: cmpl %esi, %edi ; CHECK-NEXT: jl LBB0_4 ; CHECK-NEXT: ## %bb.5: ## %bb9 ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebp, %edx -; 
CHECK-NEXT: cmpl %edi, %ecx +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: je LBB0_12 ; CHECK-NEXT: ## %bb.6: ## %bb7.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: jmp LBB0_4 ; CHECK-NEXT: LBB0_12: ## %bb18.loopexit +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %ebp, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: jle LBB0_13 ; CHECK-NEXT: ## %bb.7: ## %bb.nph5 -; CHECK-NEXT: cmpl $2, %ebp +; CHECK-NEXT: cmpl $2, %esi ; CHECK-NEXT: jl LBB0_13 ; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split -; CHECK-NEXT: movl %ebp, %edx -; CHECK-NEXT: shrl $31, %edx -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: sarl %edx +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: shrl $31, %ebp +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: sarl %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $31, %ecx @@ -84,12 +84,12 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_9: ## %bb13 @@ -97,89 +97,90 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: ## Child Loop BB0_10 Depth 2 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %edx, %edi ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%ecx,%esi) -; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%eax,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %edx, %esi +; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%ecx,%ebx) +; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%eax,%ebx) +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: cmpl %ebp, %ebx ; CHECK-NEXT: jl LBB0_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; CHECK-NEXT: incl %edi -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi 
## 4-byte Reload -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: addl %ebp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; CHECK-NEXT: jl LBB0_9 ; CHECK-NEXT: LBB0_13: ## %bb20 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: cmpl $1, %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.14: ## %bb20 -; CHECK-NEXT: cmpl $3, %eax +; CHECK-NEXT: cmpl $3, %ecx ; CHECK-NEXT: jne LBB0_24 ; CHECK-NEXT: ## %bb.15: ## %bb22 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_18 ; CHECK-NEXT: ## %bb.16: ## %bb.nph -; CHECK-NEXT: leal 15(%edi), %eax +; CHECK-NEXT: leal 15(%edx), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebx, %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: addl %ecx, %ebx -; CHECK-NEXT: addl %eax, %edx -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: addl %ebp, %ebp +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: addl %ecx, %ebp +; CHECK-NEXT: addl %eax, %ebx +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_17: ## %bb23 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi ; CHECK-NEXT: pushl %ebx -; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %ebp, %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %ebp, %ebx +; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %ebx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edi +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_17 ; CHECK-NEXT: LBB0_18: ## %bb26 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %esi, %edx ; CHECK-NEXT: jmp LBB0_23 ; CHECK-NEXT: LBB0_19: ## %bb29 -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_22 ; CHECK-NEXT: ## %bb.20: ## %bb.nph11 -; CHECK-NEXT: movl %edi, %esi -; CHECK-NEXT: leal 15(%ebp), 
%eax +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -187,30 +188,32 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: LBB0_21: ## %bb30 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx +; CHECK-NEXT: movl %ebp, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %edi -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %esi +; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: LBB0_22: ## %bb33 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx ; CHECK-NEXT: LBB0_23: ## %bb33 -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: sarl %eax ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl $128 -; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: pushl %edx ; CHECK-NEXT: calll _memset ; CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: LBB0_25: ## %return diff --git a/llvm/test/CodeGen/X86/overflow.ll b/llvm/test/CodeGen/X86/overflow.ll index 5900e7674cd0e6..b98ebdeb6b8904 100644 --- a/llvm/test/CodeGen/X86/overflow.ll +++ b/llvm/test/CodeGen/X86/overflow.ll @@ -10,23 +10,23 @@ define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind { ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: mull %ecx diff --git a/llvm/test/CodeGen/X86/peep-test-3.ll b/llvm/test/CodeGen/X86/peep-test-3.ll index 1e962169d9b08b..c5ee5c71f8df66 100644 --- a/llvm/test/CodeGen/X86/peep-test-3.ll +++ b/llvm/test/CodeGen/X86/peep-test-3.ll @@ -66,12 +66,12 @@ return: ; preds = %entry define void @and(ptr %A, i32 %IA, i32 %N, ptr %p) nounwind { ; CHECK-LABEL: and: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: xorl $1, %eax -; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: andl $3, %eax -; CHECK-NEXT: movb %al, (%ecx) +; CHECK-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: xorl $1, %ecx +; CHECK-NEXT: andl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: andl $3, %ecx +; CHECK-NEXT: movb %cl, (%eax) ; CHECK-NEXT: je .LBB2_2 ; CHECK-NEXT: # %bb.1: # %bb ; CHECK-NEXT: movl $0, 0 diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 8e6ae4b5526572..e8ee7cb9c7690e 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -722,78 +722,78 @@ entry: define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; SSE2-LABEL: mul_v64i8c: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117] -; SSE2-NEXT: pmullw %xmm4, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [117,117,117,117,117,117,117,117] +; SSE2-NEXT: pmullw %xmm5, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pmullw %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pmullw %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pmullw %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: packuswb %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v64i8c: ; SSE41: # %bb.0: 
# %entry -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [117,117,117,117,117,117,117,117] +; SSE41-NEXT: pmullw %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: pmullw %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm9, %xmm4 +; SSE41-NEXT: packuswb %xmm0, %xmm4 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117] -; SSE41-NEXT: pmullw %xmm6, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm7, %xmm1 -; SSE41-NEXT: pmullw %xmm6, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm6, %xmm4 -; SSE41-NEXT: pand %xmm7, %xmm4 -; SSE41-NEXT: pmullw %xmm6, %xmm1 -; SSE41-NEXT: pand %xmm7, %xmm1 -; SSE41-NEXT: packuswb %xmm4, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: pmullw %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm9, %xmm1 +; SSE41-NEXT: pmullw %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm9, %xmm5 +; SSE41-NEXT: packuswb %xmm1, %xmm5 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm6, %xmm2 -; SSE41-NEXT: pand %xmm7, %xmm2 -; SSE41-NEXT: pmullw %xmm6, %xmm4 -; SSE41-NEXT: pand %xmm7, %xmm4 -; SSE41-NEXT: packuswb %xmm2, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmullw %xmm8, %xmm2 +; SSE41-NEXT: pand %xmm9, %xmm2 +; SSE41-NEXT: pmullw %xmm8, %xmm6 +; SSE41-NEXT: pand %xmm9, %xmm6 +; SSE41-NEXT: packuswb %xmm2, %xmm6 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm6, %xmm3 -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pmullw %xmm6, %xmm5 -; SSE41-NEXT: pand %xmm7, %xmm5 -; SSE41-NEXT: packuswb %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pmullw %xmm8, %xmm3 +; SSE41-NEXT: pand %xmm9, %xmm3 +; SSE41-NEXT: pmullw %xmm8, %xmm7 +; SSE41-NEXT: pand %xmm9, %xmm7 +; SSE41-NEXT: packuswb %xmm3, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: movdqa 
%xmm7, %xmm3 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v64i8c: @@ -907,47 +907,47 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v64i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa %xmm1, %xmm8 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm4, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm9, %xmm1 -; SSE41-NEXT: pmullw %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm5, %xmm8 -; SSE41-NEXT: pand %xmm9, %xmm8 -; SSE41-NEXT: pmullw %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm9, %xmm1 -; SSE41-NEXT: packuswb %xmm8, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; SSE41-NEXT: pmullw %xmm4, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm10, %xmm8 +; SSE41-NEXT: pmullw %xmm9, %xmm0 +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: packuswb %xmm8, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm10, %xmm1 +; SSE41-NEXT: pmullw %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm10, %xmm4 +; SSE41-NEXT: packuswb %xmm1, %xmm4 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm6, %xmm2 -; SSE41-NEXT: pand %xmm9, %xmm2 -; SSE41-NEXT: pmullw %xmm5, 
%xmm4 -; SSE41-NEXT: pand %xmm9, %xmm4 -; SSE41-NEXT: packuswb %xmm2, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; SSE41-NEXT: pand %xmm10, %xmm2 +; SSE41-NEXT: pmullw %xmm1, %xmm5 +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: packuswb %xmm2, %xmm5 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm9, %xmm3 -; SSE41-NEXT: pmullw %xmm2, %xmm5 -; SSE41-NEXT: pand %xmm9, %xmm5 -; SSE41-NEXT: packuswb %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pand %xmm10, %xmm3 +; SSE41-NEXT: pmullw %xmm1, %xmm6 +; SSE41-NEXT: pand %xmm10, %xmm6 +; SSE41-NEXT: packuswb %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm6, %xmm3 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v64i8: @@ -1240,7 +1240,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -1248,43 +1248,43 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pxor %xmm13, %xmm13 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm13 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,1,1,3] +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; 
SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm10[0,1,1,3] ; SSE2-NEXT: pmuludq %xmm4, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm12 -; SSE2-NEXT: paddq %xmm14, %xmm12 -; SSE2-NEXT: psllq $32, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm9 +; SSE2-NEXT: paddq %xmm14, %xmm9 +; SSE2-NEXT: psllq $32, %xmm9 ; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: paddq %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,3,3] +; SSE2-NEXT: paddq %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm9, %xmm11 -; SSE2-NEXT: paddq %xmm4, %xmm11 -; SSE2-NEXT: psllq $32, %xmm11 -; SSE2-NEXT: pmuludq %xmm9, %xmm1 -; SSE2-NEXT: paddq %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm11, %xmm9 +; SSE2-NEXT: paddq %xmm4, %xmm9 +; SSE2-NEXT: psllq $32, %xmm9 +; SSE2-NEXT: pmuludq %xmm11, %xmm1 +; SSE2-NEXT: paddq %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[0,1,1,3] ; SSE2-NEXT: pmuludq %xmm6, %xmm9 ; SSE2-NEXT: paddq %xmm4, %xmm9 ; SSE2-NEXT: psllq $32, %xmm9 diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 1110146d3cda8c..d14558e88d37dd 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -1335,39 +1335,39 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 ; 
SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm15 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, 240(%rdi) ; SSE2-NEXT: movdqa %xmm7, 224(%rdi) ; SSE2-NEXT: movdqa %xmm6, 208(%rdi) @@ -1379,7 +1379,7 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) ; SSE2-NEXT: movdqa %xmm12, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm11, 64(%rdi) +; SSE2-NEXT: movdqa %xmm10, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) ; SSE2-NEXT: movdqa %xmm9, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) @@ -1392,31 +1392,31 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41-NEXT: movdqa %xmm0, %xmm8 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8 -; SSE41-NEXT: pxor %xmm11, %xmm11 +; SSE41-NEXT: pxor %xmm10, %xmm10 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE41-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; SSE41-NEXT: movdqa %xmm2, %xmm11 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm12 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm13 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm14 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm15 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; SSE41-NEXT: movdqa %xmm7, 240(%rdi) ; SSE41-NEXT: movdqa %xmm0, 224(%rdi) ; SSE41-NEXT: movdqa %xmm15, 208(%rdi) @@ -1431,7 +1431,7 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41-NEXT: movdqa %xmm12, 112(%rdi) ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: movdqa %xmm0, 96(%rdi) -; SSE41-NEXT: movdqa %xmm10, 80(%rdi) +; SSE41-NEXT: movdqa %xmm11, 80(%rdi) ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE41-NEXT: movdqa %xmm0, 64(%rdi) ; SSE41-NEXT: movdqa %xmm9, 48(%rdi) @@ -1516,39 +1516,39 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; SSE2-NEXT: movdqa %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm15 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, 240(%rdi) ; SSE2-NEXT: movdqa %xmm7, 224(%rdi) ; SSE2-NEXT: movdqa %xmm6, 208(%rdi) @@ -1560,7 +1560,7 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) ; SSE2-NEXT: movdqa %xmm12, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm11, 64(%rdi) +; SSE2-NEXT: movdqa %xmm10, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) ; SSE2-NEXT: movdqa %xmm9, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) @@ -1573,31 +1573,31 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41-NEXT: movdqa %xmm0, %xmm8 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8 -; SSE41-NEXT: pxor %xmm11, %xmm11 +; SSE41-NEXT: pxor %xmm10, %xmm10 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE41-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; SSE41-NEXT: movdqa %xmm2, %xmm11 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm12 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm13 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm14 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm15 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = 
xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; SSE41-NEXT: movdqa %xmm7, 240(%rdi) ; SSE41-NEXT: movdqa %xmm0, 224(%rdi) ; SSE41-NEXT: movdqa %xmm15, 208(%rdi) @@ -1612,7 +1612,7 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41-NEXT: movdqa %xmm12, 112(%rdi) ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: movdqa %xmm0, 96(%rdi) -; SSE41-NEXT: movdqa %xmm10, 80(%rdi) +; SSE41-NEXT: movdqa %xmm11, 80(%rdi) ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE41-NEXT: movdqa %xmm0, 64(%rdi) ; SSE41-NEXT: movdqa %xmm9, 48(%rdi) diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll index 5ed14ab6e0b976..307ce91682a377 100644 --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -382,39 +382,39 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X64: # %bb.0: ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 -; X64-NEXT: andq %r8, %rax +; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: subq %rax, %rsi -; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rdx, %rax ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %rcx, %rsi +; X64-NEXT: andq %rdx, %rsi ; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: shrq $4, %rdx -; X64-NEXT: addq %rax, %rdx -; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rsi, %rdx +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: shrq $4, %rsi +; X64-NEXT: addq %rax, %rsi +; X64-NEXT: movabsq $1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %r8, %rsi ; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 -; X64-NEXT: imulq %r9, %rdx -; X64-NEXT: shrq $56, %rdx +; X64-NEXT: imulq %r9, %rsi +; X64-NEXT: shrq $56, %rsi ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shrq %rax -; X64-NEXT: andq %r8, %rax +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: subq %rax, %rdi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rdx, %rax ; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rdi, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: andq %rdi, %rdx +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: movq %rdx, %rax ; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: andq %rsi, %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: andq %r8, %rax ; X64-NEXT: imulq %r9, %rax ; X64-NEXT: shrq $56, %rax -; X64-NEXT: addq %rdx, %rax +; X64-NEXT: addq %rsi, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq ; @@ -493,16 +493,16 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-SSSE3-LABEL: cnt128: ; X86-SSSE3: # %bb.0: ; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 -; X86-SSSE3-NEXT: pand %xmm1, %xmm3 -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
X86-SSSE3-NEXT: movdqa %xmm0, %xmm4 +; X86-SSSE3-NEXT: pand %xmm0, %xmm3 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; X86-SSSE3-NEXT: pshufb %xmm3, %xmm4 ; X86-SSSE3-NEXT: psrlw $4, %xmm2 -; X86-SSSE3-NEXT: pand %xmm1, %xmm2 -; X86-SSSE3-NEXT: movdqa %xmm0, %xmm3 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm3 ; X86-SSSE3-NEXT: pshufb %xmm2, %xmm3 ; X86-SSSE3-NEXT: paddb %xmm4, %xmm3 ; X86-SSSE3-NEXT: pxor %xmm2, %xmm2 @@ -510,15 +510,15 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-SSSE3-NEXT: movd %xmm3, %ecx ; X86-SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X86-SSSE3-NEXT: pand %xmm1, %xmm4 -; X86-SSSE3-NEXT: movdqa %xmm0, %xmm5 +; X86-SSSE3-NEXT: pand %xmm0, %xmm4 +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm5 ; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 ; X86-SSSE3-NEXT: psrlw $4, %xmm3 -; X86-SSSE3-NEXT: pand %xmm1, %xmm3 -; X86-SSSE3-NEXT: pshufb %xmm3, %xmm0 -; X86-SSSE3-NEXT: paddb %xmm5, %xmm0 -; X86-SSSE3-NEXT: psadbw %xmm2, %xmm0 -; X86-SSSE3-NEXT: movd %xmm0, %edx +; X86-SSSE3-NEXT: pand %xmm0, %xmm3 +; X86-SSSE3-NEXT: pshufb %xmm3, %xmm1 +; X86-SSSE3-NEXT: paddb %xmm5, %xmm1 +; X86-SSSE3-NEXT: psadbw %xmm2, %xmm1 +; X86-SSSE3-NEXT: movd %xmm1, %edx ; X86-SSSE3-NEXT: addl %ecx, %edx ; X86-SSSE3-NEXT: movl %edx, (%eax) ; X86-SSSE3-NEXT: movl $0, 12(%eax) @@ -798,14 +798,14 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NOSSE-NEXT: movl %ebx, %ecx ; X86-NOSSE-NEXT: shrl %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %edi, %ecx +; X86-NOSSE-NEXT: movl $1431655765, %eax # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %eax, %ecx ; X86-NOSSE-NEXT: subl %ecx, %ebx ; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 ; X86-NOSSE-NEXT: movl %ebx, %ebp @@ -816,52 +816,52 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-NOSSE-NEXT: movl %ebx, %ebp ; X86-NOSSE-NEXT: shrl $4, %ebp ; X86-NOSSE-NEXT: addl %ebx, %ebp -; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: movl %edi, %ebx ; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %edi, %ebx -; X86-NOSSE-NEXT: subl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: andl %eax, %ebx +; X86-NOSSE-NEXT: subl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: shrl $2, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: addl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: shrl $2, %edi +; X86-NOSSE-NEXT: andl %ecx, %edi +; X86-NOSSE-NEXT: addl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %eax +; X86-NOSSE-NEXT: shrl $4, %eax +; X86-NOSSE-NEXT: addl %edi, %eax ; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F ; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: andl %ebx, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: imull $16843009, %ebp, %ebp # imm = 
0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ebp +; X86-NOSSE-NEXT: andl %ebx, %eax +; X86-NOSSE-NEXT: imull $16843009, %eax, %edi # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: addl %ebp, %edi +; X86-NOSSE-NEXT: movl %edx, %eax ; X86-NOSSE-NEXT: shrl %eax ; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 ; X86-NOSSE-NEXT: andl %ebp, %eax -; X86-NOSSE-NEXT: subl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi -; X86-NOSSE-NEXT: addl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %esi, %ebp -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %esi, %eax ; X86-NOSSE-NEXT: subl %eax, %edx ; X86-NOSSE-NEXT: movl %edx, %eax ; X86-NOSSE-NEXT: andl %ecx, %eax ; X86-NOSSE-NEXT: shrl $2, %edx ; X86-NOSSE-NEXT: andl %ecx, %edx ; X86-NOSSE-NEXT: addl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: movl %edx, %ebp +; X86-NOSSE-NEXT: shrl $4, %ebp +; X86-NOSSE-NEXT: addl %edx, %ebp +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %edx, %eax +; X86-NOSSE-NEXT: subl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax ; X86-NOSSE-NEXT: shrl $4, %eax -; X86-NOSSE-NEXT: addl %edx, %eax +; X86-NOSSE-NEXT: addl %esi, %eax ; X86-NOSSE-NEXT: andl %ebx, %ebp ; X86-NOSSE-NEXT: andl %ebx, %eax ; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 @@ -886,39 +886,39 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X64: # %bb.0: ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 -; X64-NEXT: andq %r8, %rax +; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: subq %rax, %rsi -; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rdx, %rax ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %rcx, %rsi +; X64-NEXT: andq %rdx, %rsi ; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: shrq $4, %rdx -; X64-NEXT: addq %rax, %rdx -; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rsi, %rdx +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: shrq $4, %rsi +; X64-NEXT: addq %rax, %rsi +; X64-NEXT: movabsq $1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %r8, %rsi ; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 -; X64-NEXT: imulq %r9, %rdx -; X64-NEXT: shrq $56, %rdx +; X64-NEXT: imulq %r9, %rsi +; X64-NEXT: shrq $56, %rsi ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shrq %rax -; X64-NEXT: andq %r8, %rax +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: subq %rax, %rdi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rdx, %rax ; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rdi, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: andq %rdi, %rdx +; 
X64-NEXT: addq %rax, %rdx +; X64-NEXT: movq %rdx, %rax ; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: andq %rsi, %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: andq %r8, %rax ; X64-NEXT: imulq %r9, %rax ; X64-NEXT: shrq $56, %rax -; X64-NEXT: addq %rdx, %rax +; X64-NEXT: addq %rsi, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq ; @@ -999,16 +999,16 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-SSSE3-LABEL: cnt128_optsize: ; X86-SSSE3: # %bb.0: ; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 -; X86-SSSE3-NEXT: pand %xmm1, %xmm3 -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-SSSE3-NEXT: movdqa %xmm0, %xmm4 +; X86-SSSE3-NEXT: pand %xmm0, %xmm3 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; X86-SSSE3-NEXT: pshufb %xmm3, %xmm4 ; X86-SSSE3-NEXT: psrlw $4, %xmm2 -; X86-SSSE3-NEXT: pand %xmm1, %xmm2 -; X86-SSSE3-NEXT: movdqa %xmm0, %xmm3 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm3 ; X86-SSSE3-NEXT: pshufb %xmm2, %xmm3 ; X86-SSSE3-NEXT: paddb %xmm4, %xmm3 ; X86-SSSE3-NEXT: pxor %xmm2, %xmm2 @@ -1016,15 +1016,15 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-SSSE3-NEXT: movd %xmm3, %ecx ; X86-SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X86-SSSE3-NEXT: pand %xmm1, %xmm4 -; X86-SSSE3-NEXT: movdqa %xmm0, %xmm5 +; X86-SSSE3-NEXT: pand %xmm0, %xmm4 +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm5 ; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 ; X86-SSSE3-NEXT: psrlw $4, %xmm3 -; X86-SSSE3-NEXT: pand %xmm1, %xmm3 -; X86-SSSE3-NEXT: pshufb %xmm3, %xmm0 -; X86-SSSE3-NEXT: paddb %xmm5, %xmm0 -; X86-SSSE3-NEXT: psadbw %xmm2, %xmm0 -; X86-SSSE3-NEXT: movd %xmm0, %edx +; X86-SSSE3-NEXT: pand %xmm0, %xmm3 +; X86-SSSE3-NEXT: pshufb %xmm3, %xmm1 +; X86-SSSE3-NEXT: paddb %xmm5, %xmm1 +; X86-SSSE3-NEXT: psadbw %xmm2, %xmm1 +; X86-SSSE3-NEXT: movd %xmm1, %edx ; X86-SSSE3-NEXT: addl %ecx, %edx ; X86-SSSE3-NEXT: xorl %ecx, %ecx ; X86-SSSE3-NEXT: movl %ecx, 12(%eax) @@ -1227,14 +1227,14 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NOSSE-NEXT: movl %ebx, %ecx ; X86-NOSSE-NEXT: shrl %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %edi, %ecx +; X86-NOSSE-NEXT: movl $1431655765, %eax # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %eax, %ecx ; X86-NOSSE-NEXT: subl %ecx, %ebx ; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 ; X86-NOSSE-NEXT: movl %ebx, %ebp @@ -1245,52 +1245,52 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-NOSSE-NEXT: movl %ebx, %ebp ; X86-NOSSE-NEXT: shrl $4, %ebp ; X86-NOSSE-NEXT: addl %ebx, %ebp -; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: movl %edi, %ebx ; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %edi, %ebx -; X86-NOSSE-NEXT: 
subl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: andl %eax, %ebx +; X86-NOSSE-NEXT: subl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: shrl $2, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: addl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: shrl $2, %edi +; X86-NOSSE-NEXT: andl %ecx, %edi +; X86-NOSSE-NEXT: addl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %eax +; X86-NOSSE-NEXT: shrl $4, %eax +; X86-NOSSE-NEXT: addl %edi, %eax ; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F ; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: andl %ebx, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: imull $16843009, %ebp, %ebp # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ebp +; X86-NOSSE-NEXT: andl %ebx, %eax +; X86-NOSSE-NEXT: imull $16843009, %eax, %edi # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: addl %ebp, %edi +; X86-NOSSE-NEXT: movl %edx, %eax ; X86-NOSSE-NEXT: shrl %eax ; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 ; X86-NOSSE-NEXT: andl %ebp, %eax -; X86-NOSSE-NEXT: subl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi -; X86-NOSSE-NEXT: addl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %esi, %ebp -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %esi, %eax ; X86-NOSSE-NEXT: subl %eax, %edx ; X86-NOSSE-NEXT: movl %edx, %eax ; X86-NOSSE-NEXT: andl %ecx, %eax ; X86-NOSSE-NEXT: shrl $2, %edx ; X86-NOSSE-NEXT: andl %ecx, %edx ; X86-NOSSE-NEXT: addl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: movl %edx, %ebp +; X86-NOSSE-NEXT: shrl $4, %ebp +; X86-NOSSE-NEXT: addl %edx, %ebp +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %edx, %eax +; X86-NOSSE-NEXT: subl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax ; X86-NOSSE-NEXT: shrl $4, %eax -; X86-NOSSE-NEXT: addl %edx, %eax +; X86-NOSSE-NEXT: addl %esi, %eax ; X86-NOSSE-NEXT: andl %ebx, %ebp ; X86-NOSSE-NEXT: andl %ebx, %eax ; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 @@ -1315,39 +1315,39 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X64: # %bb.0: ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 -; X64-NEXT: andq %r8, %rax +; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: subq %rax, %rsi -; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rdx, %rax ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %rcx, %rsi +; X64-NEXT: andq %rdx, %rsi ; X64-NEXT: 
addq %rsi, %rax -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: shrq $4, %rdx -; X64-NEXT: addq %rax, %rdx -; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rsi, %rdx +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: shrq $4, %rsi +; X64-NEXT: addq %rax, %rsi +; X64-NEXT: movabsq $1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %r8, %rsi ; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 -; X64-NEXT: imulq %r9, %rdx -; X64-NEXT: shrq $56, %rdx +; X64-NEXT: imulq %r9, %rsi +; X64-NEXT: shrq $56, %rsi ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shrq %rax -; X64-NEXT: andq %r8, %rax +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: subq %rax, %rdi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rdx, %rax ; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rdi, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: andq %rdi, %rdx +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: movq %rdx, %rax ; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: andq %rsi, %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: andq %r8, %rax ; X64-NEXT: imulq %r9, %rax ; X64-NEXT: shrq $56, %rax -; X64-NEXT: addq %rdx, %rax +; X64-NEXT: addq %rsi, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq ; @@ -1428,16 +1428,16 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-SSSE3-LABEL: cnt128_pgso: ; X86-SSSE3: # %bb.0: ; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 -; X86-SSSE3-NEXT: pand %xmm1, %xmm3 -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-SSSE3-NEXT: movdqa %xmm0, %xmm4 +; X86-SSSE3-NEXT: pand %xmm0, %xmm3 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; X86-SSSE3-NEXT: pshufb %xmm3, %xmm4 ; X86-SSSE3-NEXT: psrlw $4, %xmm2 -; X86-SSSE3-NEXT: pand %xmm1, %xmm2 -; X86-SSSE3-NEXT: movdqa %xmm0, %xmm3 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm3 ; X86-SSSE3-NEXT: pshufb %xmm2, %xmm3 ; X86-SSSE3-NEXT: paddb %xmm4, %xmm3 ; X86-SSSE3-NEXT: pxor %xmm2, %xmm2 @@ -1445,15 +1445,15 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-SSSE3-NEXT: movd %xmm3, %ecx ; X86-SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X86-SSSE3-NEXT: pand %xmm1, %xmm4 -; X86-SSSE3-NEXT: movdqa %xmm0, %xmm5 +; X86-SSSE3-NEXT: pand %xmm0, %xmm4 +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm5 ; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 ; X86-SSSE3-NEXT: psrlw $4, %xmm3 -; X86-SSSE3-NEXT: pand %xmm1, %xmm3 -; X86-SSSE3-NEXT: pshufb %xmm3, %xmm0 -; X86-SSSE3-NEXT: paddb %xmm5, %xmm0 -; X86-SSSE3-NEXT: psadbw %xmm2, %xmm0 -; X86-SSSE3-NEXT: movd %xmm0, %edx +; X86-SSSE3-NEXT: pand %xmm0, %xmm3 +; X86-SSSE3-NEXT: pshufb %xmm3, %xmm1 +; X86-SSSE3-NEXT: paddb %xmm5, %xmm1 +; X86-SSSE3-NEXT: psadbw %xmm2, %xmm1 +; X86-SSSE3-NEXT: movd %xmm1, %edx ; X86-SSSE3-NEXT: addl %ecx, %edx ; X86-SSSE3-NEXT: xorl %ecx, %ecx ; X86-SSSE3-NEXT: movl %ecx, 12(%eax) diff --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll index 90fb76a23e9a5d..e18afb7dfa7cac 100644 --- a/llvm/test/CodeGen/X86/pr32284.ll +++ b/llvm/test/CodeGen/X86/pr32284.ll @@ -366,15 +366,15 @@ define void 
@f2() { ; X86: # %bb.0: # %entry ; X86-NEXT: subl $2, %esp ; X86-NEXT: .cfi_def_cfa_offset 6 -; X86-NEXT: movzbl var_7, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: sete %al -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movzbl var_7, %eax +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %cl +; X86-NEXT: movl %eax, %edx +; X86-NEXT: xorl %ecx, %edx ; X86-NEXT: movw %dx, (%esp) ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: sete %dl ; X86-NEXT: movw %dx, (%eax) ; X86-NEXT: addl $2, %esp diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll index d9671aa04f4603..9d6174e9d92583 100644 --- a/llvm/test/CodeGen/X86/pr32329.ll +++ b/llvm/test/CodeGen/X86/pr32329.ll @@ -30,7 +30,7 @@ define void @foo() local_unnamed_addr { ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movsbl var_27, %eax -; X86-NEXT: movzwl var_2, %ebx +; X86-NEXT: movzwl var_2, %edi ; X86-NEXT: movl var_310, %ecx ; X86-NEXT: imull %eax, %ecx ; X86-NEXT: addl var_24, %ecx @@ -38,21 +38,21 @@ define void @foo() local_unnamed_addr { ; X86-NEXT: andl obj, %esi ; X86-NEXT: leal (%esi,%esi), %edx ; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: subl %edi, %ebx +; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: addb $113, %cl -; X86-NEXT: movl $9, %ebx +; X86-NEXT: movl $9, %edi ; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: shldl %cl, %ebx, %ebp -; X86-NEXT: shll %cl, %ebx +; X86-NEXT: shldl %cl, %edi, %ebp +; X86-NEXT: shll %cl, %edi ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %ebx, %ebp +; X86-NEXT: cmovnel %edi, %ebp ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovnel %ecx, %ebx -; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: cmovnel %ecx, %edi +; X86-NEXT: cmpl %esi, %ebx ; X86-NEXT: movl %ebp, var_50+4 -; X86-NEXT: movl %ebx, var_50 +; X86-NEXT: movl %edi, var_50 ; X86-NEXT: setge var_205 ; X86-NEXT: imull %eax, %edx ; X86-NEXT: movb %dl, var_218 diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll index 8e0532e6065280..f7278c3397f3db 100644 --- a/llvm/test/CodeGen/X86/pr38795.ll +++ b/llvm/test/CodeGen/X86/pr38795.ll @@ -23,16 +23,17 @@ define dso_local void @fn() { ; CHECK-NEXT: .cfi_offset %ebx, -12 ; CHECK-NEXT: .cfi_offset %ebp, -8 ; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: # implicit-def: $esi +; CHECK-NEXT: # implicit-def: $ecx ; CHECK-NEXT: # implicit-def: $edi -; CHECK-NEXT: # implicit-def: $ch +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: # kill: killed $al ; CHECK-NEXT: # implicit-def: $dl ; CHECK-NEXT: # implicit-def: $ebp ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_14: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movb %dl, %ch -; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: movb %dh, %dl ; CHECK-NEXT: .LBB0_1: # %for.cond ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_22 Depth 2 @@ -45,12 +46,11 @@ define dso_local void @fn() { ; CHECK-NEXT: je .LBB0_3 ; CHECK-NEXT: # %bb.4: # %if.end ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movl %esi, %ecx -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl a ; CHECK-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; CHECK-NEXT: # kill: def $cl killed $cl killed 
$ecx +; CHECK-NEXT: movb %cl, %dh ; CHECK-NEXT: movl $0, h ; CHECK-NEXT: cmpb $8, %dl ; CHECK-NEXT: jg .LBB0_8 @@ -58,22 +58,22 @@ define dso_local void @fn() { ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movl $.str, (%esp) -; CHECK-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; CHECK-NEXT: calll printf -; CHECK-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload ; CHECK-NEXT: testb %bl, %bl +; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: # implicit-def: $eax -; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload +; CHECK-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: movb %dh, %dl ; CHECK-NEXT: je .LBB0_6 ; CHECK-NEXT: jmp .LBB0_18 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_3: # %if.then ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl $.str, (%esp) -; CHECK-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; CHECK-NEXT: calll printf -; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; CHECK-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload ; CHECK-NEXT: # implicit-def: $eax ; CHECK-NEXT: .LBB0_6: # %for.cond35 @@ -96,31 +96,20 @@ define dso_local void @fn() { ; CHECK-NEXT: calll printf ; CHECK-NEXT: .LBB0_21: # %for.end46 ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: # implicit-def: $ch -; CHECK-NEXT: # implicit-def: $cl +; CHECK-NEXT: # implicit-def: $dl +; CHECK-NEXT: # implicit-def: $dh ; CHECK-NEXT: # implicit-def: $ebp ; CHECK-NEXT: jmp .LBB0_22 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_8: # %if.end21 ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: # implicit-def: $ebp -; CHECK-NEXT: testb %bl, %bl -; CHECK-NEXT: je .LBB0_13 +; CHECK-NEXT: jmp .LBB0_9 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_10: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: # implicit-def: $eax -; CHECK-NEXT: testb %bl, %bl -; CHECK-NEXT: je .LBB0_19 -; CHECK-NEXT: .LBB0_12: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: # implicit-def: $edi -; CHECK-NEXT: # implicit-def: $ch -; CHECK-NEXT: # implicit-def: $dl -; CHECK-NEXT: # implicit-def: $ebp -; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: jne .LBB0_11 ; CHECK-NEXT: .LBB0_7: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: movb %dl, %cl +; CHECK-NEXT: movb %dl, %dh +; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_22: # %for.cond47 ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -131,13 +120,13 @@ define dso_local void @fn() { ; CHECK-NEXT: # in Loop: Header=BB0_22 Depth=2 ; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne .LBB0_22 -; CHECK-NEXT: # %bb.24: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movb %ch, %dl +; CHECK-NEXT: .LBB0_9: # %ae +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne .LBB0_10 -; CHECK-NEXT: .LBB0_13: # %if.end26 +; CHECK-NEXT: # %bb.13: # %if.end26 ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testb %dl, %dl ; CHECK-NEXT: je .LBB0_14 ; CHECK-NEXT: # %bb.15: # %if.end26 @@ -146,17 +135,31 @@ define dso_local void @fn() { ; CHECK-NEXT: jne .LBB0_16 ; CHECK-NEXT: # %bb.17: # %if.then31 ; CHECK-NEXT: # 
in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: movb %dl, %ch +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; CHECK-NEXT: xorl %ebp, %ebp ; CHECK-NEXT: .LBB0_18: # %for.inc ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: movb %dh, %dl ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_10: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: # implicit-def: $eax +; CHECK-NEXT: testb %bl, %bl +; CHECK-NEXT: je .LBB0_19 +; CHECK-NEXT: .LBB0_12: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: # implicit-def: $edi +; CHECK-NEXT: # implicit-def: $cl +; CHECK-NEXT: # kill: killed $cl +; CHECK-NEXT: # implicit-def: $dl +; CHECK-NEXT: # implicit-def: $ebp +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jne .LBB0_11 +; CHECK-NEXT: jmp .LBB0_7 +; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_16: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movb %dl, %ch -; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: movb %dh, %dl ; CHECK-NEXT: jmp .LBB0_1 entry: br label %for.cond diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll index 2fb4410567be69..f3d784a86a5c23 100644 --- a/llvm/test/CodeGen/X86/pr43820.ll +++ b/llvm/test/CodeGen/X86/pr43820.ll @@ -10,26 +10,25 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12 ; CHECK-NEXT: bswapq %r12 -; CHECK-NEXT: movq %r12, %r10 -; CHECK-NEXT: shrq $4, %r10 -; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: andq %rsi, %r12 +; CHECK-NEXT: movq %r12, %r11 +; CHECK-NEXT: shrq $4, %r11 +; CHECK-NEXT: movabsq $1085102592571150095, %rbx # imm = 0xF0F0F0F0F0F0F0F +; CHECK-NEXT: andq %rbx, %r11 +; CHECK-NEXT: andq %rbx, %r12 ; CHECK-NEXT: shlq $4, %r12 -; CHECK-NEXT: orq %r10, %r12 -; CHECK-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333 +; CHECK-NEXT: orq %r11, %r12 +; CHECK-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 ; CHECK-NEXT: movq %r12, %r13 -; CHECK-NEXT: andq %r10, %r13 +; CHECK-NEXT: andq %r11, %r13 ; CHECK-NEXT: shrq $2, %r12 -; CHECK-NEXT: andq %r10, %r12 +; CHECK-NEXT: andq %r11, %r12 ; CHECK-NEXT: leaq (%r12,%r13,4), %r12 ; CHECK-NEXT: movabsq $6148914691230924800, %r13 # imm = 0x5555555555000000 ; CHECK-NEXT: movq %r12, %rbp @@ -38,283 +37,281 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r13, %r12 ; CHECK-NEXT: leaq (%r12,%rbp,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: bswapq %r14 -; CHECK-NEXT: movq %r14, %r12 -; CHECK-NEXT: shrq $4, %r12 -; CHECK-NEXT: andq %rsi, %r12 -; CHECK-NEXT: andq %rsi, %r14 -; CHECK-NEXT: shlq $4, %r14 -; CHECK-NEXT: orq %r12, %r14 -; CHECK-NEXT: movq %r14, %r12 -; CHECK-NEXT: andq %r10, %r12 -; CHECK-NEXT: shrq $2, %r14 -; CHECK-NEXT: andq %r10, %r14 -; CHECK-NEXT: leaq (%r14,%r12,4), %r12 -; CHECK-NEXT: movabsq $6148914691236517205, 
%r14 # imm = 0x5555555555555555 -; CHECK-NEXT: movq %r12, %r13 -; CHECK-NEXT: andq %r14, %r13 -; CHECK-NEXT: shrq %r12 -; CHECK-NEXT: andq %r14, %r12 -; CHECK-NEXT: leaq (%r12,%r13,2), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r15 ; CHECK-NEXT: movq %r15, %r12 ; CHECK-NEXT: shrq $4, %r12 -; CHECK-NEXT: andq %rsi, %r12 -; CHECK-NEXT: andq %rsi, %r15 +; CHECK-NEXT: andq %rbx, %r12 +; CHECK-NEXT: andq %rbx, %r15 ; CHECK-NEXT: shlq $4, %r15 ; CHECK-NEXT: orq %r12, %r15 ; CHECK-NEXT: movq %r15, %r12 -; CHECK-NEXT: andq %r10, %r12 +; CHECK-NEXT: andq %r11, %r12 ; CHECK-NEXT: shrq $2, %r15 -; CHECK-NEXT: andq %r10, %r15 -; CHECK-NEXT: leaq (%r15,%r12,4), %r15 -; CHECK-NEXT: movq %r15, %r12 -; CHECK-NEXT: andq %r14, %r12 -; CHECK-NEXT: shrq %r15 -; CHECK-NEXT: andq %r14, %r15 -; CHECK-NEXT: leaq (%r15,%r12,2), %rax +; CHECK-NEXT: andq %r11, %r15 +; CHECK-NEXT: leaq (%r15,%r12,4), %r12 +; CHECK-NEXT: movabsq $6148914691236517205, %r15 # imm = 0x5555555555555555 +; CHECK-NEXT: movq %r12, %r13 +; CHECK-NEXT: andq %r15, %r13 +; CHECK-NEXT: shrq %r12 +; CHECK-NEXT: andq %r15, %r12 +; CHECK-NEXT: leaq (%r12,%r13,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: bswapq %r14 +; CHECK-NEXT: movq %r14, %r12 +; CHECK-NEXT: shrq $4, %r12 +; CHECK-NEXT: andq %rbx, %r12 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: orq %r12, %r14 +; CHECK-NEXT: movq %r14, %r12 +; CHECK-NEXT: andq %r11, %r12 +; CHECK-NEXT: shrq $2, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: leaq (%r14,%r12,4), %r14 +; CHECK-NEXT: movq %r14, %r12 +; CHECK-NEXT: andq %r15, %r12 +; CHECK-NEXT: shrq %r14 +; CHECK-NEXT: andq %r15, %r14 +; CHECK-NEXT: leaq (%r14,%r12,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: bswapq %rbx -; CHECK-NEXT: movq %rbx, %r15 -; CHECK-NEXT: shrq $4, %r15 -; CHECK-NEXT: andq %rsi, %r15 -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: shlq $4, %rbx -; CHECK-NEXT: orq %r15, %rbx -; CHECK-NEXT: movq %rbx, %r15 -; CHECK-NEXT: andq %r10, %r15 -; CHECK-NEXT: shrq $2, %rbx -; CHECK-NEXT: andq %r10, %rbx -; CHECK-NEXT: leaq (%rbx,%r15,4), %rbx -; CHECK-NEXT: movq %rbx, %r15 -; CHECK-NEXT: andq %r14, %r15 -; CHECK-NEXT: shrq %rbx -; CHECK-NEXT: andq %r14, %rbx -; CHECK-NEXT: leaq (%rbx,%r15,2), %rax +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: shrq $4, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: shlq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: andq %r11, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r15, %r14 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: andq %r15, %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: andq %rbx, %rdi ; CHECK-NEXT: shlq $4, %rdi -; CHECK-NEXT: orq %rbx, %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: orq %r10, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r11, %r10 ; CHECK-NEXT: shrq $2, %rdi -; CHECK-NEXT: andq %r10, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi -; CHECK-NEXT: 
movq %rdi, %rbx -; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: andq %r11, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,4), %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r15, %r10 ; CHECK-NEXT: shrq %rdi -; CHECK-NEXT: andq %r14, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax +; CHECK-NEXT: andq %r15, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: andq %rbx, %rdi ; CHECK-NEXT: shlq $4, %rdi -; CHECK-NEXT: orq %rbx, %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: orq %r10, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r11, %r10 ; CHECK-NEXT: shrq $2, %rdi -; CHECK-NEXT: andq %r10, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: andq %r11, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,4), %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r15, %r10 ; CHECK-NEXT: shrq %rdi -; CHECK-NEXT: andq %r14, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax +; CHECK-NEXT: andq %r15, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: andq %rbx, %rdi ; CHECK-NEXT: shlq $4, %rdi -; CHECK-NEXT: orq %rbx, %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: orq %r10, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r11, %r10 ; CHECK-NEXT: shrq $2, %rdi -; CHECK-NEXT: andq %r10, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: andq %r11, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,4), %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r15, %r10 ; CHECK-NEXT: shrq %rdi -; CHECK-NEXT: andq %r14, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax +; CHECK-NEXT: andq %r15, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: andq %rbx, %rdi ; CHECK-NEXT: shlq $4, %rdi -; CHECK-NEXT: orq %rbx, %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: orq %r10, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r11, %r10 ; CHECK-NEXT: shrq $2, %rdi -; CHECK-NEXT: andq %r10, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: andq %r11, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,4), %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r15, %r10 ; CHECK-NEXT: shrq %rdi -; CHECK-NEXT: andq %r14, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax +; CHECK-NEXT: andq %r15, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte 
Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: andq %rbx, %rdi ; CHECK-NEXT: shlq $4, %rdi -; CHECK-NEXT: orq %rbx, %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: orq %r10, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r11, %r10 ; CHECK-NEXT: shrq $2, %rdi -; CHECK-NEXT: andq %r10, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: andq %r11, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,4), %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r15, %r10 ; CHECK-NEXT: shrq %rdi -; CHECK-NEXT: andq %r14, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax +; CHECK-NEXT: andq %r15, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: andq %rbx, %rdi ; CHECK-NEXT: shlq $4, %rdi -; CHECK-NEXT: orq %rbx, %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: orq %r10, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r11, %r10 ; CHECK-NEXT: shrq $2, %rdi -; CHECK-NEXT: andq %r10, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: andq %r11, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,4), %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %r15, %r10 ; CHECK-NEXT: shrq %rdi -; CHECK-NEXT: andq %r14, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax +; CHECK-NEXT: andq %r15, %rdi +; CHECK-NEXT: leaq (%rdi,%r10,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: andq %rbx, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rax, %rdi ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: andq %r10, %rax +; CHECK-NEXT: andq %r11, %rax ; CHECK-NEXT: shrq $2, %rdi -; CHECK-NEXT: andq %r10, %rdi +; CHECK-NEXT: andq %r11, %rdi ; CHECK-NEXT: leaq (%rdi,%rax,4), %rax ; CHECK-NEXT: movq %rax, %rdi -; CHECK-NEXT: andq %r14, %rdi +; CHECK-NEXT: andq %r15, %rdi ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rdi,2), %rdi +; CHECK-NEXT: andq %r15, %rax +; CHECK-NEXT: leaq (%rax,%rdi,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r9 ; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %r9 +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: andq %rbx, %r9 ; CHECK-NEXT: shlq $4, %r9 ; CHECK-NEXT: orq %rax, %r9 ; CHECK-NEXT: movq %r9, %rax -; CHECK-NEXT: andq %r10, %rax +; CHECK-NEXT: andq %r11, %rax ; CHECK-NEXT: shrq $2, %r9 -; CHECK-NEXT: andq %r10, %r9 +; CHECK-NEXT: andq %r11, %r9 ; CHECK-NEXT: leaq (%r9,%rax,4), %rax ; CHECK-NEXT: movq %rax, %r9 -; CHECK-NEXT: andq %r14, %r9 +; CHECK-NEXT: andq %r15, %r9 ; CHECK-NEXT: shrq %rax -; 
CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%r9,2), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: andq %r15, %rax +; CHECK-NEXT: leaq (%rax,%r9,2), %r12 ; CHECK-NEXT: bswapq %r8 ; CHECK-NEXT: movq %r8, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %r8 +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: andq %rbx, %r8 ; CHECK-NEXT: shlq $4, %r8 ; CHECK-NEXT: orq %rax, %r8 ; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: andq %r10, %rax +; CHECK-NEXT: andq %r11, %rax ; CHECK-NEXT: shrq $2, %r8 -; CHECK-NEXT: andq %r10, %r8 +; CHECK-NEXT: andq %r11, %r8 ; CHECK-NEXT: leaq (%r8,%rax,4), %rax ; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: andq %r14, %r8 +; CHECK-NEXT: andq %r15, %r8 ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%r8,2), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: andq %r15, %rax +; CHECK-NEXT: leaq (%rax,%r8,2), %r10 ; CHECK-NEXT: bswapq %rcx ; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: andq %rbx, %rcx ; CHECK-NEXT: shlq $4, %rcx ; CHECK-NEXT: orq %rax, %rcx ; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: andq %r10, %rax +; CHECK-NEXT: andq %r11, %rax ; CHECK-NEXT: shrq $2, %rcx -; CHECK-NEXT: andq %r10, %rcx +; CHECK-NEXT: andq %r11, %rcx ; CHECK-NEXT: leaq (%rcx,%rax,4), %rax ; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: andq %r14, %rcx +; CHECK-NEXT: andq %r15, %rcx ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rcx,2), %rbx +; CHECK-NEXT: andq %r15, %rax +; CHECK-NEXT: leaq (%rax,%rcx,2), %rdi ; CHECK-NEXT: bswapq %rdx ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rdx +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: andq %rbx, %rdx ; CHECK-NEXT: shlq $4, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: andq %r10, %rax +; CHECK-NEXT: andq %r11, %rax ; CHECK-NEXT: shrq $2, %rdx -; CHECK-NEXT: andq %r10, %rdx +; CHECK-NEXT: andq %r11, %rdx ; CHECK-NEXT: leaq (%rdx,%rax,4), %rax ; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: andq %r14, %rdx +; CHECK-NEXT: andq %r15, %rdx ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %r14, %rax +; CHECK-NEXT: andq %r15, %rax ; CHECK-NEXT: leaq (%rax,%rdx,2), %rdx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: bswapq %rsi +; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: shlq $4, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: andq %r10, %rax -; CHECK-NEXT: shrq $2, %rcx -; CHECK-NEXT: andq %r10, %rcx -; CHECK-NEXT: leaq (%rcx,%rax,4), %rax +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: andq %rbx, %rsi +; CHECK-NEXT: shlq $4, %rsi +; CHECK-NEXT: orq %rax, %rsi +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: shrq $2, %rsi +; CHECK-NEXT: andq %r11, %rsi +; CHECK-NEXT: leaq (%rsi,%rax,4), %rax ; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: andq %r14, %rsi +; CHECK-NEXT: andq %r15, %rsi ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %r14, %rax +; CHECK-NEXT: andq %r15, %rax ; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), 
%r11 # 8-byte Reload ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rax, %r10 +; CHECK-NEXT: shrdq $24, %rax, %r11 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rcx, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -323,43 +320,41 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r13, %rbp -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r12, %r13 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r15, %r12 +; CHECK-NEXT: shrdq $24, %r15, %r13 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r14, %r15 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r11, %r14 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; CHECK-NEXT: shrdq $24, %rbx, %r14 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r9, %r11 -; CHECK-NEXT: movq %rdi, %r8 -; CHECK-NEXT: shrdq $24, %rdi, %r9 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rdi, %r8 +; CHECK-NEXT: shrdq $24, %r9, %rbx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; CHECK-NEXT: shrdq $24, %r8, %r9 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rcx, %rdi -; CHECK-NEXT: shrdq $24, %rbx, %rcx -; CHECK-NEXT: shrdq $24, %rdx, %rbx +; CHECK-NEXT: shrdq $24, %rcx, %r8 +; CHECK-NEXT: shrdq $24, %r12, %rcx +; CHECK-NEXT: shrdq $24, %r10, %r12 +; CHECK-NEXT: shrdq $24, %rdi, %r10 +; CHECK-NEXT: shrdq $24, %rdx, %rdi ; CHECK-NEXT: shrdq $24, %rsi, %rdx ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-NEXT: movq %rdx, 112(%rax) -; CHECK-NEXT: movq %rbx, 104(%rax) -; CHECK-NEXT: movq %rcx, 96(%rax) -; CHECK-NEXT: movq %rdi, 88(%rax) -; CHECK-NEXT: movq %r8, 80(%rax) -; CHECK-NEXT: movq %r9, 72(%rax) -; CHECK-NEXT: movq %r11, 64(%rax) -; CHECK-NEXT: movq %r14, 56(%rax) -; CHECK-NEXT: movq %r15, 48(%rax) -; CHECK-NEXT: movq %r12, 40(%rax) +; CHECK-NEXT: movq %rdi, 104(%rax) +; CHECK-NEXT: movq %r10, 96(%rax) +; CHECK-NEXT: movq %r12, 88(%rax) +; CHECK-NEXT: movq %rcx, 80(%rax) +; CHECK-NEXT: movq %r8, 72(%rax) +; CHECK-NEXT: movq %r9, 64(%rax) +; CHECK-NEXT: movq %rbx, 56(%rax) +; CHECK-NEXT: movq %r14, 48(%rax) +; CHECK-NEXT: movq %r15, 40(%rax) ; CHECK-NEXT: movq %r13, 32(%rax) ; CHECK-NEXT: movq %rbp, 24(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 16(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 8(%rax) -; CHECK-NEXT: movq %r10, (%rax) +; CHECK-NEXT: movq %r11, (%rax) ; CHECK-NEXT: movq %rsi, %rcx ; CHECK-NEXT: shrq $56, %rsi ; CHECK-NEXT: movb %sil, 124(%rax) diff --git a/llvm/test/CodeGen/X86/pr44915.ll b/llvm/test/CodeGen/X86/pr44915.ll index 1ebdd9ccb3190f..0e2908d1dd20b1 100644 --- a/llvm/test/CodeGen/X86/pr44915.ll +++ b/llvm/test/CodeGen/X86/pr44915.ll @@ -10,25 +10,25 @@ define i32 @extract3(ptr, i32) nounwind { ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl 24(%esp), %esi ; X86-NEXT: andl $7, %esi -; X86-NEXT: movl 20(%esp), %eax -; X86-NEXT: movzwl (%eax), %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shrb $3, %cl -; 
X86-NEXT: andb $7, %cl -; X86-NEXT: movb %bl, %ch -; X86-NEXT: andb $7, %ch -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shrl $6, %eax -; X86-NEXT: andb $7, %al -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shrl $9, %edx +; X86-NEXT: movl 20(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: shrb $3, %bl +; X86-NEXT: andb $7, %bl +; X86-NEXT: movb %cl, %bh +; X86-NEXT: andb $7, %bh +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $6, %edx ; X86-NEXT: andb $7, %dl -; X86-NEXT: shrl $12, %ebx -; X86-NEXT: movb %bl, 4(%esp) -; X86-NEXT: movb %dl, 3(%esp) -; X86-NEXT: movb %al, 2(%esp) -; X86-NEXT: movb %ch, (%esp) -; X86-NEXT: movb %cl, 1(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $9, %eax +; X86-NEXT: andb $7, %al +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: movb %cl, 4(%esp) +; X86-NEXT: movb %al, 3(%esp) +; X86-NEXT: movb %dl, 2(%esp) +; X86-NEXT: movb %bh, (%esp) +; X86-NEXT: movb %bl, 1(%esp) ; X86-NEXT: movzbl (%esp,%esi), %eax ; X86-NEXT: andl $7, %eax ; X86-NEXT: addl $8, %esp diff --git a/llvm/test/CodeGen/X86/pr44976.ll b/llvm/test/CodeGen/X86/pr44976.ll index 7c8d5e099ca670..aeca49290aa0f4 100644 --- a/llvm/test/CodeGen/X86/pr44976.ll +++ b/llvm/test/CodeGen/X86/pr44976.ll @@ -14,26 +14,26 @@ define <3 x i32> @f_29(<12 x i16> %a, <12 x i16> %b) { ; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-NEXT: movd %r9d, %xmm1 -; CHECK-NEXT: movd %r8d, %xmm3 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-NEXT: movd %esi, %xmm4 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: movd %r9d, %xmm0 +; CHECK-NEXT: movd %r8d, %xmm2 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: movd %edx, %xmm4 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; CHECK-NEXT: movd %esi, %xmm0 ; CHECK-NEXT: movd %edi, %xmm1 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; CHECK-NEXT: punpckldq {{.*#+}} 
xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: pinsrw $1, {{[0-9]+}}(%rsp), %xmm4 ; CHECK-NEXT: pinsrw $2, {{[0-9]+}}(%rsp), %xmm4 @@ -42,44 +42,44 @@ define <3 x i32> @f_29(<12 x i16> %a, <12 x i16> %b) { ; CHECK-NEXT: pinsrw $1, {{[0-9]+}}(%rsp), %xmm2 ; CHECK-NEXT: pinsrw $2, {{[0-9]+}}(%rsp), %xmm2 ; CHECK-NEXT: pinsrw $3, {{[0-9]+}}(%rsp), %xmm2 -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: pmulhuw %xmm0, %xmm3 -; CHECK-NEXT: pmullw %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm5 +; CHECK-NEXT: pmulhuw %xmm3, %xmm5 +; CHECK-NEXT: pmullw %xmm3, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,2,3,3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; CHECK-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; CHECK-NEXT: movdqa %xmm2, %xmm7 -; CHECK-NEXT: pmulhuw %xmm4, %xmm7 -; CHECK-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,0,2,1,4,5,6,7] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,2,3,3] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; CHECK-NEXT: movdqa %xmm2, %xmm6 +; CHECK-NEXT: pmulhuw %xmm4, %xmm6 +; CHECK-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,0,65535,0,65535,0] +; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; CHECK-NEXT: pmullw %xmm4, %xmm2 ; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; CHECK-NEXT: pand %xmm3, %xmm5 -; CHECK-NEXT: pandn %xmm6, %xmm3 -; CHECK-NEXT: por %xmm5, %xmm3 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; CHECK-NEXT: movdqa %xmm3, %xmm4 -; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; CHECK-NEXT: movdqa %xmm0, %xmm5 -; CHECK-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm4[2,0] -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,3,3,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; CHECK-NEXT: movdqa %xmm3, %xmm6 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] -; CHECK-NEXT: paddd %xmm5, %xmm0 -; CHECK-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; CHECK-NEXT: paddd %xmm4, %xmm3 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; CHECK-NEXT: pand %xmm7, %xmm3 +; CHECK-NEXT: pandn %xmm5, %xmm7 +; CHECK-NEXT: por %xmm3, %xmm7 +; CHECK-NEXT: punpcklwd 
{{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; CHECK-NEXT: movdqa %xmm7, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; CHECK-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[2,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-NEXT: movdqa %xmm7, %xmm5 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[2,0] +; CHECK-NEXT: paddd %xmm4, %xmm0 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; CHECK-NEXT: paddd %xmm3, %xmm7 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,3] +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm7[1,3] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,3] ; CHECK-NEXT: paddd %xmm1, %xmm0 ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll index 56618205ec7c1c..d4e7f4ff9d4aa9 100644 --- a/llvm/test/CodeGen/X86/pr46877.ll +++ b/llvm/test/CodeGen/X86/pr46877.ll @@ -5,118 +5,117 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15, float %16, float %17, float %18, float %19, float %20, float %21, float %22, float %23, float %24, float %25, float %26, float %27, float %28, float %29, float %30, float %31, float %32, float %33, float %34, float %35, float %36, float %37, float %38, float %39, float %40, float %41, float %42, float %43, float %44, float %45, float %46, float %47, float %48, float %49, float %50, float %51, float %52, float %53, float %54, float %55, float %56, float %57, float %58, float %59, float %60, float %61, float %62, float %63, float %64, float %65, float %66, float %67, float %68, float %69, float %70, float %71, float %72, float %73, float %74, float %75, float %76, float %77, float %78, float %79, ptr %80) { ; CHECK-LABEL: tester: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps %xmm3, %xmm13 -; CHECK-NEXT: vmovaps %xmm1, %xmm14 -; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovaps %xmm3, %xmm12 +; CHECK-NEXT: vmovaps %xmm1, %xmm15 +; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vmulss %xmm2, %xmm15, %xmm2 +; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm2 = (xmm3 * xmm2) - xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm1 * xmm5) + xmm0 +; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm3 +; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vmulss %xmm6, %xmm1, %xmm3 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm7 * xmm3) + xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vmulss %xmm0, %xmm10, %xmm3 +; CHECK-NEXT: vmovss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vmulss %xmm3, %xmm14, %xmm3 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm4 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm4 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, 
%xmm5 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm13 * xmm5) + xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm5 +; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm3 ; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero -; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm12 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm2 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss %xmm6, %xmm12, %xmm2 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm2 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm6 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm14 * xmm6) + xmm0 +; CHECK-NEXT: vmulss %xmm5, %xmm6, %xmm5 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm2 ; CHECK-NEXT: vmovss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmulss %xmm2, %xmm8, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm3 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm3 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm11 * xmm4) + xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm4, %xmm4 -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm13, %xmm6 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm1 * xmm6) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm6, %xmm3 ; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm5 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm8 * xmm5) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm1 -; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmovaps %xmm5, %xmm15 -; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm5 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm9 * xmm5) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm9 -; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm9 * xmm3) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm13 * xmm3) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm3 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm14, %xmm5 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm10 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm10 * 
xmm6) + xmm0 +; CHECK-NEXT: vmulss %xmm5, %xmm6, %xmm5 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm11 +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm11 * xmm4) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmulss %xmm5, %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm12 * xmm4) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm7 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm10 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm10, %xmm5 -; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss %xmm7, %xmm10, %xmm7 +; CHECK-NEXT: vmulss %xmm3, %xmm7, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm5 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm14, %xmm7 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm7, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm14 * xmm5) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm15 * xmm7) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm10, %xmm5 -; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss %xmm7, %xmm10, %xmm7 +; CHECK-NEXT: vmulss %xmm3, %xmm7, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm1 -; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm1 * xmm10) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm10, %xmm3 -; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm15 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm10 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm12 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm12 = -(xmm6 * xmm12) + xmm0 -; CHECK-NEXT: vmulss %xmm10, %xmm12, %xmm10 +; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmovss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm2 * xmm7) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm7, %xmm3 +; CHECK-NEXT: vmulss %xmm0, %xmm1, %xmm4 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm7 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm14 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm14 = 
-(xmm8 * xmm14) + xmm0 +; CHECK-NEXT: vmulss %xmm7, %xmm14, %xmm7 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vmulss %xmm3, %xmm10, %xmm12 -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm6 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm7, %xmm14 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm3, %xmm15, %xmm1 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm8 * xmm3) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss %xmm4, %xmm7, %xmm2 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm5 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm1 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm8 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm5 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm3 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm5, %xmm1 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm4 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm3, %xmm1 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm2 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm6 * xmm2) + xmm0 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm14, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm4 -; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm3 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm12 * xmm3) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm1 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm6 +; CHECK-NEXT: vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm3 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm14 * xmm3) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm1 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 ; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload ; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero @@ -125,81 +124,81 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5, ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm3 * xmm2) + xmm0 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm2 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Reload ; CHECK-NEXT: # xmm10 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) 
+ xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm6, %xmm2 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm8 = -(xmm8 * mem) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm14 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm8, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm7, %xmm2 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm9 = -(xmm9 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm9, %xmm2 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm2 -; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm1 -; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmulss %xmm0, %xmm7, %xmm2 +; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm5 * xmm3) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm3 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm1 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm4 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm9 = -(xmm5 * xmm9) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm3 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm13 * xmm3) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm9, %xmm3 -; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 4-byte Folded Reload -; CHECK-NEXT: # xmm11 = -(xmm11 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm11, %xmm3 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm4 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm7 * xmm3) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm3, %xmm3 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm7 * xmm11) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm1 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm12 * xmm1) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm6 -; CHECK-NEXT: vmulss %xmm5, %xmm14, %xmm5 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm7 -; CHECK-NEXT: vmulss %xmm15, %xmm13, %xmm8 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm5, %xmm1 +; 
CHECK-NEXT: vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 4-byte Folded Reload +; CHECK-NEXT: # xmm13 = -(xmm13 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm13, %xmm1 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm5 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm7 +; CHECK-NEXT: vmulss %xmm6, %xmm15, %xmm6 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm8 +; CHECK-NEXT: vmulss %xmm4, %xmm12, %xmm4 ; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm8 = -(xmm11 * xmm8) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm11 * xmm4) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm9 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm9 = -(xmm11 * xmm9) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm11 * xmm7) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm11 * xmm6) + xmm0 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm11 * xmm5) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm10 -; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm11 +; CHECK-NEXT: vmulss %xmm0, %xmm14, %xmm11 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm11, %xmm11 -; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm12 * xmm11) + xmm0 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm12 * xmm10) + xmm0 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm7 = (xmm13 * xmm7) - xmm0 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm0 -; CHECK-NEXT: vmulss %xmm0, %xmm8, %xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm13 * xmm11) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm13 * xmm10) + xmm0 +; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm8 = (xmm12 * xmm8) - xmm0 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vmulss %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm0 -; CHECK-NEXT: vmulss %xmm0, %xmm6, %xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm1 +; CHECK-NEXT: vmulss %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm1 +; CHECK-NEXT: vmulss %xmm5, %xmm6, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm10, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm7, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm8, %xmm1 ; CHECK-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovss %xmm0, (%rdi) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr47857.ll b/llvm/test/CodeGen/X86/pr47857.ll index 419e839a5d974a..d6bd530402919c 100644 --- a/llvm/test/CodeGen/X86/pr47857.ll +++ b/llvm/test/CodeGen/X86/pr47857.ll @@ -7,30 +7,30 @@ define void @PR47857(ptr noalias nocapture writeonly sret(%"struct.std::array") ; CHECK-LABEL: PR47857: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq (%rdx), %r9 -; CHECK-NEXT: movq 8(%rdx), %rcx -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: addq (%rsi), %r9 -; CHECK-NEXT: adcq 8(%rsi), %rcx -; CHECK-NEXT: movq 16(%rdx), %r8 -; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: movq (%rdx), %rcx +; CHECK-NEXT: movq 8(%rdx), %rdi +; CHECK-NEXT: 
xorl %r8d, %r8d +; CHECK-NEXT: addq (%rsi), %rcx +; CHECK-NEXT: adcq 8(%rsi), %rdi +; CHECK-NEXT: movq 16(%rdx), %r9 +; CHECK-NEXT: adcq 16(%rsi), %r9 ; CHECK-NEXT: movq 24(%rdx), %rdx ; CHECK-NEXT: adcq 24(%rsi), %rdx -; CHECK-NEXT: sbbq %rdi, %rdi -; CHECK-NEXT: andl $38, %edi -; CHECK-NEXT: addq %rdi, %r9 -; CHECK-NEXT: adcq $0, %rcx -; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: sbbq %r8, %r8 +; CHECK-NEXT: andl $38, %r8d +; CHECK-NEXT: addq %r8, %rcx +; CHECK-NEXT: adcq $0, %rdi +; CHECK-NEXT: adcq $0, %r9 ; CHECK-NEXT: adcq $0, %rdx -; CHECK-NEXT: sbbq %rdi, %rdi -; CHECK-NEXT: andl $38, %edi -; CHECK-NEXT: addq %r9, %rdi -; CHECK-NEXT: adcq $0, %rcx -; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: sbbq %r8, %r8 +; CHECK-NEXT: andl $38, %r8d +; CHECK-NEXT: addq %rcx, %r8 +; CHECK-NEXT: adcq $0, %rdi +; CHECK-NEXT: adcq $0, %r9 ; CHECK-NEXT: adcq $0, %rdx -; CHECK-NEXT: movq %rdi, (%rax) -; CHECK-NEXT: movq %rcx, 8(%rax) -; CHECK-NEXT: movq %r8, 16(%rax) +; CHECK-NEXT: movq %r8, (%rax) +; CHECK-NEXT: movq %rdi, 8(%rax) +; CHECK-NEXT: movq %r9, 16(%rax) ; CHECK-NEXT: movq %rdx, 24(%rax) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll index a6ae7ce5ccd15d..d28406ee682212 100644 --- a/llvm/test/CodeGen/X86/pr57340.ll +++ b/llvm/test/CodeGen/X86/pr57340.ll @@ -13,29 +13,29 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NEXT: vmovdqu (%rax), %xmm5 -; CHECK-NEXT: vpextrw $0, %xmm5, %eax +; CHECK-NEXT: vmovdqu (%rax), %xmm6 +; CHECK-NEXT: vpextrw $0, %xmm6, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm0, %xmm2 +; CHECK-NEXT: vcvtph2ps %xmm2, %xmm3 +; CHECK-NEXT: vucomiss %xmm0, %xmm3 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm3 -; CHECK-NEXT: vpextrw $0, %xmm3, %eax +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm2 +; CHECK-NEXT: vpextrw $0, %xmm2, %eax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm3 -; CHECK-NEXT: vpsrld $16, %xmm5, %xmm4 +; CHECK-NEXT: vmovd %eax, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm6, %xmm4 ; CHECK-NEXT: vpextrw $0, %xmm4, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm4 ; CHECK-NEXT: setne %al ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: vcvtph2ps %xmm3, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm4, %xmm3 +; CHECK-NEXT: vcvtph2ps %xmm2, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm4, %xmm2 ; CHECK-NEXT: kmovw %eax, %k0 -; CHECK-NEXT: vucomiss %xmm6, %xmm3 +; CHECK-NEXT: vucomiss %xmm5, %xmm2 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -68,13 +68,13 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm4 -; CHECK-NEXT: vcvtph2ps %xmm4, %xmm6 -; CHECK-NEXT: vpsrlq $48, %xmm5, %xmm4 +; CHECK-NEXT: vcvtph2ps %xmm4, %xmm5 +; CHECK-NEXT: vpsrlq $48, %xmm6, %xmm4 ; CHECK-NEXT: vpextrw $0, %xmm4, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm4 ; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4 -; CHECK-NEXT: vucomiss %xmm6, %xmm4 +; CHECK-NEXT: vucomiss %xmm5, %xmm4 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -85,13 +85,13 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-17, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; 
CHECK-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] -; CHECK-NEXT: vpextrw $0, %xmm6, %eax +; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; CHECK-NEXT: vpextrw $0, %xmm5, %eax ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 -; CHECK-NEXT: vucomiss %xmm6, %xmm0 +; CHECK-NEXT: vmovd %eax, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 +; CHECK-NEXT: vucomiss %xmm5, %xmm0 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -102,18 +102,18 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-33, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm6, %eax +; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrw $0, %xmm5, %eax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm6, %xmm7 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm6, %eax +; CHECK-NEXT: vmovd %eax, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm5, %xmm7 +; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrw $0, %xmm5, %eax ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 -; CHECK-NEXT: vucomiss %xmm7, %xmm6 +; CHECK-NEXT: vmovd %eax, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 +; CHECK-NEXT: vucomiss %xmm7, %xmm5 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -147,12 +147,12 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm7 ; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm5, %eax +; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrw $0, %xmm6, %eax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm5 -; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 -; CHECK-NEXT: vucomiss %xmm7, %xmm5 +; CHECK-NEXT: vmovd %eax, %xmm6 +; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 +; CHECK-NEXT: vucomiss %xmm7, %xmm6 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -169,7 +169,7 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm7 ; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7 -; CHECK-NEXT: vucomiss %xmm7, %xmm2 +; CHECK-NEXT: vucomiss %xmm7, %xmm3 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -181,12 +181,12 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movw $-513, %ax # imm = 0xFDFF ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm2 -; CHECK-NEXT: vpextrw $0, %xmm2, %eax +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm3 +; CHECK-NEXT: vpextrw $0, %xmm3, %eax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm2, %xmm3 +; CHECK-NEXT: vmovd %eax, %xmm3 +; CHECK-NEXT: vcvtph2ps %xmm3, %xmm3 +; CHECK-NEXT: vucomiss %xmm3, %xmm2 ; 
CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -254,7 +254,7 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm2 ; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm2, %xmm6 +; CHECK-NEXT: vucomiss %xmm2, %xmm5 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -286,7 +286,7 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NEXT: kshiftrw $1, %k0, %k0 -; CHECK-NEXT: vucomiss %xmm0, %xmm5 +; CHECK-NEXT: vucomiss %xmm0, %xmm6 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl diff --git a/llvm/test/CodeGen/X86/pr59258.ll b/llvm/test/CodeGen/X86/pr59258.ll index 61ddb24eaaf87d..fb2d219556632f 100644 --- a/llvm/test/CodeGen/X86/pr59258.ll +++ b/llvm/test/CodeGen/X86/pr59258.ll @@ -4,7 +4,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-LABEL: cvt_and_clamp2: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $104, %rsp +; CHECK-NEXT: subq $120, %rsp ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm1, %xmm0 @@ -21,7 +21,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __truncsfhf2@PLT @@ -62,13 +62,13 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: callq fmaxf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss (%rsp), %xmm0 # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: callq fmaxf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT @@ -110,7 +110,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: callq fminf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movd (%rsp), %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -157,7 +157,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: addq $104, %rsp +; CHECK-NEXT: addq $120, %rsp ; CHECK-NEXT: retq %2 = fptrunc <8 x float> %0 to <8 x half> %3 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> zeroinitializer, <8 x half> %2) diff --git a/llvm/test/CodeGen/X86/pr62653.ll 
b/llvm/test/CodeGen/X86/pr62653.ll index 0a03c1831f6579..a05abf07a5cea9 100644 --- a/llvm/test/CodeGen/X86/pr62653.ll +++ b/llvm/test/CodeGen/X86/pr62653.ll @@ -24,38 +24,38 @@ define <64 x i4> @pr62653(<64 x i4> %a0) nounwind { ; CHECK-NEXT: andl $15, %r10d ; CHECK-NEXT: shlq $12, %r10 ; CHECK-NEXT: orq %rdi, %r10 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-NEXT: andl $15, %r11d -; CHECK-NEXT: shlq $16, %r11 -; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi -; CHECK-NEXT: shlq $20, %rdi -; CHECK-NEXT: orq %r11, %rdi +; CHECK-NEXT: shlq $16, %rdi +; CHECK-NEXT: orq %r10, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $24, %r10 +; CHECK-NEXT: shlq $20, %r10 +; CHECK-NEXT: orq %rdi, %r10 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: shlq $24, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-NEXT: andl $15, %r11d ; CHECK-NEXT: shlq $28, %r11 -; CHECK-NEXT: orq %r10, %r11 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $32, %r10 -; CHECK-NEXT: orq %r11, %r10 +; CHECK-NEXT: orq %rdi, %r11 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: shlq $32, %rdi +; CHECK-NEXT: orq %r11, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-NEXT: andl $15, %r11d ; CHECK-NEXT: shlq $36, %r11 -; CHECK-NEXT: orq %r10, %r11 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $40, %r10 -; CHECK-NEXT: orq %r11, %r10 +; CHECK-NEXT: orq %rdi, %r11 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: shlq $40, %rdi +; CHECK-NEXT: orq %r11, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-NEXT: andl $15, %r11d ; CHECK-NEXT: shlq $44, %r11 -; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: orq %rdi, %r11 +; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: shlq $48, %rdi @@ -82,26 +82,26 @@ define <64 x i4> @pr62653(<64 x i4> %a0) nounwind { ; CHECK-NEXT: andl $15, %ecx ; CHECK-NEXT: shlq $20, %rcx ; CHECK-NEXT: orq %r9, %rcx -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $24, %rsi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: shlq $28, %rdx -; CHECK-NEXT: orq %rsi, %rdx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: shlq $32, %rcx +; CHECK-NEXT: shlq $24, %rdx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $36, %rsi +; CHECK-NEXT: shlq $28, %rsi +; CHECK-NEXT: orq %rdx, %rsi ; CHECK-NEXT: orq %rcx, %rsi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: shlq $32, %rcx +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: andl $15, %edx +; CHECK-NEXT: shlq $36, %rdx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: andl $15, %ecx ; CHECK-NEXT: shlq $40, %rcx -; CHECK-NEXT: orq %rsi, %rcx ; CHECK-NEXT: orq %rdx, %rcx +; CHECK-NEXT: orq %rsi, %rcx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: andl $15, %edx ; CHECK-NEXT: shlq $44, %rdx diff --git a/llvm/test/CodeGen/X86/pr63475.ll b/llvm/test/CodeGen/X86/pr63475.ll index 0052688b5aa130..4584495e1fdac6 100644 --- a/llvm/test/CodeGen/X86/pr63475.ll +++ 
b/llvm/test/CodeGen/X86/pr63475.ll @@ -44,14 +44,14 @@ define void @callee(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, <7 x i ; CHECK-NEXT: movl 112(%rsp), %ebp ; CHECK-NEXT: movd %ebp, %xmm1 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: movl 104(%rsp), %r15d -; CHECK-NEXT: movd %r15d, %xmm0 +; CHECK-NEXT: movl 104(%rsp), %r14d +; CHECK-NEXT: movd %r14d, %xmm0 ; CHECK-NEXT: movl 96(%rsp), %edi ; CHECK-NEXT: movd %edi, %xmm2 ; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; CHECK-NEXT: movl 136(%rsp), %r14d -; CHECK-NEXT: movd %r14d, %xmm0 +; CHECK-NEXT: movl 136(%rsp), %r15d +; CHECK-NEXT: movd %r15d, %xmm0 ; CHECK-NEXT: movl 128(%rsp), %r12d ; CHECK-NEXT: movd %r12d, %xmm1 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -60,7 +60,7 @@ define void @callee(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, <7 x i ; CHECK-NEXT: movq %xmm1, 28(%rsp) ; CHECK-NEXT: movdqu %xmm2, 12(%rsp) ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: movl %r15d, %edi +; CHECK-NEXT: movl %r14d, %edi ; CHECK-NEXT: callq use@PLT ; CHECK-NEXT: movl %ebp, %edi ; CHECK-NEXT: callq use@PLT @@ -68,7 +68,7 @@ define void @callee(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, <7 x i ; CHECK-NEXT: callq use@PLT ; CHECK-NEXT: movl %r12d, %edi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: movl %r14d, %edi +; CHECK-NEXT: movl %r15d, %edi ; CHECK-NEXT: callq use@PLT ; CHECK-NEXT: movl %r13d, %edi ; CHECK-NEXT: callq use@PLT diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index d2da4c0f3e86ad..d197f4dd0d7d4d 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -840,8 +840,8 @@ vector.ph: define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE2OR3-LABEL: test14: ; SSE2OR3: # %bb.0: # %vector.ph -; SSE2OR3-NEXT: pxor %xmm6, %xmm6 -; SSE2OR3-NEXT: movdqa %xmm0, %xmm5 +; SSE2OR3-NEXT: pxor %xmm5, %xmm5 +; SSE2OR3-NEXT: movdqa %xmm0, %xmm6 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm7 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm9 @@ -856,27 +856,27 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE2OR3-NEXT: packuswb %xmm3, %xmm1 ; SSE2OR3-NEXT: psubb %xmm0, %xmm1 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 -; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2OR3-NEXT: movdqa %xmm2, %xmm0 -; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; SSE2OR3-NEXT: movdqa %xmm5, %xmm3 -; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE2OR3-NEXT: pxor %xmm6, %xmm7 -; SSE2OR3-NEXT: por %xmm6, 
%xmm5 -; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2OR3-NEXT: pxor %xmm6, %xmm8 -; SSE2OR3-NEXT: por %xmm6, %xmm3 +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE2OR3-NEXT: movdqa %xmm6, %xmm3 +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE2OR3-NEXT: pxor %xmm5, %xmm7 +; SSE2OR3-NEXT: por %xmm5, %xmm6 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2OR3-NEXT: pxor %xmm5, %xmm8 +; SSE2OR3-NEXT: por %xmm5, %xmm3 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE2OR3-NEXT: packssdw %xmm5, %xmm3 -; SSE2OR3-NEXT: pxor %xmm6, %xmm9 -; SSE2OR3-NEXT: por %xmm6, %xmm2 +; SSE2OR3-NEXT: packssdw %xmm6, %xmm3 +; SSE2OR3-NEXT: pxor %xmm5, %xmm9 +; SSE2OR3-NEXT: por %xmm5, %xmm2 ; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE2OR3-NEXT: pxor %xmm6, %xmm4 -; SSE2OR3-NEXT: por %xmm6, %xmm0 +; SSE2OR3-NEXT: pxor %xmm5, %xmm4 +; SSE2OR3-NEXT: por %xmm5, %xmm0 ; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE2OR3-NEXT: packssdw %xmm2, %xmm0 ; SSE2OR3-NEXT: packsswb %xmm3, %xmm0 @@ -1669,53 +1669,53 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; ; SSE41-LABEL: psubus_8i64_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535] -; SSE41-NEXT: movapd %xmm8, %xmm10 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm9 = [65535,65535] +; SSE41-NEXT: movapd %xmm9, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm8, %xmm4 +; SSE41-NEXT: movapd %xmm9, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 ; SSE41-NEXT: packusdw %xmm10, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; 
SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm8, %xmm3 +; SSE41-NEXT: movapd %xmm9, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm7, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 -; SSE41-NEXT: packusdw %xmm3, %xmm8 -; SSE41-NEXT: packusdw %xmm4, %xmm8 -; SSE41-NEXT: psubusw %xmm8, %xmm5 +; SSE41-NEXT: pxor %xmm1, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: packusdw %xmm3, %xmm9 +; SSE41-NEXT: packusdw %xmm4, %xmm9 +; SSE41-NEXT: psubusw %xmm9, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: psubus_8i64_max: @@ -2770,53 +2770,53 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-LABEL: test33: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm9, %xmm11 +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [4294967295,4294967295] +; SSE41-NEXT: movapd %xmm10, %xmm11 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm10, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] ; SSE41-NEXT: pmaxud %xmm3, %xmm7 ; SSE41-NEXT: psubd %xmm3, %xmm7 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 +; 
SSE41-NEXT: movapd %xmm10, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pxor %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pand %xmm9, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm9, %xmm1 -; SSE41-NEXT: psubd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE41-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[0,2] +; SSE41-NEXT: pmaxud %xmm10, %xmm1 +; SSE41-NEXT: psubd %xmm10, %xmm1 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: retq ; @@ -2992,58 +2992,58 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; ; SSE41-LABEL: test34: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pand %xmm0, %xmm7 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm9, %xmm11 +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [4294967295,4294967295] +; SSE41-NEXT: movapd %xmm10, %xmm11 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm10, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] -; SSE41-NEXT: pmaxud %xmm3, %xmm6 -; SSE41-NEXT: psubd %xmm3, %xmm6 +; SSE41-NEXT: pmaxud %xmm3, %xmm7 +; SSE41-NEXT: psubd %xmm3, %xmm7 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: movapd 
%xmm10, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm9, %xmm1 -; SSE41-NEXT: psubd %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pand %xmm9, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE41-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[0,2] +; SSE41-NEXT: pmaxud %xmm10, %xmm1 +; SSE41-NEXT: psubd %xmm10, %xmm1 +; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test34: diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll index 7e9bbc55564248..152bf01bd701c2 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -763,32 +763,32 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; SSE-LABEL: v8f32_two_step: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: rcpps %xmm0, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: subps %xmm4, %xmm5 ; SSE-NEXT: mulps %xmm3, %xmm5 ; SSE-NEXT: addps %xmm3, %xmm5 ; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: subps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm5, %xmm3 ; SSE-NEXT: addps %xmm5, %xmm3 -; SSE-NEXT: rcpps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: rcpps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: mulps %xmm0, %xmm4 -; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: subps %xmm4, %xmm5 ; SSE-NEXT: mulps %xmm0, %xmm5 ; SSE-NEXT: addps %xmm0, %xmm5 -; SSE-NEXT: mulps %xmm5, %xmm2 -; SSE-NEXT: subps %xmm2, %xmm1 ; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: addps %xmm5, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm2 +; SSE-NEXT: mulps %xmm5, %xmm2 +; SSE-NEXT: addps %xmm5, %xmm2 ; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_two_step: @@ -971,34 +971,34 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_one_step: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: rcpps %xmm0, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm5 -; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: subps %xmm5, %xmm0 ; SSE-NEXT: mulps %xmm6, %xmm0 -; SSE-NEXT: addps %xmm6, %xmm0 -; SSE-NEXT: rcpps %xmm1, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: subps %xmm1, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: subps %xmm0, %xmm5 ; SSE-NEXT: mulps %xmm6, %xmm5 ; SSE-NEXT: addps %xmm6, %xmm5 -; SSE-NEXT: rcpps %xmm2, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm6 -; 
SSE-NEXT: subps %xmm2, %xmm6 -; SSE-NEXT: mulps %xmm1, %xmm6 -; SSE-NEXT: addps %xmm1, %xmm6 -; SSE-NEXT: rcpps %xmm4, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm4 -; SSE-NEXT: subps %xmm4, %xmm3 -; SSE-NEXT: mulps %xmm1, %xmm3 -; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: rcpps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: subps %xmm1, %xmm6 +; SSE-NEXT: mulps %xmm0, %xmm6 +; SSE-NEXT: addps %xmm0, %xmm6 +; SSE-NEXT: rcpps %xmm2, %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: subps %xmm2, %xmm7 +; SSE-NEXT: mulps %xmm0, %xmm7 +; SSE-NEXT: addps %xmm0, %xmm7 +; SSE-NEXT: rcpps %xmm3, %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm3 +; SSE-NEXT: subps %xmm3, %xmm4 +; SSE-NEXT: mulps %xmm0, %xmm4 +; SSE-NEXT: addps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v16f32_one_step: @@ -1107,58 +1107,58 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { ; SSE-LABEL: v16f32_two_step: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: rcpps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: mulps %xmm0, %xmm6 -; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: subps %xmm6, %xmm7 ; SSE-NEXT: mulps %xmm0, %xmm7 ; SSE-NEXT: addps %xmm0, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: subps %xmm5, %xmm0 ; SSE-NEXT: mulps %xmm7, %xmm0 ; SSE-NEXT: addps %xmm7, %xmm0 -; SSE-NEXT: rcpps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: mulps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm3, %xmm7 -; SSE-NEXT: subps %xmm6, %xmm7 -; SSE-NEXT: mulps %xmm1, %xmm7 -; SSE-NEXT: addps %xmm1, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: subps %xmm5, %xmm1 -; SSE-NEXT: mulps %xmm7, %xmm1 -; SSE-NEXT: addps %xmm7, %xmm1 -; SSE-NEXT: rcpps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: rcpps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm6 ; SSE-NEXT: mulps %xmm5, %xmm6 -; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: subps %xmm6, %xmm7 ; SSE-NEXT: mulps %xmm5, %xmm7 ; SSE-NEXT: addps %xmm5, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: subps %xmm2, %xmm5 +; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: subps %xmm1, %xmm5 ; SSE-NEXT: mulps %xmm7, %xmm5 ; SSE-NEXT: addps %xmm7, %xmm5 -; SSE-NEXT: rcpps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: mulps %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: rcpps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: mulps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: subps %xmm6, %xmm7 -; SSE-NEXT: mulps %xmm2, %xmm7 -; SSE-NEXT: addps %xmm2, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm4 -; SSE-NEXT: subps %xmm4, %xmm3 +; SSE-NEXT: mulps %xmm1, %xmm7 +; SSE-NEXT: addps %xmm1, %xmm7 +; SSE-NEXT: mulps %xmm7, %xmm2 +; SSE-NEXT: movaps %xmm4, 
%xmm6 +; SSE-NEXT: subps %xmm2, %xmm6 +; SSE-NEXT: mulps %xmm7, %xmm6 +; SSE-NEXT: addps %xmm7, %xmm6 +; SSE-NEXT: rcpps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: subps %xmm2, %xmm7 +; SSE-NEXT: mulps %xmm1, %xmm7 +; SSE-NEXT: addps %xmm1, %xmm7 ; SSE-NEXT: mulps %xmm7, %xmm3 -; SSE-NEXT: addps %xmm7, %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: subps %xmm3, %xmm4 +; SSE-NEXT: mulps %xmm7, %xmm4 +; SSE-NEXT: addps %xmm7, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v16f32_two_step: diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll index 2a5e46bba2c009..66f7824ec36ec8 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -1118,42 +1118,42 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_one_step2: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: rcpps %xmm0, %xmm5 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: mulps %xmm0, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: subps %xmm1, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: addps %xmm6, %xmm0 -; SSE-NEXT: rcpps %xmm4, %xmm5 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: mulps %xmm1, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm4 -; SSE-NEXT: subps %xmm4, %xmm1 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: addps %xmm6, %xmm1 -; SSE-NEXT: rcpps %xmm2, %xmm5 -; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1] +; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: mulps %xmm4, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm2 -; SSE-NEXT: subps %xmm2, %xmm4 +; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: subps %xmm0, %xmm4 ; SSE-NEXT: mulps %xmm5, %xmm4 ; SSE-NEXT: addps %xmm6, %xmm4 -; SSE-NEXT: rcpps %xmm3, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: rcpps %xmm1, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SSE-NEXT: movaps %xmm0, %xmm6 ; SSE-NEXT: mulps %xmm5, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm3 -; SSE-NEXT: subps %xmm3, %xmm5 -; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm5 ; SSE-NEXT: addps %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: rcpps %xmm2, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm6 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: subps %xmm2, %xmm6 +; SSE-NEXT: mulps %xmm0, %xmm6 +; SSE-NEXT: addps %xmm1, %xmm6 +; SSE-NEXT: rcpps %xmm3, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm7 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: mulps %xmm1, %xmm3 +; SSE-NEXT: subps %xmm3, %xmm7 +; SSE-NEXT: mulps %xmm0, %xmm7 +; SSE-NEXT: addps %xmm1, %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm7, %xmm3 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v16f32_one_step2: @@ -1285,19 +1285,19 @@ define <16 x 
float> @v16f32_one_step2(<16 x float> %x) #1 { define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_one_step_2_divs: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm0, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: rcpps %xmm0, %xmm5 +; SSE-NEXT: mulps %xmm5, %xmm0 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: subps %xmm0, %xmm5 -; SSE-NEXT: mulps %xmm6, %xmm5 -; SSE-NEXT: addps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: subps %xmm0, %xmm6 +; SSE-NEXT: mulps %xmm5, %xmm6 +; SSE-NEXT: addps %xmm5, %xmm6 ; SSE-NEXT: rcpps %xmm1, %xmm0 ; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: subps %xmm1, %xmm6 -; SSE-NEXT: mulps %xmm0, %xmm6 -; SSE-NEXT: addps %xmm0, %xmm6 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: subps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm5 +; SSE-NEXT: addps %xmm0, %xmm5 ; SSE-NEXT: rcpps %xmm2, %xmm0 ; SSE-NEXT: mulps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm4, %xmm7 @@ -1314,11 +1314,11 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { ; SSE-NEXT: movaps {{.*#+}} xmm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1] ; SSE-NEXT: mulps %xmm7, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: mulps %xmm5, %xmm1 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: mulps %xmm5, %xmm1 ; SSE-NEXT: mulps %xmm7, %xmm2 ; SSE-NEXT: mulps %xmm4, %xmm3 ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 8f046a4f5aea5b..db479e4731fe9a 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -223,17 +223,17 @@ define i16 @no_extract_mul(i16 %i) nounwind { define i8 @no_extract_udiv(i8 %i) nounwind { ; X86-LABEL: no_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $171, %eax, %ecx -; X86-NEXT: imull $79, %eax, %edx -; X86-NEXT: subb %dh, %al -; X86-NEXT: shrb %al -; X86-NEXT: addb %dh, %al -; X86-NEXT: shrb $5, %al -; X86-NEXT: shlb $3, %ch -; X86-NEXT: orb %al, %ch -; X86-NEXT: andb $-9, %ch -; X86-NEXT: movb %ch, %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull $171, %ecx, %eax +; X86-NEXT: imull $79, %ecx, %edx +; X86-NEXT: subb %dh, %cl +; X86-NEXT: shrb %cl +; X86-NEXT: addb %dh, %cl +; X86-NEXT: shrb $5, %cl +; X86-NEXT: shlb $3, %ah +; X86-NEXT: orb %cl, %ah +; X86-NEXT: andb $-9, %ah +; X86-NEXT: movb %ah, %al ; X86-NEXT: retl ; ; X64-LABEL: no_extract_udiv: diff --git a/llvm/test/CodeGen/X86/rotate.ll b/llvm/test/CodeGen/X86/rotate.ll index ea32edba628228..c099a4246321e9 100644 --- a/llvm/test/CodeGen/X86/rotate.ll +++ b/llvm/test/CodeGen/X86/rotate.ll @@ -655,27 +655,27 @@ define i64 @truncated_rot(i64 %x, i32 %amt) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl -; X86-NEXT: movl $0, %edi +; X86-NEXT: movl $0, %ebx ; X86-NEXT: jne .LBB28_2 ; X86-NEXT: # %bb.1: # %entry -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: .LBB28_2: # %entry ; X86-NEXT: movb $64, %dl ; X86-NEXT: subb 
%cl, %dl -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: shrdl %cl, %ebx, %esi +; X86-NEXT: shrdl %cl, %edi, %esi ; X86-NEXT: testb $32, %dl ; X86-NEXT: jne .LBB28_4 ; X86-NEXT: # %bb.3: # %entry ; X86-NEXT: movl %esi, %eax ; X86-NEXT: .LBB28_4: # %entry -; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 3d3e935045475e..02660d04ffd2cd 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -309,10 +309,10 @@ middle.block: define dso_local i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: .p2align 4, 0x90 @@ -323,7 +323,7 @@ define dso_local i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: movdqa a+1040(%rax), %xmm5 ; SSE2-NEXT: psadbw b+1040(%rax), %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: paddd %xmm5, %xmm4 ; SSE2-NEXT: movdqa a+1056(%rax), %xmm5 ; SSE2-NEXT: psadbw b+1056(%rax), %xmm5 ; SSE2-NEXT: paddd %xmm5, %xmm2 @@ -333,15 +333,15 @@ define dso_local i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: addq $64, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: paddd %xmm5, %xmm5 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm0 ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: paddd %xmm5, %xmm4 +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: paddd %xmm2, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm5 @@ -564,23 +564,23 @@ define dso_local i32 @sad_2i8() nounwind { ; ; AVX-LABEL: sad_2i8: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB3_1: # %vector.body ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ; AVX-NEXT: addq $2, %rax ; AVX-NEXT: jne .LBB3_1 ; AVX-NEXT: # %bb.2: # %middle.block -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/sadd_sat.ll b/llvm/test/CodeGen/X86/sadd_sat.ll index 
5b9a42d1f0d91f..83e2824c81f0ec 100644 --- a/llvm/test/CodeGen/X86/sadd_sat.ll +++ b/llvm/test/CodeGen/X86/sadd_sat.ll @@ -184,23 +184,23 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovol %edi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%esi,%eax), %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: addl $-2147483648, %ebx # imm = 0x80000000 +; X86-NEXT: leal (%esi,%eax), %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: addl $-2147483648, %edi # imm = 0x80000000 ; X86-NEXT: addl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovol %ebx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmovol %edi, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%edi,%eax), %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: addl $-2147483648, %ebx # imm = 0x80000000 -; X86-NEXT: addl %eax, %edi +; X86-NEXT: leal (%ebx,%eax), %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: addl $-2147483648, %edi # imm = 0x80000000 +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovol %ebx, %edi +; X86-NEXT: cmovol %edi, %ebx ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index 45a8a6fd5449af..f2dfed32758998 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -1008,34 +1008,34 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE41-LABEL: v16i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm3, %xmm11 -; SSE41-NEXT: movdqa %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm0, %xmm10 -; SSE41-NEXT: paddd %xmm4, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm10 +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: paddd %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: movdqa %xmm9, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm10 -; SSE41-NEXT: movdqa %xmm9, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: paddd %xmm5, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm9 -; SSE41-NEXT: pxor %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE41-NEXT: pxor %xmm5, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: movdqa %xmm10, %xmm2 ; SSE41-NEXT: paddd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 -; SSE41-NEXT: pxor %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE41-NEXT: pxor %xmm6, %xmm10 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm11, %xmm3 ; SSE41-NEXT: paddd %xmm7, %xmm3 @@ -1046,7 +1046,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE41-NEXT: pxor 
%xmm4, %xmm5 ; SSE41-NEXT: movdqa %xmm11, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: movaps %xmm10, %xmm0 +; SSE41-NEXT: movaps %xmm9, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i32: @@ -1247,106 +1247,108 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE2-LABEL: v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] ; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm7 +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: paddq %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] -; SSSE3-NEXT: pxor %xmm6, %xmm0 -; SSSE3-NEXT: paddq %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm6, %xmm5 -; SSSE3-NEXT: movdqa %xmm0, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: paddq %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn %xmm4, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] ; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm0, %xmm7 +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSSE3-NEXT: paddq %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm1, %xmm6 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 ; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pxor 
%xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm3 ; SSSE3-NEXT: pandn %xmm1, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i64: @@ -1451,21 +1453,21 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm8 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm9, %xmm0 ; SSE2-NEXT: paddq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm10 ; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE2-NEXT: pand %xmm12, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm11, %xmm4 @@ -1477,10 +1479,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 ; SSE2-NEXT: paddq %xmm5, %xmm8 ; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 ; SSE2-NEXT: movdqa %xmm1, %xmm12 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] @@ -1500,10 +1502,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pandn %xmm8, %xmm5 ; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 ; SSE2-NEXT: paddq %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm5 ; SSE2-NEXT: movdqa %xmm4, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2] @@ -1523,26 +1525,26 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pandn %xmm2, %xmm6 ; SSE2-NEXT: por %xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm2 ; SSE2-NEXT: paddq %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm10 +; SSE2-NEXT: pxor %xmm3, %xmm9 ; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pxor %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm2, 
%xmm10 +; SSE2-NEXT: pxor %xmm5, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: pxor %xmm11, %xmm5 -; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm5 +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm10 +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: movdqa %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: retq @@ -1551,21 +1553,21 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa %xmm1, %xmm8 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm9, %xmm0 ; SSSE3-NEXT: paddq %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm9 -; SSSE3-NEXT: pxor %xmm10, %xmm9 +; SSSE3-NEXT: movdqa %xmm1, %xmm10 +; SSSE3-NEXT: pxor %xmm9, %xmm10 ; SSSE3-NEXT: movdqa %xmm0, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSSE3-NEXT: pand %xmm12, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm11, %xmm4 @@ -1577,10 +1579,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pandn %xmm1, %xmm4 ; SSSE3-NEXT: por %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm8, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 ; SSSE3-NEXT: paddq %xmm5, %xmm8 ; SSSE3-NEXT: movdqa %xmm8, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm4 ; SSSE3-NEXT: movdqa %xmm1, %xmm12 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 ; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] @@ -1600,10 +1602,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pandn %xmm8, %xmm5 ; SSSE3-NEXT: por %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm4 ; SSSE3-NEXT: paddq %xmm6, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm5 ; SSSE3-NEXT: movdqa %xmm4, %xmm8 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2] @@ -1623,26 +1625,26 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pandn %xmm2, %xmm6 ; SSSE3-NEXT: por %xmm6, %xmm4 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm2 ; SSSE3-NEXT: paddq %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm3, %xmm10 +; SSSE3-NEXT: pxor %xmm3, %xmm9 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9 -; 
SSSE3-NEXT: pxor %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSSE3-NEXT: pxor %xmm5, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm5 ; SSSE3-NEXT: pxor %xmm11, %xmm5 -; SSSE3-NEXT: pand %xmm9, %xmm5 -; SSSE3-NEXT: pandn %xmm3, %xmm9 -; SSSE3-NEXT: por %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm10 +; SSSE3-NEXT: por %xmm10, %xmm5 ; SSSE3-NEXT: movdqa %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sbb-false-dep.ll b/llvm/test/CodeGen/X86/sbb-false-dep.ll index 34a92cb58692b5..d84b9856b8d82f 100644 --- a/llvm/test/CodeGen/X86/sbb-false-dep.ll +++ b/llvm/test/CodeGen/X86/sbb-false-dep.ll @@ -58,34 +58,34 @@ define i32 @mallocbench_gs(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 n ; IDIOM-NEXT: pushq %r14 ; IDIOM-NEXT: pushq %r12 ; IDIOM-NEXT: pushq %rbx -; IDIOM-NEXT: movl %r8d, %ebp -; IDIOM-NEXT: movl %ecx, %r14d -; IDIOM-NEXT: movl %edx, %r15d -; IDIOM-NEXT: movq %rsi, %rbx +; IDIOM-NEXT: movl %r8d, %ebx +; IDIOM-NEXT: movl %ecx, %ebp +; IDIOM-NEXT: movl %edx, %r14d +; IDIOM-NEXT: movq %rsi, %r15 ; IDIOM-NEXT: movq %rdi, %r12 ; IDIOM-NEXT: movq (%rsi), %rdi ; IDIOM-NEXT: movq 8(%rsi), %rsi -; IDIOM-NEXT: movq %rbx, %rdx +; IDIOM-NEXT: movq %r15, %rdx ; IDIOM-NEXT: callq foo1@PLT -; IDIOM-NEXT: movq 8(%rbx), %rax +; IDIOM-NEXT: movq 8(%r15), %rax ; IDIOM-NEXT: movq (%rax), %rax -; IDIOM-NEXT: movl %ebp, %ecx +; IDIOM-NEXT: movl %ebx, %ecx ; IDIOM-NEXT: negl %ecx ; IDIOM-NEXT: sbbq %r10, %r10 ; IDIOM-NEXT: orq %rax, %r10 -; IDIOM-NEXT: cmpl $1, %ebp +; IDIOM-NEXT: cmpl $1, %ebx ; IDIOM-NEXT: sbbq %r11, %r11 ; IDIOM-NEXT: orq %rax, %r11 ; IDIOM-NEXT: subq $8, %rsp ; IDIOM-NEXT: movq %r12, %rdi -; IDIOM-NEXT: movl %r15d, %esi -; IDIOM-NEXT: movl %r14d, %edx +; IDIOM-NEXT: movl %r14d, %esi +; IDIOM-NEXT: movl %ebp, %edx ; IDIOM-NEXT: xorl %ecx, %ecx ; IDIOM-NEXT: xorl %r8d, %r8d ; IDIOM-NEXT: xorl %r9d, %r9d ; IDIOM-NEXT: pushq %r11 ; IDIOM-NEXT: pushq %r10 -; IDIOM-NEXT: pushq %rbx +; IDIOM-NEXT: pushq %r15 ; IDIOM-NEXT: callq foo2@PLT ; IDIOM-NEXT: addq $32, %rsp ; IDIOM-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll index 0b3ef70d2beefd..932dc441945393 100644 --- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll +++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll @@ -219,35 +219,35 @@ define i256 @test2(i256 %a) nounwind { ; ILP: # %bb.0: ; ILP-NEXT: movq %rdi, %rax ; ILP-NEXT: xorl %edi, %edi -; ILP-NEXT: movq %rsi, %r11 -; ILP-NEXT: negq %r11 +; ILP-NEXT: movq %rsi, %r9 +; ILP-NEXT: negq %r9 ; ILP-NEXT: movl $0, %r10d ; ILP-NEXT: sbbq %rdx, %r10 -; ILP-NEXT: movl $0, %r9d -; ILP-NEXT: sbbq %rcx, %r9 +; ILP-NEXT: movl $0, %r11d +; ILP-NEXT: sbbq %rcx, %r11 ; ILP-NEXT: sbbq %r8, %rdi ; ILP-NEXT: andq %r8, %rdi ; ILP-NEXT: bsrq %rdi, %r8 ; ILP-NEXT: andq %rdx, %r10 ; ILP-NEXT: bsrq %r10, %rdx ; ILP-NEXT: xorq $63, %r8 -; ILP-NEXT: andq %rcx, %r9 -; ILP-NEXT: bsrq %r9, %rcx +; ILP-NEXT: andq %rcx, %r11 +; ILP-NEXT: bsrq %r11, %rcx ; ILP-NEXT: xorq $63, %rcx ; ILP-NEXT: addq $64, %rcx ; ILP-NEXT: testq %rdi, %rdi ; ILP-NEXT: cmovneq %r8, %rcx ; ILP-NEXT: xorq $63, %rdx -; ILP-NEXT: andq %rsi, %r11 +; ILP-NEXT: andq %rsi, %r9 ; ILP-NEXT: movl $127, %esi -; ILP-NEXT: bsrq %r11, %r8 +; ILP-NEXT: bsrq %r9, %r8 ; ILP-NEXT: cmoveq %rsi, %r8 ; ILP-NEXT: xorq $63, %r8 ; ILP-NEXT: addq $64, %r8 ; ILP-NEXT: testq %r10, %r10 ; ILP-NEXT: cmovneq %rdx, %r8 ; ILP-NEXT: subq $-128, %r8 
-; ILP-NEXT: orq %rdi, %r9 +; ILP-NEXT: orq %rdi, %r11 ; ILP-NEXT: cmovneq %rcx, %r8 ; ILP-NEXT: movq %r8, (%rax) ; ILP-NEXT: movq $0, 24(%rax) @@ -259,18 +259,18 @@ define i256 @test2(i256 %a) nounwind { ; HYBRID: # %bb.0: ; HYBRID-NEXT: movq %rdi, %rax ; HYBRID-NEXT: xorl %edi, %edi -; HYBRID-NEXT: movq %rsi, %r11 -; HYBRID-NEXT: negq %r11 +; HYBRID-NEXT: movq %rsi, %r9 +; HYBRID-NEXT: negq %r9 ; HYBRID-NEXT: movl $0, %r10d ; HYBRID-NEXT: sbbq %rdx, %r10 -; HYBRID-NEXT: movl $0, %r9d -; HYBRID-NEXT: sbbq %rcx, %r9 +; HYBRID-NEXT: movl $0, %r11d +; HYBRID-NEXT: sbbq %rcx, %r11 ; HYBRID-NEXT: sbbq %r8, %rdi ; HYBRID-NEXT: andq %r8, %rdi ; HYBRID-NEXT: bsrq %rdi, %r8 ; HYBRID-NEXT: xorq $63, %r8 -; HYBRID-NEXT: andq %rcx, %r9 -; HYBRID-NEXT: bsrq %r9, %rcx +; HYBRID-NEXT: andq %rcx, %r11 +; HYBRID-NEXT: bsrq %r11, %rcx ; HYBRID-NEXT: xorq $63, %rcx ; HYBRID-NEXT: addq $64, %rcx ; HYBRID-NEXT: testq %rdi, %rdi @@ -278,16 +278,16 @@ define i256 @test2(i256 %a) nounwind { ; HYBRID-NEXT: andq %rdx, %r10 ; HYBRID-NEXT: bsrq %r10, %rdx ; HYBRID-NEXT: xorq $63, %rdx -; HYBRID-NEXT: andq %rsi, %r11 +; HYBRID-NEXT: andq %rsi, %r9 ; HYBRID-NEXT: movl $127, %esi -; HYBRID-NEXT: bsrq %r11, %r8 +; HYBRID-NEXT: bsrq %r9, %r8 ; HYBRID-NEXT: cmoveq %rsi, %r8 ; HYBRID-NEXT: xorq $63, %r8 ; HYBRID-NEXT: addq $64, %r8 ; HYBRID-NEXT: testq %r10, %r10 ; HYBRID-NEXT: cmovneq %rdx, %r8 ; HYBRID-NEXT: subq $-128, %r8 -; HYBRID-NEXT: orq %rdi, %r9 +; HYBRID-NEXT: orq %rdi, %r11 ; HYBRID-NEXT: cmovneq %rcx, %r8 ; HYBRID-NEXT: movq %r8, (%rax) ; HYBRID-NEXT: movq $0, 24(%rax) @@ -299,18 +299,18 @@ define i256 @test2(i256 %a) nounwind { ; BURR: # %bb.0: ; BURR-NEXT: movq %rdi, %rax ; BURR-NEXT: xorl %edi, %edi -; BURR-NEXT: movq %rsi, %r11 -; BURR-NEXT: negq %r11 +; BURR-NEXT: movq %rsi, %r9 +; BURR-NEXT: negq %r9 ; BURR-NEXT: movl $0, %r10d ; BURR-NEXT: sbbq %rdx, %r10 -; BURR-NEXT: movl $0, %r9d -; BURR-NEXT: sbbq %rcx, %r9 +; BURR-NEXT: movl $0, %r11d +; BURR-NEXT: sbbq %rcx, %r11 ; BURR-NEXT: sbbq %r8, %rdi ; BURR-NEXT: andq %r8, %rdi ; BURR-NEXT: bsrq %rdi, %r8 ; BURR-NEXT: xorq $63, %r8 -; BURR-NEXT: andq %rcx, %r9 -; BURR-NEXT: bsrq %r9, %rcx +; BURR-NEXT: andq %rcx, %r11 +; BURR-NEXT: bsrq %r11, %rcx ; BURR-NEXT: xorq $63, %rcx ; BURR-NEXT: addq $64, %rcx ; BURR-NEXT: testq %rdi, %rdi @@ -318,16 +318,16 @@ define i256 @test2(i256 %a) nounwind { ; BURR-NEXT: andq %rdx, %r10 ; BURR-NEXT: bsrq %r10, %rdx ; BURR-NEXT: xorq $63, %rdx -; BURR-NEXT: andq %rsi, %r11 +; BURR-NEXT: andq %rsi, %r9 ; BURR-NEXT: movl $127, %esi -; BURR-NEXT: bsrq %r11, %r8 +; BURR-NEXT: bsrq %r9, %r8 ; BURR-NEXT: cmoveq %rsi, %r8 ; BURR-NEXT: xorq $63, %r8 ; BURR-NEXT: addq $64, %r8 ; BURR-NEXT: testq %r10, %r10 ; BURR-NEXT: cmovneq %rdx, %r8 ; BURR-NEXT: subq $-128, %r8 -; BURR-NEXT: orq %rdi, %r9 +; BURR-NEXT: orq %rdi, %r11 ; BURR-NEXT: cmovneq %rcx, %r8 ; BURR-NEXT: movq %r8, (%rax) ; BURR-NEXT: movq $0, 24(%rax) @@ -339,27 +339,27 @@ define i256 @test2(i256 %a) nounwind { ; SRC: # %bb.0: ; SRC-NEXT: movq %rdi, %rax ; SRC-NEXT: xorl %edi, %edi -; SRC-NEXT: movq %rsi, %r11 -; SRC-NEXT: negq %r11 +; SRC-NEXT: movq %rsi, %r9 +; SRC-NEXT: negq %r9 ; SRC-NEXT: movl $0, %r10d ; SRC-NEXT: sbbq %rdx, %r10 -; SRC-NEXT: movl $0, %r9d -; SRC-NEXT: sbbq %rcx, %r9 +; SRC-NEXT: movl $0, %r11d +; SRC-NEXT: sbbq %rcx, %r11 ; SRC-NEXT: sbbq %r8, %rdi ; SRC-NEXT: andq %rdx, %r10 -; SRC-NEXT: andq %rcx, %r9 +; SRC-NEXT: andq %rcx, %r11 ; SRC-NEXT: andq %r8, %rdi -; SRC-NEXT: andq %rsi, %r11 +; SRC-NEXT: andq %rsi, %r9 ; SRC-NEXT: bsrq %rdi, %rcx ; 
SRC-NEXT: xorq $63, %rcx -; SRC-NEXT: bsrq %r9, %rdx +; SRC-NEXT: bsrq %r11, %rdx ; SRC-NEXT: xorq $63, %rdx ; SRC-NEXT: addq $64, %rdx ; SRC-NEXT: testq %rdi, %rdi ; SRC-NEXT: cmovneq %rcx, %rdx ; SRC-NEXT: bsrq %r10, %rcx ; SRC-NEXT: xorq $63, %rcx -; SRC-NEXT: bsrq %r11, %rsi +; SRC-NEXT: bsrq %r9, %rsi ; SRC-NEXT: movl $127, %r8d ; SRC-NEXT: cmovneq %rsi, %r8 ; SRC-NEXT: xorq $63, %r8 @@ -367,7 +367,7 @@ define i256 @test2(i256 %a) nounwind { ; SRC-NEXT: testq %r10, %r10 ; SRC-NEXT: cmovneq %rcx, %r8 ; SRC-NEXT: subq $-128, %r8 -; SRC-NEXT: orq %r9, %rdi +; SRC-NEXT: orq %r11, %rdi ; SRC-NEXT: cmovneq %rdx, %r8 ; SRC-NEXT: movq %r8, (%rax) ; SRC-NEXT: movq $0, 24(%rax) @@ -423,140 +423,134 @@ define i256 @test2(i256 %a) nounwind { define i256 @test3(i256 %n) nounwind { ; ILP-LABEL: test3: ; ILP: # %bb.0: -; ILP-NEXT: pushq %rbx ; ILP-NEXT: movq %rdi, %rax -; ILP-NEXT: xorl %r9d, %r9d -; ILP-NEXT: movq %rsi, %rdi -; ILP-NEXT: negq %rdi +; ILP-NEXT: xorl %edi, %edi +; ILP-NEXT: movq %rsi, %r9 +; ILP-NEXT: negq %r9 ; ILP-NEXT: movl $0, %r10d ; ILP-NEXT: sbbq %rdx, %r10 ; ILP-NEXT: movl $0, %r11d ; ILP-NEXT: sbbq %rcx, %r11 -; ILP-NEXT: sbbq %r8, %r9 +; ILP-NEXT: sbbq %r8, %rdi ; ILP-NEXT: notq %r8 -; ILP-NEXT: andq %r9, %r8 -; ILP-NEXT: bsrq %r8, %rbx +; ILP-NEXT: andq %rdi, %r8 +; ILP-NEXT: bsrq %r8, %rdi ; ILP-NEXT: notq %rdx ; ILP-NEXT: andq %r10, %rdx -; ILP-NEXT: bsrq %rdx, %r9 +; ILP-NEXT: bsrq %rdx, %r10 ; ILP-NEXT: notq %rsi -; ILP-NEXT: xorq $63, %rbx +; ILP-NEXT: xorq $63, %rdi ; ILP-NEXT: notq %rcx ; ILP-NEXT: andq %r11, %rcx -; ILP-NEXT: bsrq %rcx, %r10 -; ILP-NEXT: xorq $63, %r10 -; ILP-NEXT: addq $64, %r10 +; ILP-NEXT: bsrq %rcx, %r11 +; ILP-NEXT: xorq $63, %r11 +; ILP-NEXT: addq $64, %r11 ; ILP-NEXT: testq %r8, %r8 -; ILP-NEXT: cmovneq %rbx, %r10 -; ILP-NEXT: xorq $63, %r9 -; ILP-NEXT: andq %rdi, %rsi +; ILP-NEXT: cmovneq %rdi, %r11 +; ILP-NEXT: xorq $63, %r10 +; ILP-NEXT: andq %r9, %rsi ; ILP-NEXT: movl $127, %edi ; ILP-NEXT: bsrq %rsi, %rsi ; ILP-NEXT: cmoveq %rdi, %rsi ; ILP-NEXT: xorq $63, %rsi ; ILP-NEXT: addq $64, %rsi ; ILP-NEXT: testq %rdx, %rdx -; ILP-NEXT: cmovneq %r9, %rsi +; ILP-NEXT: cmovneq %r10, %rsi ; ILP-NEXT: subq $-128, %rsi ; ILP-NEXT: orq %r8, %rcx -; ILP-NEXT: cmovneq %r10, %rsi +; ILP-NEXT: cmovneq %r11, %rsi ; ILP-NEXT: movq %rsi, (%rax) ; ILP-NEXT: movq $0, 24(%rax) ; ILP-NEXT: movq $0, 16(%rax) ; ILP-NEXT: movq $0, 8(%rax) -; ILP-NEXT: popq %rbx ; ILP-NEXT: retq ; ; HYBRID-LABEL: test3: ; HYBRID: # %bb.0: -; HYBRID-NEXT: pushq %rbx ; HYBRID-NEXT: movq %rdi, %rax -; HYBRID-NEXT: xorl %r9d, %r9d -; HYBRID-NEXT: movq %rsi, %rdi -; HYBRID-NEXT: negq %rdi +; HYBRID-NEXT: xorl %edi, %edi +; HYBRID-NEXT: movq %rsi, %r9 +; HYBRID-NEXT: negq %r9 ; HYBRID-NEXT: movl $0, %r10d ; HYBRID-NEXT: sbbq %rdx, %r10 ; HYBRID-NEXT: movl $0, %r11d ; HYBRID-NEXT: sbbq %rcx, %r11 -; HYBRID-NEXT: sbbq %r8, %r9 +; HYBRID-NEXT: sbbq %r8, %rdi ; HYBRID-NEXT: notq %r8 -; HYBRID-NEXT: andq %r9, %r8 -; HYBRID-NEXT: bsrq %r8, %rbx -; HYBRID-NEXT: xorq $63, %rbx +; HYBRID-NEXT: andq %rdi, %r8 +; HYBRID-NEXT: bsrq %r8, %rdi +; HYBRID-NEXT: xorq $63, %rdi ; HYBRID-NEXT: notq %rcx ; HYBRID-NEXT: andq %r11, %rcx -; HYBRID-NEXT: bsrq %rcx, %r9 -; HYBRID-NEXT: xorq $63, %r9 -; HYBRID-NEXT: addq $64, %r9 +; HYBRID-NEXT: bsrq %rcx, %r11 +; HYBRID-NEXT: xorq $63, %r11 +; HYBRID-NEXT: addq $64, %r11 ; HYBRID-NEXT: testq %r8, %r8 -; HYBRID-NEXT: cmovneq %rbx, %r9 +; HYBRID-NEXT: cmovneq %rdi, %r11 ; HYBRID-NEXT: notq %rdx ; HYBRID-NEXT: andq %r10, %rdx -; HYBRID-NEXT: bsrq %rdx, %r10 -; 
HYBRID-NEXT: xorq $63, %r10 +; HYBRID-NEXT: bsrq %rdx, %rdi +; HYBRID-NEXT: xorq $63, %rdi ; HYBRID-NEXT: notq %rsi -; HYBRID-NEXT: andq %rdi, %rsi -; HYBRID-NEXT: movl $127, %edi +; HYBRID-NEXT: andq %r9, %rsi +; HYBRID-NEXT: movl $127, %r9d ; HYBRID-NEXT: bsrq %rsi, %rsi -; HYBRID-NEXT: cmoveq %rdi, %rsi +; HYBRID-NEXT: cmoveq %r9, %rsi ; HYBRID-NEXT: xorq $63, %rsi ; HYBRID-NEXT: addq $64, %rsi ; HYBRID-NEXT: testq %rdx, %rdx -; HYBRID-NEXT: cmovneq %r10, %rsi +; HYBRID-NEXT: cmovneq %rdi, %rsi ; HYBRID-NEXT: subq $-128, %rsi ; HYBRID-NEXT: orq %r8, %rcx -; HYBRID-NEXT: cmovneq %r9, %rsi +; HYBRID-NEXT: cmovneq %r11, %rsi ; HYBRID-NEXT: movq %rsi, (%rax) ; HYBRID-NEXT: movq $0, 24(%rax) ; HYBRID-NEXT: movq $0, 16(%rax) ; HYBRID-NEXT: movq $0, 8(%rax) -; HYBRID-NEXT: popq %rbx ; HYBRID-NEXT: retq ; ; BURR-LABEL: test3: ; BURR: # %bb.0: -; BURR-NEXT: pushq %rbx ; BURR-NEXT: movq %rdi, %rax -; BURR-NEXT: xorl %r9d, %r9d -; BURR-NEXT: movq %rsi, %rdi -; BURR-NEXT: negq %rdi +; BURR-NEXT: xorl %edi, %edi +; BURR-NEXT: movq %rsi, %r9 +; BURR-NEXT: negq %r9 ; BURR-NEXT: movl $0, %r10d ; BURR-NEXT: sbbq %rdx, %r10 ; BURR-NEXT: movl $0, %r11d ; BURR-NEXT: sbbq %rcx, %r11 -; BURR-NEXT: sbbq %r8, %r9 +; BURR-NEXT: sbbq %r8, %rdi ; BURR-NEXT: notq %r8 -; BURR-NEXT: andq %r9, %r8 -; BURR-NEXT: bsrq %r8, %rbx -; BURR-NEXT: xorq $63, %rbx +; BURR-NEXT: andq %rdi, %r8 +; BURR-NEXT: bsrq %r8, %rdi +; BURR-NEXT: xorq $63, %rdi ; BURR-NEXT: notq %rcx ; BURR-NEXT: andq %r11, %rcx -; BURR-NEXT: bsrq %rcx, %r9 -; BURR-NEXT: xorq $63, %r9 -; BURR-NEXT: addq $64, %r9 +; BURR-NEXT: bsrq %rcx, %r11 +; BURR-NEXT: xorq $63, %r11 +; BURR-NEXT: addq $64, %r11 ; BURR-NEXT: testq %r8, %r8 -; BURR-NEXT: cmovneq %rbx, %r9 +; BURR-NEXT: cmovneq %rdi, %r11 ; BURR-NEXT: notq %rdx ; BURR-NEXT: andq %r10, %rdx -; BURR-NEXT: bsrq %rdx, %r10 -; BURR-NEXT: xorq $63, %r10 +; BURR-NEXT: bsrq %rdx, %rdi +; BURR-NEXT: xorq $63, %rdi ; BURR-NEXT: notq %rsi -; BURR-NEXT: andq %rdi, %rsi -; BURR-NEXT: movl $127, %edi +; BURR-NEXT: andq %r9, %rsi +; BURR-NEXT: movl $127, %r9d ; BURR-NEXT: bsrq %rsi, %rsi -; BURR-NEXT: cmoveq %rdi, %rsi +; BURR-NEXT: cmoveq %r9, %rsi ; BURR-NEXT: xorq $63, %rsi ; BURR-NEXT: addq $64, %rsi ; BURR-NEXT: testq %rdx, %rdx -; BURR-NEXT: cmovneq %r10, %rsi +; BURR-NEXT: cmovneq %rdi, %rsi ; BURR-NEXT: subq $-128, %rsi ; BURR-NEXT: orq %r8, %rcx -; BURR-NEXT: cmovneq %r9, %rsi +; BURR-NEXT: cmovneq %r11, %rsi ; BURR-NEXT: movq %rsi, (%rax) ; BURR-NEXT: movq $0, 24(%rax) ; BURR-NEXT: movq $0, 16(%rax) ; BURR-NEXT: movq $0, 8(%rax) -; BURR-NEXT: popq %rbx ; BURR-NEXT: retq ; ; SRC-LABEL: test3: @@ -611,20 +605,20 @@ define i256 @test3(i256 %n) nounwind { ; LIN-NEXT: notq %rsi ; LIN-NEXT: andq %rdi, %rsi ; LIN-NEXT: bsrq %rsi, %rsi -; LIN-NEXT: movl $127, %r9d -; LIN-NEXT: cmovneq %rsi, %r9 -; LIN-NEXT: xorq $63, %r9 -; LIN-NEXT: addq $64, %r9 -; LIN-NEXT: xorl %edi, %edi -; LIN-NEXT: movl $0, %esi -; LIN-NEXT: sbbq %rdx, %rsi +; LIN-NEXT: movl $127, %edi +; LIN-NEXT: cmovneq %rsi, %rdi +; LIN-NEXT: xorq $63, %rdi +; LIN-NEXT: addq $64, %rdi +; LIN-NEXT: xorl %esi, %esi +; LIN-NEXT: movl $0, %r9d +; LIN-NEXT: sbbq %rdx, %r9 ; LIN-NEXT: notq %rdx -; LIN-NEXT: andq %rsi, %rdx -; LIN-NEXT: bsrq %rdx, %rsi -; LIN-NEXT: xorq $63, %rsi +; LIN-NEXT: andq %r9, %rdx +; LIN-NEXT: bsrq %rdx, %r9 +; LIN-NEXT: xorq $63, %r9 ; LIN-NEXT: testq %rdx, %rdx -; LIN-NEXT: cmoveq %r9, %rsi -; LIN-NEXT: subq $-128, %rsi +; LIN-NEXT: cmoveq %rdi, %r9 +; LIN-NEXT: subq $-128, %r9 ; LIN-NEXT: movl $0, %edx ; LIN-NEXT: sbbq %rcx, %rdx ; 
LIN-NEXT: notq %rcx @@ -632,16 +626,16 @@ define i256 @test3(i256 %n) nounwind { ; LIN-NEXT: bsrq %rcx, %rdx ; LIN-NEXT: xorq $63, %rdx ; LIN-NEXT: addq $64, %rdx -; LIN-NEXT: sbbq %r8, %rdi +; LIN-NEXT: sbbq %r8, %rsi ; LIN-NEXT: notq %r8 -; LIN-NEXT: andq %rdi, %r8 -; LIN-NEXT: bsrq %r8, %rdi -; LIN-NEXT: xorq $63, %rdi +; LIN-NEXT: andq %rsi, %r8 +; LIN-NEXT: bsrq %r8, %rsi +; LIN-NEXT: xorq $63, %rsi ; LIN-NEXT: testq %r8, %r8 -; LIN-NEXT: cmoveq %rdx, %rdi +; LIN-NEXT: cmoveq %rdx, %rsi ; LIN-NEXT: orq %rcx, %r8 -; LIN-NEXT: cmoveq %rsi, %rdi -; LIN-NEXT: movq %rdi, (%rax) +; LIN-NEXT: cmoveq %r9, %rsi +; LIN-NEXT: movq %rsi, (%rax) ; LIN-NEXT: movq $0, 8(%rax) ; LIN-NEXT: movq $0, 16(%rax) ; LIN-NEXT: movq $0, 24(%rax) @@ -745,13 +739,13 @@ define i256 @PR25498(i256 %a) nounwind { ; ILP: # %bb.0: ; ILP-NEXT: pushq %rbx ; ILP-NEXT: movq %rdi, %rax -; ILP-NEXT: xorl %edi, %edi +; ILP-NEXT: xorl %r11d, %r11d ; ILP-NEXT: movq %rsi, %rbx ; ILP-NEXT: negq %rbx -; ILP-NEXT: movl $0, %r11d -; ILP-NEXT: sbbq %rdx, %r11 ; ILP-NEXT: movl $0, %r9d -; ILP-NEXT: sbbq %rcx, %r9 +; ILP-NEXT: sbbq %rdx, %r9 +; ILP-NEXT: movl $0, %edi +; ILP-NEXT: sbbq %rcx, %rdi ; ILP-NEXT: movl $0, %r10d ; ILP-NEXT: sbbq %r8, %r10 ; ILP-NEXT: orq %r8, %rdx @@ -759,10 +753,10 @@ define i256 @PR25498(i256 %a) nounwind { ; ILP-NEXT: orq %rdx, %rsi ; ILP-NEXT: je .LBB4_1 ; ILP-NEXT: # %bb.2: # %cond.false -; ILP-NEXT: bsrq %r11, %rdx +; ILP-NEXT: bsrq %r9, %rdx ; ILP-NEXT: bsrq %r10, %rcx ; ILP-NEXT: xorq $63, %rcx -; ILP-NEXT: bsrq %r9, %rsi +; ILP-NEXT: bsrq %rdi, %rsi ; ILP-NEXT: xorq $63, %rsi ; ILP-NEXT: addq $64, %rsi ; ILP-NEXT: testq %r10, %r10 @@ -771,20 +765,20 @@ define i256 @PR25498(i256 %a) nounwind { ; ILP-NEXT: bsrq %rbx, %rcx ; ILP-NEXT: xorq $63, %rcx ; ILP-NEXT: addq $64, %rcx -; ILP-NEXT: testq %r11, %r11 +; ILP-NEXT: testq %r9, %r9 ; ILP-NEXT: cmovneq %rdx, %rcx ; ILP-NEXT: subq $-128, %rcx -; ILP-NEXT: xorl %edi, %edi -; ILP-NEXT: orq %r10, %r9 +; ILP-NEXT: xorl %r11d, %r11d +; ILP-NEXT: orq %r10, %rdi ; ILP-NEXT: cmovneq %rsi, %rcx ; ILP-NEXT: jmp .LBB4_3 ; ILP-NEXT: .LBB4_1: ; ILP-NEXT: movl $256, %ecx # imm = 0x100 ; ILP-NEXT: .LBB4_3: # %cond.end ; ILP-NEXT: movq %rcx, (%rax) -; ILP-NEXT: movq %rdi, 8(%rax) -; ILP-NEXT: movq %rdi, 16(%rax) -; ILP-NEXT: movq %rdi, 24(%rax) +; ILP-NEXT: movq %r11, 8(%rax) +; ILP-NEXT: movq %r11, 16(%rax) +; ILP-NEXT: movq %r11, 24(%rax) ; ILP-NEXT: popq %rbx ; ILP-NEXT: retq ; @@ -792,46 +786,46 @@ define i256 @PR25498(i256 %a) nounwind { ; HYBRID: # %bb.0: ; HYBRID-NEXT: pushq %rbx ; HYBRID-NEXT: movq %rdi, %rax -; HYBRID-NEXT: xorl %edi, %edi -; HYBRID-NEXT: movq %rsi, %rbx -; HYBRID-NEXT: negq %rbx -; HYBRID-NEXT: movl $0, %r11d -; HYBRID-NEXT: sbbq %rdx, %r11 +; HYBRID-NEXT: xorl %ebx, %ebx +; HYBRID-NEXT: movq %rsi, %r10 +; HYBRID-NEXT: negq %r10 ; HYBRID-NEXT: movl $0, %r9d -; HYBRID-NEXT: sbbq %rcx, %r9 -; HYBRID-NEXT: movl $0, %r10d -; HYBRID-NEXT: sbbq %r8, %r10 +; HYBRID-NEXT: sbbq %rdx, %r9 +; HYBRID-NEXT: movl $0, %edi +; HYBRID-NEXT: sbbq %rcx, %rdi +; HYBRID-NEXT: movl $0, %r11d +; HYBRID-NEXT: sbbq %r8, %r11 ; HYBRID-NEXT: orq %r8, %rdx ; HYBRID-NEXT: orq %rcx, %rsi ; HYBRID-NEXT: orq %rdx, %rsi ; HYBRID-NEXT: je .LBB4_1 ; HYBRID-NEXT: # %bb.2: # %cond.false -; HYBRID-NEXT: bsrq %r10, %rcx +; HYBRID-NEXT: bsrq %r11, %rcx ; HYBRID-NEXT: xorq $63, %rcx -; HYBRID-NEXT: bsrq %r9, %rdx +; HYBRID-NEXT: bsrq %rdi, %rdx ; HYBRID-NEXT: xorq $63, %rdx ; HYBRID-NEXT: addq $64, %rdx -; HYBRID-NEXT: testq %r10, %r10 +; HYBRID-NEXT: testq %r11, %r11 ; 
HYBRID-NEXT: cmovneq %rcx, %rdx -; HYBRID-NEXT: bsrq %r11, %rsi +; HYBRID-NEXT: bsrq %r9, %rsi ; HYBRID-NEXT: xorq $63, %rsi -; HYBRID-NEXT: bsrq %rbx, %rcx +; HYBRID-NEXT: bsrq %r10, %rcx ; HYBRID-NEXT: xorq $63, %rcx ; HYBRID-NEXT: addq $64, %rcx -; HYBRID-NEXT: testq %r11, %r11 +; HYBRID-NEXT: testq %r9, %r9 ; HYBRID-NEXT: cmovneq %rsi, %rcx ; HYBRID-NEXT: subq $-128, %rcx -; HYBRID-NEXT: orq %r10, %r9 +; HYBRID-NEXT: orq %r11, %rdi ; HYBRID-NEXT: cmovneq %rdx, %rcx -; HYBRID-NEXT: xorl %edi, %edi +; HYBRID-NEXT: xorl %ebx, %ebx ; HYBRID-NEXT: jmp .LBB4_3 ; HYBRID-NEXT: .LBB4_1: ; HYBRID-NEXT: movl $256, %ecx # imm = 0x100 ; HYBRID-NEXT: .LBB4_3: # %cond.end ; HYBRID-NEXT: movq %rcx, (%rax) -; HYBRID-NEXT: movq %rdi, 8(%rax) -; HYBRID-NEXT: movq %rdi, 16(%rax) -; HYBRID-NEXT: movq %rdi, 24(%rax) +; HYBRID-NEXT: movq %rbx, 8(%rax) +; HYBRID-NEXT: movq %rbx, 16(%rax) +; HYBRID-NEXT: movq %rbx, 24(%rax) ; HYBRID-NEXT: popq %rbx ; HYBRID-NEXT: retq ; @@ -839,46 +833,46 @@ define i256 @PR25498(i256 %a) nounwind { ; BURR: # %bb.0: ; BURR-NEXT: pushq %rbx ; BURR-NEXT: movq %rdi, %rax -; BURR-NEXT: xorl %edi, %edi -; BURR-NEXT: movq %rsi, %rbx -; BURR-NEXT: negq %rbx -; BURR-NEXT: movl $0, %r11d -; BURR-NEXT: sbbq %rdx, %r11 +; BURR-NEXT: xorl %ebx, %ebx +; BURR-NEXT: movq %rsi, %r10 +; BURR-NEXT: negq %r10 ; BURR-NEXT: movl $0, %r9d -; BURR-NEXT: sbbq %rcx, %r9 -; BURR-NEXT: movl $0, %r10d -; BURR-NEXT: sbbq %r8, %r10 +; BURR-NEXT: sbbq %rdx, %r9 +; BURR-NEXT: movl $0, %edi +; BURR-NEXT: sbbq %rcx, %rdi +; BURR-NEXT: movl $0, %r11d +; BURR-NEXT: sbbq %r8, %r11 ; BURR-NEXT: orq %r8, %rdx ; BURR-NEXT: orq %rcx, %rsi ; BURR-NEXT: orq %rdx, %rsi ; BURR-NEXT: je .LBB4_1 ; BURR-NEXT: # %bb.2: # %cond.false -; BURR-NEXT: bsrq %r10, %rcx +; BURR-NEXT: bsrq %r11, %rcx ; BURR-NEXT: xorq $63, %rcx -; BURR-NEXT: bsrq %r9, %rdx +; BURR-NEXT: bsrq %rdi, %rdx ; BURR-NEXT: xorq $63, %rdx ; BURR-NEXT: addq $64, %rdx -; BURR-NEXT: testq %r10, %r10 +; BURR-NEXT: testq %r11, %r11 ; BURR-NEXT: cmovneq %rcx, %rdx -; BURR-NEXT: bsrq %r11, %rsi +; BURR-NEXT: bsrq %r9, %rsi ; BURR-NEXT: xorq $63, %rsi -; BURR-NEXT: bsrq %rbx, %rcx +; BURR-NEXT: bsrq %r10, %rcx ; BURR-NEXT: xorq $63, %rcx ; BURR-NEXT: addq $64, %rcx -; BURR-NEXT: testq %r11, %r11 +; BURR-NEXT: testq %r9, %r9 ; BURR-NEXT: cmovneq %rsi, %rcx ; BURR-NEXT: subq $-128, %rcx -; BURR-NEXT: orq %r10, %r9 +; BURR-NEXT: orq %r11, %rdi ; BURR-NEXT: cmovneq %rdx, %rcx -; BURR-NEXT: xorl %edi, %edi +; BURR-NEXT: xorl %ebx, %ebx ; BURR-NEXT: jmp .LBB4_3 ; BURR-NEXT: .LBB4_1: ; BURR-NEXT: movl $256, %ecx # imm = 0x100 ; BURR-NEXT: .LBB4_3: # %cond.end ; BURR-NEXT: movq %rcx, (%rax) -; BURR-NEXT: movq %rdi, 8(%rax) -; BURR-NEXT: movq %rdi, 16(%rax) -; BURR-NEXT: movq %rdi, 24(%rax) +; BURR-NEXT: movq %rbx, 8(%rax) +; BURR-NEXT: movq %rbx, 16(%rax) +; BURR-NEXT: movq %rbx, 24(%rax) ; BURR-NEXT: popq %rbx ; BURR-NEXT: retq ; @@ -886,46 +880,46 @@ define i256 @PR25498(i256 %a) nounwind { ; SRC: # %bb.0: ; SRC-NEXT: pushq %rbx ; SRC-NEXT: movq %rdi, %rax -; SRC-NEXT: xorl %edi, %edi -; SRC-NEXT: movq %rsi, %rbx -; SRC-NEXT: negq %rbx -; SRC-NEXT: movl $0, %r11d -; SRC-NEXT: sbbq %rdx, %r11 +; SRC-NEXT: xorl %ebx, %ebx +; SRC-NEXT: movq %rsi, %r10 +; SRC-NEXT: negq %r10 ; SRC-NEXT: movl $0, %r9d -; SRC-NEXT: sbbq %rcx, %r9 -; SRC-NEXT: movl $0, %r10d -; SRC-NEXT: sbbq %r8, %r10 +; SRC-NEXT: sbbq %rdx, %r9 +; SRC-NEXT: movl $0, %edi +; SRC-NEXT: sbbq %rcx, %rdi +; SRC-NEXT: movl $0, %r11d +; SRC-NEXT: sbbq %r8, %r11 ; SRC-NEXT: orq %r8, %rdx ; SRC-NEXT: orq %rcx, %rsi ; 
SRC-NEXT: orq %rdx, %rsi ; SRC-NEXT: je .LBB4_1 ; SRC-NEXT: # %bb.2: # %cond.false -; SRC-NEXT: bsrq %r10, %rcx +; SRC-NEXT: bsrq %r11, %rcx ; SRC-NEXT: xorq $63, %rcx -; SRC-NEXT: bsrq %r9, %rdx +; SRC-NEXT: bsrq %rdi, %rdx ; SRC-NEXT: xorq $63, %rdx ; SRC-NEXT: addq $64, %rdx -; SRC-NEXT: testq %r10, %r10 +; SRC-NEXT: testq %r11, %r11 ; SRC-NEXT: cmovneq %rcx, %rdx -; SRC-NEXT: bsrq %r11, %rsi +; SRC-NEXT: bsrq %r9, %rsi ; SRC-NEXT: xorq $63, %rsi -; SRC-NEXT: bsrq %rbx, %rcx +; SRC-NEXT: bsrq %r10, %rcx ; SRC-NEXT: xorq $63, %rcx ; SRC-NEXT: addq $64, %rcx -; SRC-NEXT: testq %r11, %r11 +; SRC-NEXT: testq %r9, %r9 ; SRC-NEXT: cmovneq %rsi, %rcx ; SRC-NEXT: subq $-128, %rcx -; SRC-NEXT: orq %r10, %r9 +; SRC-NEXT: orq %r11, %rdi ; SRC-NEXT: cmovneq %rdx, %rcx -; SRC-NEXT: xorl %edi, %edi +; SRC-NEXT: xorl %ebx, %ebx ; SRC-NEXT: jmp .LBB4_3 ; SRC-NEXT: .LBB4_1: ; SRC-NEXT: movl $256, %ecx # imm = 0x100 ; SRC-NEXT: .LBB4_3: # %cond.end ; SRC-NEXT: movq %rcx, (%rax) -; SRC-NEXT: movq %rdi, 8(%rax) -; SRC-NEXT: movq %rdi, 16(%rax) -; SRC-NEXT: movq %rdi, 24(%rax) +; SRC-NEXT: movq %rbx, 8(%rax) +; SRC-NEXT: movq %rbx, 16(%rax) +; SRC-NEXT: movq %rbx, 24(%rax) ; SRC-NEXT: popq %rbx ; SRC-NEXT: retq ; @@ -935,13 +929,13 @@ define i256 @PR25498(i256 %a) nounwind { ; LIN-NEXT: movq %rdi, %rax ; LIN-NEXT: movq %rsi, %rbx ; LIN-NEXT: negq %rbx -; LIN-NEXT: xorl %edi, %edi +; LIN-NEXT: xorl %r10d, %r10d ; LIN-NEXT: movl $0, %r11d ; LIN-NEXT: sbbq %rdx, %r11 +; LIN-NEXT: movl $0, %edi +; LIN-NEXT: sbbq %rcx, %rdi ; LIN-NEXT: movl $0, %r9d -; LIN-NEXT: sbbq %rcx, %r9 -; LIN-NEXT: movl $0, %r10d -; LIN-NEXT: sbbq %r8, %r10 +; LIN-NEXT: sbbq %r8, %r9 ; LIN-NEXT: orq %rcx, %rsi ; LIN-NEXT: orq %r8, %rdx ; LIN-NEXT: orq %rsi, %rdx @@ -955,24 +949,24 @@ define i256 @PR25498(i256 %a) nounwind { ; LIN-NEXT: testq %r11, %r11 ; LIN-NEXT: cmoveq %rcx, %rdx ; LIN-NEXT: subq $-128, %rdx -; LIN-NEXT: bsrq %r9, %rsi +; LIN-NEXT: bsrq %rdi, %rsi ; LIN-NEXT: xorq $63, %rsi ; LIN-NEXT: addq $64, %rsi -; LIN-NEXT: bsrq %r10, %rcx +; LIN-NEXT: bsrq %r9, %rcx ; LIN-NEXT: xorq $63, %rcx -; LIN-NEXT: testq %r10, %r10 +; LIN-NEXT: testq %r9, %r9 ; LIN-NEXT: cmoveq %rsi, %rcx -; LIN-NEXT: orq %r10, %r9 +; LIN-NEXT: orq %r9, %rdi ; LIN-NEXT: cmoveq %rdx, %rcx -; LIN-NEXT: xorl %edi, %edi +; LIN-NEXT: xorl %r10d, %r10d ; LIN-NEXT: jmp .LBB4_3 ; LIN-NEXT: .LBB4_1: ; LIN-NEXT: movl $256, %ecx # imm = 0x100 ; LIN-NEXT: .LBB4_3: # %cond.end ; LIN-NEXT: movq %rcx, (%rax) -; LIN-NEXT: movq %rdi, 8(%rax) -; LIN-NEXT: movq %rdi, 16(%rax) -; LIN-NEXT: movq %rdi, 24(%rax) +; LIN-NEXT: movq %r10, 8(%rax) +; LIN-NEXT: movq %r10, 16(%rax) +; LIN-NEXT: movq %r10, 24(%rax) ; LIN-NEXT: popq %rbx ; LIN-NEXT: retq %b = sub i256 0, %a diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index 336aa216d19b11..02316c09a96966 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -93,14 +93,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movsbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $14, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %edi -; X86-NEXT: leal -1(%eax), %esi -; X86-NEXT: testl %edi, %edi +; X86-NEXT: idivl %esi +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: sets %bl ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %cl @@ -108,9 +108,9 @@ define i16 @func2(i8 
%x, i8 %y) nounwind { ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setne %dl ; X86-NEXT: testb %cl, %dl -; X86-NEXT: cmovel %eax, %esi -; X86-NEXT: addl %esi, %esi -; X86-NEXT: movswl %si, %eax +; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: addl %edi, %edi +; X86-NEXT: movswl %di, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi @@ -218,33 +218,31 @@ define i4 @func4(i4 %x, i4 %y) nounwind { ; ; X86-LABEL: func4: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shlb $4, %cl -; X86-NEXT: sarb $4, %cl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shlb $4, %dl ; X86-NEXT: sarb $4, %dl -; X86-NEXT: shlb $2, %dl -; X86-NEXT: movsbl %dl, %eax -; X86-NEXT: idivb %cl -; X86-NEXT: movsbl %ah, %ebx +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: shlb $4, %dh +; X86-NEXT: sarb $4, %dh +; X86-NEXT: shlb $2, %dh +; X86-NEXT: movsbl %dh, %eax +; X86-NEXT: idivb %dl +; X86-NEXT: movsbl %ah, %ecx ; X86-NEXT: movzbl %al, %esi ; X86-NEXT: decb %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: testb %cl, %cl -; X86-NEXT: sets %cl ; X86-NEXT: testb %dl, %dl ; X86-NEXT: sets %dl -; X86-NEXT: xorb %cl, %dl -; X86-NEXT: testb %bl, %bl +; X86-NEXT: testb %dh, %dh +; X86-NEXT: sets %dh +; X86-NEXT: xorb %dl, %dh +; X86-NEXT: testb %cl, %cl ; X86-NEXT: setne %cl -; X86-NEXT: testb %dl, %cl +; X86-NEXT: testb %dh, %cl ; X86-NEXT: cmovel %esi, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi -; X86-NEXT: popl %ebx ; X86-NEXT: retl %tmp = call i4 @llvm.sdiv.fix.i4(i4 %x, i4 %y, i32 2) ret i4 %tmp @@ -334,11 +332,10 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ebx @@ -453,7 +450,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; X64-NEXT: movq %xmm3, %rcx ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] @@ -474,14 +471,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; X64-NEXT: movq %xmm4, %r11 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm4, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %r11 -; X64-NEXT: movq %r8, %xmm5 +; X64-NEXT: movq %r9, %xmm5 ; X64-NEXT: movq %r10, %xmm6 ; X64-NEXT: pxor %xmm4, %xmm4 ; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] @@ -499,33 +496,33 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movq %rdi, %xmm5 ; X64-NEXT: pandn %xmm2, %xmm6 ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; X64-NEXT: movdqa %xmm6, %xmm5 -; X64-NEXT: pandn %xmm0, %xmm5 -; X64-NEXT: pcmpeqd %xmm2, %xmm2 -; X64-NEXT: paddq %xmm2, %xmm0 +; X64-NEXT: movdqa %xmm6, %xmm2 +; X64-NEXT: pandn %xmm0, %xmm2 +; 
X64-NEXT: pcmpeqd %xmm5, %xmm5 +; X64-NEXT: paddq %xmm5, %xmm0 ; X64-NEXT: pand %xmm6, %xmm0 -; X64-NEXT: por %xmm5, %xmm0 -; X64-NEXT: movq %r9, %xmm5 +; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: movq %r8, %xmm2 ; X64-NEXT: movq %rdx, %xmm6 -; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; X64-NEXT: pcmpeqd %xmm4, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2] -; X64-NEXT: pand %xmm5, %xmm6 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; X64-NEXT: pxor %xmm5, %xmm5 -; X64-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; X64-NEXT: pcmpeqd %xmm4, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,0,3,2] +; X64-NEXT: pand %xmm2, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtd %xmm2, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X64-NEXT: pcmpgtd %xmm1, %xmm4 -; X64-NEXT: pxor %xmm5, %xmm4 +; X64-NEXT: pxor %xmm3, %xmm4 ; X64-NEXT: pandn %xmm4, %xmm6 ; X64-NEXT: movq %rcx, %xmm1 -; X64-NEXT: movq %rax, %xmm3 -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; X64-NEXT: movdqa %xmm6, %xmm3 -; X64-NEXT: pandn %xmm1, %xmm3 -; X64-NEXT: paddq %xmm2, %xmm1 +; X64-NEXT: movq %rax, %xmm2 +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-NEXT: movdqa %xmm6, %xmm2 +; X64-NEXT: pandn %xmm1, %xmm2 +; X64-NEXT: paddq %xmm5, %xmm1 ; X64-NEXT: pand %xmm6, %xmm1 -; X64-NEXT: por %xmm3, %xmm1 +; X64-NEXT: por %xmm2, %xmm1 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index 371484e01556c8..0e185b9ad76518 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -258,29 +258,28 @@ define i4 @func4(i4 %x, i4 %y) nounwind { ; ; X86-LABEL: func4: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shlb $4, %cl -; X86-NEXT: sarb $4, %cl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shlb $4, %dl ; X86-NEXT: sarb $4, %dl -; X86-NEXT: shlb $2, %dl -; X86-NEXT: movsbl %dl, %eax -; X86-NEXT: idivb %cl -; X86-NEXT: movsbl %ah, %ebx +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: shlb $4, %dh +; X86-NEXT: sarb $4, %dh +; X86-NEXT: shlb $2, %dh +; X86-NEXT: movsbl %dh, %eax +; X86-NEXT: idivb %dl +; X86-NEXT: movsbl %ah, %ecx ; X86-NEXT: movzbl %al, %esi ; X86-NEXT: decb %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: testb %cl, %cl -; X86-NEXT: sets %cl ; X86-NEXT: testb %dl, %dl ; X86-NEXT: sets %dl -; X86-NEXT: xorb %cl, %dl -; X86-NEXT: testb %bl, %bl +; X86-NEXT: testb %dh, %dh +; X86-NEXT: sets %dh +; X86-NEXT: xorb %dl, %dh +; X86-NEXT: testb %cl, %cl ; X86-NEXT: setne %cl -; X86-NEXT: testb %dl, %cl +; X86-NEXT: testb %dh, %cl ; X86-NEXT: cmovel %esi, %eax ; X86-NEXT: cmpb $7, %al ; X86-NEXT: movl $7, %ecx @@ -290,7 +289,6 @@ define i4 @func4(i4 %x, i4 %y) nounwind { ; X86-NEXT: cmovgel %ecx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi -; X86-NEXT: popl %ebx ; X86-NEXT: retl %tmp = call i4 @llvm.sdiv.fix.sat.i4(i4 %x, i4 %y, i32 2) ret i4 %tmp @@ -373,46 +371,46 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: subl $88, %esp ; X86-NEXT: movl 8(%ebp), %ecx ; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 20(%ebp), %esi -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %eax, %edx -; 
X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $31, %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll $31, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl 20(%ebp) ; X86-NEXT: pushl 16(%ebp) +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %eax +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl $1, %esi -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %esi ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: testl %ecx, %ecx @@ -420,12 +418,12 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: xorb %al, %dl ; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi ; X86-NEXT: pushl 20(%ebp) ; X86-NEXT: pushl 16(%ebp) ; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edi +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl %eax @@ -438,27 +436,25 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 
4-byte Folded Reload ; X86-NEXT: cmpl $-1, %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sbbl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF ; X86-NEXT: cmovll %eax, %edx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %edi +; X86-NEXT: movl $0, %eax +; X86-NEXT: cmovgel %eax, %ebx +; X86-NEXT: cmovgel %eax, %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: cmovgel %ecx, %ebx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovgel %ecx, %esi ; X86-NEXT: movl %esi, %edi @@ -466,8 +462,8 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl $-2147483648, %edi # imm = 0x80000000 ; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: movl $-1, %edi -; X86-NEXT: sbbl %ebx, %edi -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl $0, %eax ; X86-NEXT: cmovgel %eax, %esi ; X86-NEXT: movl $-2147483648, %eax # imm = 0x80000000 @@ -872,9 +868,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl 36(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax @@ -884,8 +880,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sarl $31, %ebx ; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: leal (%ecx,%ecx), %eax ; X86-NEXT: shrl $31, %ecx ; X86-NEXT: shldl $31, %eax, %ecx @@ -895,34 +891,34 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 40(%ebp), %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%ebp), %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl 24(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: leal (%ecx,%ecx), %edx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: leal (%ecx,%ecx), %eax ; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %edx, %ecx +; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx ; X86-NEXT: pushl 40(%ebp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx +; X86-NEXT: pushl %eax ; X86-NEXT: 
calll __divti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -930,8 +926,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 28(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax @@ -952,14 +948,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: sbbl $0, %edx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %bl -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: sets %bh ; X86-NEXT: xorb %bl, %bh -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: setne %bl ; X86-NEXT: testb %bh, %bl ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -989,8 +985,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: cmovgel %ebx, %eax -; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovgel %edx, %edi +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovgel %ecx, %edi ; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1000,9 +996,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx @@ -1011,44 +1007,44 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bh ; X86-NEXT: xorb %bl, %bh -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: setne %cl -; X86-NEXT: testb %bh, %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: setne %bl +; X86-NEXT: testb %bh, %bl ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %edx -; X86-NEXT: cmovgel 
%ecx, %edi -; X86-NEXT: cmovgel %ecx, %esi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl $0, %esi +; X86-NEXT: cmovgel %esi, %edx +; X86-NEXT: cmovgel %esi, %ecx +; X86-NEXT: cmovgel %esi, %edi ; X86-NEXT: movl $-1, %ebx ; X86-NEXT: cmovgel %ebx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: negl %esi +; X86-NEXT: movl $-1, %esi +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: movl $-1, %esi +; X86-NEXT: sbbl %ecx, %esi ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmovgel %ecx, %eax -; X86-NEXT: cmovgel %ebx, %esi -; X86-NEXT: shldl $31, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovgel %ebx, %edi +; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax @@ -1056,9 +1052,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx @@ -1067,72 +1063,72 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bh ; X86-NEXT: xorb %bl, %bh -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: setne %cl -; X86-NEXT: testb %bh, %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: setne %bl +; X86-NEXT: testb %bh, %bl ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %eax -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %edx -; X86-NEXT: cmovgel %ecx, %edi -; X86-NEXT: cmovgel %ecx, %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl $0, %esi +; X86-NEXT: cmovgel %esi, %edx +; X86-NEXT: cmovgel %esi, %ecx +; X86-NEXT: cmovgel %esi, %ebx +; X86-NEXT: movl $-1, %edi +; X86-NEXT: cmovgel %edi, %eax 
+; X86-NEXT: movl %eax, %esi +; X86-NEXT: negl %esi ; X86-NEXT: movl $-1, %esi -; X86-NEXT: cmovgel %esi, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl $-1, %esi +; X86-NEXT: sbbl %ecx, %esi ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmovgel %ecx, %eax -; X86-NEXT: cmovgel %esi, %ebx +; X86-NEXT: cmovgel %edi, %ebx ; X86-NEXT: shldl $31, %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl $1, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: subl $1, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: sets %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: testl %edx, %edx +; X86-NEXT: sets %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %ah ; X86-NEXT: xorb %al, %ah ; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl 40(%ebp) ; X86-NEXT: pushl %edx ; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl 40(%ebp) +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax @@ -1145,38 +1141,38 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: cmpl $-1, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: cmpl $-1, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl $0, %eax -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl $0, %eax -; X86-NEXT: cmovgel %eax, %esi +; X86-NEXT: cmovgel %eax, %ebx ; X86-NEXT: cmovgel %eax, %ecx -; X86-NEXT: cmovgel %eax, %edi +; X86-NEXT: cmovgel %eax, %esi ; X86-NEXT: movl $-1, %edx -; 
X86-NEXT: cmovgel %edx, %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovgel %edx, %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: negl %eax ; X86-NEXT: movl $-1, %eax -; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl $-1, %eax ; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: movl $-1, %eax -; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: sbbl %ebx, %eax ; X86-NEXT: movl $0, %eax -; X86-NEXT: cmovgel %eax, %ebx -; X86-NEXT: cmovgel %edx, %edi -; X86-NEXT: shldl $31, %ebx, %edi +; X86-NEXT: cmovgel %eax, %edi +; X86-NEXT: cmovgel %edx, %esi +; X86-NEXT: shldl $31, %edi, %esi ; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll index 213b2b018d0ad4..67a385dccfa75f 100644 --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -509,42 +509,42 @@ define void @test8(i1 %c, ptr %dst.addr, <6 x i32> %src1,<6 x i32> %src2) nounwi ; ATHLON-NEXT: pushl %esi ; ATHLON-NEXT: testb $1, {{[0-9]+}}(%esp) ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebx -; ATHLON-NEXT: cmovnel %eax, %ebx +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx +; ATHLON-NEXT: cmovnel %eax, %ecx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edi -; ATHLON-NEXT: cmovnel %eax, %edi +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx +; ATHLON-NEXT: cmovnel %eax, %edx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %esi ; ATHLON-NEXT: cmovnel %eax, %esi ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx -; ATHLON-NEXT: cmovnel %eax, %edx +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edi +; ATHLON-NEXT: cmovnel %eax, %edi ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx -; ATHLON-NEXT: cmovnel %eax, %ecx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebp +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebx +; ATHLON-NEXT: cmovnel %eax, %ebx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax -; ATHLON-NEXT: cmovnel %ebp, %eax -; ATHLON-NEXT: movl (%ebx), %ebp -; ATHLON-NEXT: movl (%edi), %ebx -; ATHLON-NEXT: movl (%esi), %edi -; ATHLON-NEXT: movl (%edx), %esi -; ATHLON-NEXT: movl (%ecx), %edx -; ATHLON-NEXT: movl (%eax), %ecx -; ATHLON-NEXT: decl %ebp +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebp +; ATHLON-NEXT: cmovnel %eax, %ebp ; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax -; ATHLON-NEXT: movl %ebp, 20(%eax) -; ATHLON-NEXT: decl %ebx -; ATHLON-NEXT: movl %ebx, 16(%eax) -; ATHLON-NEXT: decl %edi -; ATHLON-NEXT: movl %edi, 12(%eax) -; ATHLON-NEXT: decl %esi -; ATHLON-NEXT: movl %esi, 8(%eax) -; ATHLON-NEXT: decl %edx -; ATHLON-NEXT: movl %edx, 4(%eax) +; ATHLON-NEXT: movl (%ecx), %ecx +; ATHLON-NEXT: movl (%edx), %edx +; ATHLON-NEXT: movl (%esi), %esi +; ATHLON-NEXT: movl (%edi), %edi +; ATHLON-NEXT: movl (%ebx), %ebx +; ATHLON-NEXT: movl (%ebp), %ebp ; ATHLON-NEXT: decl %ecx -; ATHLON-NEXT: movl %ecx, (%eax) +; ATHLON-NEXT: movl %ecx, 20(%eax) +; ATHLON-NEXT: decl %edx +; ATHLON-NEXT: movl %edx, 16(%eax) +; ATHLON-NEXT: decl %esi +; ATHLON-NEXT: movl %esi, 12(%eax) +; ATHLON-NEXT: decl %edi +; ATHLON-NEXT: movl %edi, 8(%eax) +; ATHLON-NEXT: decl %ebx +; ATHLON-NEXT: movl %ebx, 4(%eax) +; ATHLON-NEXT: decl %ebp +; ATHLON-NEXT: movl %ebp, (%eax) ; ATHLON-NEXT: popl %esi ; ATHLON-NEXT: popl %edi ; 
ATHLON-NEXT: popl %ebx @@ -729,14 +729,14 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; ; MCU-LABEL: test9b: ; MCU: # %bb.0: -; MCU-NEXT: movl %edx, %ecx -; MCU-NEXT: xorl %edx, %edx -; MCU-NEXT: orl %ecx, %eax -; MCU-NEXT: sete %dl -; MCU-NEXT: negl %edx -; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax +; MCU-NEXT: xorl %ecx, %ecx ; MCU-NEXT: orl %edx, %eax -; MCU-NEXT: orl {{[0-9]+}}(%esp), %edx +; MCU-NEXT: sete %cl +; MCU-NEXT: negl %ecx +; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax +; MCU-NEXT: orl %ecx, %eax +; MCU-NEXT: orl {{[0-9]+}}(%esp), %ecx +; MCU-NEXT: movl %ecx, %edx ; MCU-NEXT: retl %cmp = icmp eq i64 %x, 0 %A = sext i1 %cmp to i64 @@ -767,13 +767,13 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; ; MCU-LABEL: test10: ; MCU: # %bb.0: -; MCU-NEXT: movl %edx, %ecx -; MCU-NEXT: xorl %edx, %edx -; MCU-NEXT: orl %ecx, %eax -; MCU-NEXT: sete %dl -; MCU-NEXT: negl %edx -; MCU-NEXT: movl %edx, %eax +; MCU-NEXT: xorl %ecx, %ecx +; MCU-NEXT: orl %edx, %eax +; MCU-NEXT: sete %cl +; MCU-NEXT: negl %ecx +; MCU-NEXT: movl %ecx, %eax ; MCU-NEXT: orl $1, %eax +; MCU-NEXT: movl %ecx, %edx ; MCU-NEXT: retl %cmp = icmp eq i64 %x, 0 %cond = select i1 %cmp, i64 -1, i64 1 @@ -939,13 +939,13 @@ define i64 @eqzero_all_ones_or_const(i64 %x) { ; ; MCU-LABEL: eqzero_all_ones_or_const: ; MCU: # %bb.0: -; MCU-NEXT: movl %edx, %ecx -; MCU-NEXT: xorl %edx, %edx -; MCU-NEXT: orl %ecx, %eax -; MCU-NEXT: sete %dl -; MCU-NEXT: negl %edx -; MCU-NEXT: movl %edx, %eax +; MCU-NEXT: xorl %ecx, %ecx +; MCU-NEXT: orl %edx, %eax +; MCU-NEXT: sete %cl +; MCU-NEXT: negl %ecx +; MCU-NEXT: movl %ecx, %eax ; MCU-NEXT: orl $42, %eax +; MCU-NEXT: movl %ecx, %edx ; MCU-NEXT: retl %z = icmp eq i64 %x, 0 %r = select i1 %z, i64 -1, i64 42 @@ -1155,12 +1155,12 @@ define i64 @test16(i64 %x) nounwind uwtable readnone ssp { ; ; MCU-LABEL: test16: ; MCU: # %bb.0: # %entry -; MCU-NEXT: movl %eax, %ecx -; MCU-NEXT: xorl %eax, %eax -; MCU-NEXT: orl %edx, %ecx -; MCU-NEXT: setne %al -; MCU-NEXT: negl %eax -; MCU-NEXT: movl %eax, %edx +; MCU-NEXT: xorl %ecx, %ecx +; MCU-NEXT: orl %edx, %eax +; MCU-NEXT: setne %cl +; MCU-NEXT: negl %ecx +; MCU-NEXT: movl %ecx, %eax +; MCU-NEXT: movl %ecx, %edx ; MCU-NEXT: retl entry: %cmp = icmp ne i64 %x, 0 diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index 61254d5e5c2f48..3965f5bc60ffd2 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -236,165 +236,165 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { ; SSE2-LABEL: ne_i512: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rdx +; SSE2-NEXT: movq %xmm8, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rsi +; SSE2-NEXT: movq %xmm8, %rcx ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rdi +; SSE2-NEXT: movq %xmm8, %rdx ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %r8 -; SSE2-NEXT: movq %xmm0, %r9 -; SSE2-NEXT: movq %xmm2, %r10 -; SSE2-NEXT: movq %xmm1, %rcx -; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: movq %xmm8, %rsi +; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: movq %xmm2, %r8 +; SSE2-NEXT: movq %xmm1, %r9 +; SSE2-NEXT: movq %xmm3, %r10 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %r11 -; SSE2-NEXT: xorq %rdx, %r11 +; SSE2-NEXT: xorq %rax, %r11 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: 
xorq %rsi, %rdx -; SSE2-NEXT: orq %r11, %rdx +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorq %rcx, %rax +; SSE2-NEXT: orq %r11, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: xorq %rdi, %rsi +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: xorq %rdx, %rcx ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rsi, %rdi -; SSE2-NEXT: orq %rdx, %rdi -; SSE2-NEXT: movq %xmm4, %rdx -; SSE2-NEXT: xorq %r9, %rdx -; SSE2-NEXT: movq %xmm6, %rsi +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: xorq %rsi, %rdx +; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: movq %xmm4, %rax +; SSE2-NEXT: xorq %rdi, %rax +; SSE2-NEXT: movq %xmm6, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: xorq %r9, %rax +; SSE2-NEXT: movq %xmm7, %rsi ; SSE2-NEXT: xorq %r10, %rsi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: movq %xmm5, %rdx -; SSE2-NEXT: xorq %rcx, %rdx -; SSE2-NEXT: movq %xmm7, %rcx -; SSE2-NEXT: xorq %rax, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: orq %rax, %rsi +; SSE2-NEXT: orq %rcx, %rsi ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdi, %rcx +; SSE2-NEXT: orq %rdx, %rsi ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: ne_i512: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %xmm2, %rdx -; SSE41-NEXT: movq %xmm1, %rsi -; SSE41-NEXT: movq %xmm3, %rdi -; SSE41-NEXT: pextrq $1, %xmm0, %r8 -; SSE41-NEXT: pextrq $1, %xmm2, %r9 -; SSE41-NEXT: pextrq $1, %xmm1, %r10 -; SSE41-NEXT: pextrq $1, %xmm3, %rax +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: movq %xmm2, %rcx +; SSE41-NEXT: movq %xmm1, %rdx +; SSE41-NEXT: movq %xmm3, %rsi +; SSE41-NEXT: pextrq $1, %xmm0, %rdi +; SSE41-NEXT: pextrq $1, %xmm2, %r8 +; SSE41-NEXT: pextrq $1, %xmm1, %r9 +; SSE41-NEXT: pextrq $1, %xmm3, %r10 ; SSE41-NEXT: movq %xmm4, %r11 -; SSE41-NEXT: xorq %rcx, %r11 -; SSE41-NEXT: movq %xmm6, %rcx +; SSE41-NEXT: xorq %rax, %r11 +; SSE41-NEXT: movq %xmm6, %rax +; SSE41-NEXT: xorq %rcx, %rax +; SSE41-NEXT: orq %r11, %rax +; SSE41-NEXT: movq %xmm5, %rcx ; SSE41-NEXT: xorq %rdx, %rcx -; SSE41-NEXT: orq %r11, %rcx -; SSE41-NEXT: movq %xmm5, %rdx +; SSE41-NEXT: movq %xmm7, %rdx ; SSE41-NEXT: xorq %rsi, %rdx -; SSE41-NEXT: movq %xmm7, %rsi -; SSE41-NEXT: xorq %rdi, %rsi -; SSE41-NEXT: orq %rdx, %rsi -; SSE41-NEXT: orq %rcx, %rsi -; SSE41-NEXT: pextrq $1, %xmm4, %rcx -; SSE41-NEXT: xorq %r8, %rcx -; SSE41-NEXT: pextrq $1, %xmm6, %rdx -; SSE41-NEXT: xorq %r9, %rdx ; SSE41-NEXT: orq %rcx, %rdx -; SSE41-NEXT: pextrq $1, %xmm5, %rcx -; SSE41-NEXT: xorq %r10, %rcx -; SSE41-NEXT: pextrq $1, %xmm7, %rdi -; SSE41-NEXT: xorq %rax, %rdi -; SSE41-NEXT: orq %rcx, %rdi -; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: orq %rax, %rdx +; SSE41-NEXT: pextrq $1, %xmm4, %rax +; SSE41-NEXT: xorq %rdi, %rax +; SSE41-NEXT: pextrq $1, %xmm6, %rcx +; SSE41-NEXT: xorq %r8, %rcx +; SSE41-NEXT: orq %rax, %rcx +; SSE41-NEXT: pextrq $1, %xmm5, %rax +; SSE41-NEXT: xorq %r9, %rax +; SSE41-NEXT: pextrq $1, %xmm7, %rsi +; SSE41-NEXT: xorq %r10, %rsi +; SSE41-NEXT: orq %rax, %rsi +; SSE41-NEXT: orq %rcx, %rsi ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rsi, %rdi +; SSE41-NEXT: orq %rdx, %rsi ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: ne_i512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm0, %rdx -; AVX1-NEXT: vmovq %xmm1, %rsi +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovq %xmm1, %rcx ; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rdi +; AVX1-NEXT: vmovq %xmm4, %rdx ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vmovq %xmm5, %r8 -; AVX1-NEXT: vpextrq $1, %xmm0, %r9 -; AVX1-NEXT: vpextrq $1, %xmm1, %r10 -; AVX1-NEXT: vpextrq $1, %xmm4, %rcx -; AVX1-NEXT: vpextrq $1, %xmm5, %rax +; AVX1-NEXT: vmovq %xmm5, %rsi +; AVX1-NEXT: vpextrq $1, %xmm0, %rdi +; AVX1-NEXT: vpextrq $1, %xmm1, %r8 +; AVX1-NEXT: vpextrq $1, %xmm4, %r9 +; AVX1-NEXT: vpextrq $1, %xmm5, %r10 ; AVX1-NEXT: vmovq %xmm2, %r11 -; AVX1-NEXT: xorq %rdx, %r11 -; AVX1-NEXT: vmovq %xmm3, %rdx -; AVX1-NEXT: xorq %rsi, %rdx -; AVX1-NEXT: orq %r11, %rdx +; AVX1-NEXT: xorq %rax, %r11 +; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: orq %r11, %rax ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rsi -; AVX1-NEXT: xorq %rdi, %rsi +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: xorq %rdx, %rcx ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rdi -; AVX1-NEXT: xorq %r8, %rdi -; AVX1-NEXT: orq %rsi, %rdi -; AVX1-NEXT: orq %rdx, %rdi -; AVX1-NEXT: vpextrq $1, %xmm2, %rdx -; AVX1-NEXT: xorq %r9, %rdx -; AVX1-NEXT: vpextrq $1, %xmm3, %rsi +; AVX1-NEXT: vmovq %xmm1, %rdx +; AVX1-NEXT: xorq %rsi, %rdx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: orq %rax, %rdx +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: xorq %rdi, %rax +; AVX1-NEXT: vpextrq $1, %xmm3, %rcx +; AVX1-NEXT: xorq %r8, %rcx +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: xorq %r9, %rax +; AVX1-NEXT: vpextrq $1, %xmm1, %rsi ; AVX1-NEXT: xorq %r10, %rsi -; AVX1-NEXT: orq %rdx, %rsi -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: xorq %rcx, %rdx -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: orq %rdx, %rcx -; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: orq %rax, %rsi +; AVX1-NEXT: orq %rcx, %rsi ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rdi, %rcx +; AVX1-NEXT: orq %rdx, %rsi ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: ne_i512: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq %xmm0, %rdx -; AVX2-NEXT: vmovq %xmm1, %rsi +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rdi +; AVX2-NEXT: vmovq %xmm4, %rdx ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, %r8 -; AVX2-NEXT: vpextrq $1, %xmm0, %r9 -; AVX2-NEXT: vpextrq $1, %xmm1, %r10 -; AVX2-NEXT: vpextrq $1, %xmm4, %rcx -; AVX2-NEXT: vpextrq $1, %xmm5, %rax +; AVX2-NEXT: vmovq %xmm5, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r8 +; AVX2-NEXT: vpextrq $1, %xmm4, %r9 +; AVX2-NEXT: vpextrq $1, %xmm5, %r10 ; AVX2-NEXT: vmovq %xmm2, %r11 -; AVX2-NEXT: xorq %rdx, %r11 -; AVX2-NEXT: vmovq %xmm3, %rdx -; AVX2-NEXT: xorq %rsi, %rdx -; AVX2-NEXT: orq %r11, %rdx +; AVX2-NEXT: xorq %rax, %r11 +; AVX2-NEXT: vmovq %xmm3, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: orq %r11, %rax ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rsi -; AVX2-NEXT: xorq %rdi, %rsi +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: xorq %rdx, %rcx ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: xorq %r8, %rdi -; AVX2-NEXT: orq %rsi, %rdi -; AVX2-NEXT: orq %rdx, %rdi -; AVX2-NEXT: vpextrq $1, %xmm2, %rdx -; AVX2-NEXT: xorq %r9, %rdx -; AVX2-NEXT: vpextrq $1, %xmm3, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: xorq %rsi, %rdx +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: 
orq %rax, %rdx +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: xorq %rdi, %rax +; AVX2-NEXT: vpextrq $1, %xmm3, %rcx +; AVX2-NEXT: xorq %r8, %rcx +; AVX2-NEXT: orq %rax, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: xorq %r9, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi ; AVX2-NEXT: xorq %r10, %rsi -; AVX2-NEXT: orq %rdx, %rsi -; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: xorq %rcx, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: xorq %rax, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: orq %rax, %rsi +; AVX2-NEXT: orq %rcx, %rsi ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rcx +; AVX2-NEXT: orq %rdx, %rsi ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -418,165 +418,165 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) { ; SSE2-LABEL: eq_i512: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rdx +; SSE2-NEXT: movq %xmm8, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rsi +; SSE2-NEXT: movq %xmm8, %rcx ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rdi +; SSE2-NEXT: movq %xmm8, %rdx ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %r8 -; SSE2-NEXT: movq %xmm0, %r9 -; SSE2-NEXT: movq %xmm2, %r10 -; SSE2-NEXT: movq %xmm1, %rcx -; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: movq %xmm8, %rsi +; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: movq %xmm2, %r8 +; SSE2-NEXT: movq %xmm1, %r9 +; SSE2-NEXT: movq %xmm3, %r10 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %r11 -; SSE2-NEXT: xorq %rdx, %r11 +; SSE2-NEXT: xorq %rax, %r11 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: orq %r11, %rdx +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorq %rcx, %rax +; SSE2-NEXT: orq %r11, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: xorq %rdi, %rsi +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: xorq %rdx, %rcx ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rsi, %rdi -; SSE2-NEXT: orq %rdx, %rdi -; SSE2-NEXT: movq %xmm4, %rdx -; SSE2-NEXT: xorq %r9, %rdx -; SSE2-NEXT: movq %xmm6, %rsi +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: xorq %rsi, %rdx +; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: movq %xmm4, %rax +; SSE2-NEXT: xorq %rdi, %rax +; SSE2-NEXT: movq %xmm6, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: xorq %r9, %rax +; SSE2-NEXT: movq %xmm7, %rsi ; SSE2-NEXT: xorq %r10, %rsi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: movq %xmm5, %rdx -; SSE2-NEXT: xorq %rcx, %rdx -; SSE2-NEXT: movq %xmm7, %rcx -; SSE2-NEXT: xorq %rax, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: orq %rax, %rsi +; SSE2-NEXT: orq %rcx, %rsi ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdi, %rcx +; SSE2-NEXT: orq %rdx, %rsi ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: eq_i512: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %xmm2, %rdx -; SSE41-NEXT: movq %xmm1, %rsi -; SSE41-NEXT: movq %xmm3, %rdi -; SSE41-NEXT: pextrq $1, %xmm0, %r8 -; SSE41-NEXT: pextrq $1, %xmm2, %r9 -; SSE41-NEXT: pextrq $1, %xmm1, %r10 -; SSE41-NEXT: pextrq $1, %xmm3, %rax +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: movq %xmm2, %rcx +; SSE41-NEXT: movq %xmm1, %rdx 
+; SSE41-NEXT: movq %xmm3, %rsi +; SSE41-NEXT: pextrq $1, %xmm0, %rdi +; SSE41-NEXT: pextrq $1, %xmm2, %r8 +; SSE41-NEXT: pextrq $1, %xmm1, %r9 +; SSE41-NEXT: pextrq $1, %xmm3, %r10 ; SSE41-NEXT: movq %xmm4, %r11 -; SSE41-NEXT: xorq %rcx, %r11 -; SSE41-NEXT: movq %xmm6, %rcx +; SSE41-NEXT: xorq %rax, %r11 +; SSE41-NEXT: movq %xmm6, %rax +; SSE41-NEXT: xorq %rcx, %rax +; SSE41-NEXT: orq %r11, %rax +; SSE41-NEXT: movq %xmm5, %rcx ; SSE41-NEXT: xorq %rdx, %rcx -; SSE41-NEXT: orq %r11, %rcx -; SSE41-NEXT: movq %xmm5, %rdx +; SSE41-NEXT: movq %xmm7, %rdx ; SSE41-NEXT: xorq %rsi, %rdx -; SSE41-NEXT: movq %xmm7, %rsi -; SSE41-NEXT: xorq %rdi, %rsi -; SSE41-NEXT: orq %rdx, %rsi -; SSE41-NEXT: orq %rcx, %rsi -; SSE41-NEXT: pextrq $1, %xmm4, %rcx -; SSE41-NEXT: xorq %r8, %rcx -; SSE41-NEXT: pextrq $1, %xmm6, %rdx -; SSE41-NEXT: xorq %r9, %rdx ; SSE41-NEXT: orq %rcx, %rdx -; SSE41-NEXT: pextrq $1, %xmm5, %rcx -; SSE41-NEXT: xorq %r10, %rcx -; SSE41-NEXT: pextrq $1, %xmm7, %rdi -; SSE41-NEXT: xorq %rax, %rdi -; SSE41-NEXT: orq %rcx, %rdi -; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: orq %rax, %rdx +; SSE41-NEXT: pextrq $1, %xmm4, %rax +; SSE41-NEXT: xorq %rdi, %rax +; SSE41-NEXT: pextrq $1, %xmm6, %rcx +; SSE41-NEXT: xorq %r8, %rcx +; SSE41-NEXT: orq %rax, %rcx +; SSE41-NEXT: pextrq $1, %xmm5, %rax +; SSE41-NEXT: xorq %r9, %rax +; SSE41-NEXT: pextrq $1, %xmm7, %rsi +; SSE41-NEXT: xorq %r10, %rsi +; SSE41-NEXT: orq %rax, %rsi +; SSE41-NEXT: orq %rcx, %rsi ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rsi, %rdi +; SSE41-NEXT: orq %rdx, %rsi ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: eq_i512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm0, %rdx -; AVX1-NEXT: vmovq %xmm1, %rsi +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovq %xmm1, %rcx ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rdi +; AVX1-NEXT: vmovq %xmm4, %rdx ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vmovq %xmm5, %r8 -; AVX1-NEXT: vpextrq $1, %xmm0, %r9 -; AVX1-NEXT: vpextrq $1, %xmm1, %r10 -; AVX1-NEXT: vpextrq $1, %xmm4, %rcx -; AVX1-NEXT: vpextrq $1, %xmm5, %rax +; AVX1-NEXT: vmovq %xmm5, %rsi +; AVX1-NEXT: vpextrq $1, %xmm0, %rdi +; AVX1-NEXT: vpextrq $1, %xmm1, %r8 +; AVX1-NEXT: vpextrq $1, %xmm4, %r9 +; AVX1-NEXT: vpextrq $1, %xmm5, %r10 ; AVX1-NEXT: vmovq %xmm2, %r11 -; AVX1-NEXT: xorq %rdx, %r11 -; AVX1-NEXT: vmovq %xmm3, %rdx -; AVX1-NEXT: xorq %rsi, %rdx -; AVX1-NEXT: orq %r11, %rdx +; AVX1-NEXT: xorq %rax, %r11 +; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: orq %r11, %rax ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rsi -; AVX1-NEXT: xorq %rdi, %rsi +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: xorq %rdx, %rcx ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rdi -; AVX1-NEXT: xorq %r8, %rdi -; AVX1-NEXT: orq %rsi, %rdi -; AVX1-NEXT: orq %rdx, %rdi -; AVX1-NEXT: vpextrq $1, %xmm2, %rdx -; AVX1-NEXT: xorq %r9, %rdx -; AVX1-NEXT: vpextrq $1, %xmm3, %rsi +; AVX1-NEXT: vmovq %xmm1, %rdx +; AVX1-NEXT: xorq %rsi, %rdx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: orq %rax, %rdx +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: xorq %rdi, %rax +; AVX1-NEXT: vpextrq $1, %xmm3, %rcx +; AVX1-NEXT: xorq %r8, %rcx +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: xorq %r9, %rax +; AVX1-NEXT: vpextrq $1, %xmm1, %rsi ; AVX1-NEXT: xorq %r10, %rsi -; AVX1-NEXT: orq %rdx, %rsi -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: xorq %rcx, %rdx -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; 
AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: orq %rdx, %rcx -; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: orq %rax, %rsi +; AVX1-NEXT: orq %rcx, %rsi ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rdi, %rcx +; AVX1-NEXT: orq %rdx, %rsi ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: eq_i512: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq %xmm0, %rdx -; AVX2-NEXT: vmovq %xmm1, %rsi +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rdi +; AVX2-NEXT: vmovq %xmm4, %rdx ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, %r8 -; AVX2-NEXT: vpextrq $1, %xmm0, %r9 -; AVX2-NEXT: vpextrq $1, %xmm1, %r10 -; AVX2-NEXT: vpextrq $1, %xmm4, %rcx -; AVX2-NEXT: vpextrq $1, %xmm5, %rax +; AVX2-NEXT: vmovq %xmm5, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r8 +; AVX2-NEXT: vpextrq $1, %xmm4, %r9 +; AVX2-NEXT: vpextrq $1, %xmm5, %r10 ; AVX2-NEXT: vmovq %xmm2, %r11 -; AVX2-NEXT: xorq %rdx, %r11 -; AVX2-NEXT: vmovq %xmm3, %rdx -; AVX2-NEXT: xorq %rsi, %rdx -; AVX2-NEXT: orq %r11, %rdx +; AVX2-NEXT: xorq %rax, %r11 +; AVX2-NEXT: vmovq %xmm3, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: orq %r11, %rax ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rsi -; AVX2-NEXT: xorq %rdi, %rsi +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: xorq %rdx, %rcx ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: xorq %r8, %rdi -; AVX2-NEXT: orq %rsi, %rdi -; AVX2-NEXT: orq %rdx, %rdi -; AVX2-NEXT: vpextrq $1, %xmm2, %rdx -; AVX2-NEXT: xorq %r9, %rdx -; AVX2-NEXT: vpextrq $1, %xmm3, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: xorq %rsi, %rdx +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: xorq %rdi, %rax +; AVX2-NEXT: vpextrq $1, %xmm3, %rcx +; AVX2-NEXT: xorq %r8, %rcx +; AVX2-NEXT: orq %rax, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: xorq %r9, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi ; AVX2-NEXT: xorq %r10, %rsi -; AVX2-NEXT: orq %rdx, %rsi -; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: xorq %rcx, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: xorq %rax, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: orq %rax, %rsi +; AVX2-NEXT: orq %rcx, %rsi ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rcx +; AVX2-NEXT: orq %rdx, %rsi ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1366,31 +1366,31 @@ define i1 @eq_i512_op(i512 %a, i512 %b) { ; ; AVXANY-LABEL: eq_i512_op: ; AVXANY: # %bb.0: -; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVXANY-NEXT: addq $1, %rdi ; AVXANY-NEXT: adcq $0, %rsi ; AVXANY-NEXT: adcq $0, %rdx ; AVXANY-NEXT: adcq $0, %rcx ; AVXANY-NEXT: adcq $0, %r8 ; AVXANY-NEXT: adcq $0, %r9 -; AVXANY-NEXT: adcq $0, %r10 ; AVXANY-NEXT: adcq $0, %rax +; AVXANY-NEXT: adcq $0, %r10 ; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi ; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 ; AVXANY-NEXT: orq %rsi, %r9 ; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; AVXANY-NEXT: orq %rcx, %rax -; AVXANY-NEXT: orq %r9, %rax -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx ; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; AVXANY-NEXT: orq %rdx, %r10 +; AVXANY-NEXT: orq %rcx, %r10 +; AVXANY-NEXT: orq %r9, %r10 +; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; AVXANY-NEXT: xorq 
{{[0-9]+}}(%rsp), %rax +; AVXANY-NEXT: orq %rdx, %rax ; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 ; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi ; AVXANY-NEXT: orq %r8, %rdi -; AVXANY-NEXT: orq %r10, %rdi ; AVXANY-NEXT: orq %rax, %rdi +; AVXANY-NEXT: orq %r10, %rdi ; AVXANY-NEXT: sete %al ; AVXANY-NEXT: retq %a2 = add i512 %a, 1 diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll index 65e3c1f0633d79..f624cdbdff38d2 100644 --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -571,50 +571,50 @@ define <8 x i32> @PR63946(<8 x i32> %a0, <8 x i32> %b0) nounwind { ; SSE-LABEL: PR63946: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,2,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,2,3,0] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,2,3,0] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,0,1,2] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,0,1,2] -; SSE-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[3,0,1,2] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pcmpeqd %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm11 ; SSE-NEXT: pcmpeqd %xmm4, %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pcmpeqd %xmm4, %xmm15 -; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: pcmpeqd %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pcmpeqd %xmm4, %xmm15 ; SSE-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE-NEXT: pcmpeqd %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE-NEXT: pcmpeqd %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pcmpeqd %xmm4, %xmm7 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: pcmpeqd %xmm13, %xmm12 +; SSE-NEXT: pcmpeqd %xmm0, %xmm12 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: pcmpeqd %xmm13, %xmm10 -; SSE-NEXT: pcmpeqd %xmm13, %xmm0 -; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: pcmpeqd %xmm0, %xmm13 +; SSE-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE-NEXT: por %xmm14, %xmm2 ; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: pcmpeqd %xmm13, %xmm3 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: por %xmm14, %xmm7 -; SSE-NEXT: pcmpeqd %xmm13, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: pcmpeqd %xmm13, %xmm5 +; SSE-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: por %xmm15, %xmm6 +; SSE-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: por %xmm2, %xmm8 ; SSE-NEXT: packssdw %xmm8, %xmm5 -; SSE-NEXT: pcmpeqd %xmm13, %xmm1 -; SSE-NEXT: packssdw %xmm6, %xmm1 +; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: packssdw %xmm7, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] @@ -666,16 +666,16 @@ define <8 x i32> @PR63946(<8 x i32> %a0, <8 x i32> %b0) nounwind { ; 
AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] ; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: vpcmpeqd %zmm0, %zmm2, %k1 -; AVX512-NEXT: vpcmpeqd %zmm0, %zmm3, %k3 -; AVX512-NEXT: vpcmpeqd %zmm0, %zmm4, %k2 +; AVX512-NEXT: vpcmpeqd %zmm0, %zmm3, %k2 +; AVX512-NEXT: vpcmpeqd %zmm0, %zmm4, %k3 ; AVX512-NEXT: vpcmpeqd %zmm0, %zmm5, %k4 ; AVX512-NEXT: vpcmpeqd %zmm0, %zmm6, %k5 ; AVX512-NEXT: vpcmpeqd %zmm0, %zmm7, %k6 ; AVX512-NEXT: vpcmpeqd %zmm0, %zmm8, %k7 ; AVX512-NEXT: korw %k0, %k1, %k0 -; AVX512-NEXT: korw %k3, %k0, %k0 -; AVX512-NEXT: korw %k4, %k0, %k0 ; AVX512-NEXT: korw %k2, %k0, %k0 +; AVX512-NEXT: korw %k4, %k0, %k0 +; AVX512-NEXT: korw %k3, %k0, %k0 ; AVX512-NEXT: korw %k5, %k0, %k0 ; AVX512-NEXT: korw %k6, %k0, %k0 ; AVX512-NEXT: korw %k7, %k0, %k1 diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll index c89db15d12f45d..5842f913b67520 100644 --- a/llvm/test/CodeGen/X86/shift-amount-mod.ll +++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll @@ -653,20 +653,20 @@ define void @store64_ashr_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind ; X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movb $64, %cl ; X32-NEXT: subb {{[0-9]+}}(%esp), %cl -; X32-NEXT: movl %edi, %esi -; X32-NEXT: sarl %cl, %esi -; X32-NEXT: shrdl %cl, %edi, %edx +; X32-NEXT: movl %esi, %edi +; X32-NEXT: sarl %cl, %edi +; X32-NEXT: shrdl %cl, %esi, %edx ; X32-NEXT: testb $32, %cl ; X32-NEXT: je .LBB22_2 ; X32-NEXT: # %bb.1: -; X32-NEXT: sarl $31, %edi -; X32-NEXT: movl %esi, %edx -; X32-NEXT: movl %edi, %esi +; X32-NEXT: sarl $31, %esi +; X32-NEXT: movl %edi, %edx +; X32-NEXT: movl %esi, %edi ; X32-NEXT: .LBB22_2: -; X32-NEXT: movl %esi, 4(%eax) +; X32-NEXT: movl %edi, 4(%eax) ; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll index f6d73b1fbc6e7c..3cb680396b6ba3 100644 --- a/llvm/test/CodeGen/X86/shift-and.ll +++ b/llvm/test/CodeGen/X86/shift-and.ll @@ -168,20 +168,22 @@ define void @t5ptr(i64 %t, ptr %ptr) nounwind { define i64 @t6(i64 %key, ptr nocapture %val) nounwind { ; X32-LABEL: t6: ; X32: # %bb.0: +; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shrdl $3, %eax, %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $3, %esi -; X32-NEXT: movl (%edx), %eax -; X32-NEXT: movl 4(%edx), %edx +; X32-NEXT: shrdl $3, %eax, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: shrl $3, %edi +; X32-NEXT: movl (%ecx), %eax +; X32-NEXT: movl 4(%ecx), %edx ; X32-NEXT: addl $-1, %eax ; X32-NEXT: adcl $-1, %edx -; X32-NEXT: andl %ecx, %eax -; X32-NEXT: andl %esi, %edx +; X32-NEXT: andl %esi, %eax +; X32-NEXT: andl %edi, %edx ; X32-NEXT: popl %esi +; X32-NEXT: popl %edi ; X32-NEXT: retl ; ; X64-LABEL: t6: diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 1fe8d834dbcddb..7d3e6abc045907 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -31,26 +31,26 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: andb $7, %al ; i686-NEXT: shrb $3, %cl ; i686-NEXT: andb $15, %cl -; i686-NEXT: movzbl %cl, %ebp -; i686-NEXT: movl 4(%esp,%ebp), 
%edx -; i686-NEXT: movl %edx, %esi +; i686-NEXT: movzbl %cl, %ebx +; i686-NEXT: movl 4(%esp,%ebx), %esi +; i686-NEXT: movl %esi, %ebp ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrl %cl, %esi +; i686-NEXT: shrl %cl, %ebp ; i686-NEXT: notb %cl -; i686-NEXT: movl 8(%esp,%ebp), %ebx -; i686-NEXT: leal (%ebx,%ebx), %edi +; i686-NEXT: movl 8(%esp,%ebx), %edx +; i686-NEXT: leal (%edx,%edx), %edi ; i686-NEXT: shll %cl, %edi -; i686-NEXT: orl %esi, %edi -; i686-NEXT: movl (%esp,%ebp), %esi -; i686-NEXT: movl 12(%esp,%ebp), %ebp +; i686-NEXT: orl %ebp, %edi +; i686-NEXT: movl (%esp,%ebx), %ebp +; i686-NEXT: movl 12(%esp,%ebx), %ebx ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %ebp, %ebx -; i686-NEXT: shrdl %cl, %edx, %esi -; i686-NEXT: shrl %cl, %ebp +; i686-NEXT: shrdl %cl, %ebx, %edx +; i686-NEXT: shrdl %cl, %esi, %ebp +; i686-NEXT: shrl %cl, %ebx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl %ebp, 12(%eax) -; i686-NEXT: movl %ebx, 8(%eax) -; i686-NEXT: movl %esi, (%eax) +; i686-NEXT: movl %ebx, 12(%eax) +; i686-NEXT: movl %edx, 8(%eax) +; i686-NEXT: movl %ebp, (%eax) ; i686-NEXT: movl %edi, 4(%eax) ; i686-NEXT: addl $32, %esp ; i686-NEXT: popl %esi @@ -103,26 +103,26 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: andb $7, %al ; i686-NEXT: shrb $3, %cl ; i686-NEXT: andb $15, %cl -; i686-NEXT: movzbl %cl, %ebp -; i686-NEXT: movl 4(%esp,%ebp), %edx -; i686-NEXT: movl %edx, %esi +; i686-NEXT: movzbl %cl, %ebx +; i686-NEXT: movl 4(%esp,%ebx), %esi +; i686-NEXT: movl %esi, %ebp ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrl %cl, %esi +; i686-NEXT: shrl %cl, %ebp ; i686-NEXT: notb %cl -; i686-NEXT: movl 8(%esp,%ebp), %ebx -; i686-NEXT: leal (%ebx,%ebx), %edi +; i686-NEXT: movl 8(%esp,%ebx), %edx +; i686-NEXT: leal (%edx,%edx), %edi ; i686-NEXT: shll %cl, %edi -; i686-NEXT: orl %esi, %edi -; i686-NEXT: movl (%esp,%ebp), %esi -; i686-NEXT: movl 12(%esp,%ebp), %ebp +; i686-NEXT: orl %ebp, %edi +; i686-NEXT: movl (%esp,%ebx), %ebp +; i686-NEXT: movl 12(%esp,%ebx), %ebx ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %ebp, %ebx -; i686-NEXT: shrdl %cl, %edx, %esi -; i686-NEXT: sarl %cl, %ebp +; i686-NEXT: shrdl %cl, %ebx, %edx +; i686-NEXT: shrdl %cl, %esi, %ebp +; i686-NEXT: sarl %cl, %ebx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl %ebp, 12(%eax) -; i686-NEXT: movl %ebx, 8(%eax) -; i686-NEXT: movl %esi, (%eax) +; i686-NEXT: movl %ebx, 12(%eax) +; i686-NEXT: movl %edx, 8(%eax) +; i686-NEXT: movl %ebp, (%eax) ; i686-NEXT: movl %edi, 4(%eax) ; i686-NEXT: addl $32, %esp ; i686-NEXT: popl %esi @@ -160,12 +160,12 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: subl $32, %esp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %edx ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi +; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx +; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -177,29 +177,29 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: andb $15, %cl ; i686-NEXT: negb %cl ; i686-NEXT: movsbl %cl, %ebp -; i686-NEXT: movl 24(%esp,%ebp), %ebx -; i686-NEXT: movl %ebx, %edx +; i686-NEXT: movl 24(%esp,%ebp), %edx +; 
i686-NEXT: movl %edx, %ebx ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shll %cl, %edx +; i686-NEXT: shll %cl, %ebx ; i686-NEXT: notb %cl ; i686-NEXT: movl 20(%esp,%ebp), %edi ; i686-NEXT: movl %edi, %esi ; i686-NEXT: shrl %esi ; i686-NEXT: shrl %cl, %esi -; i686-NEXT: orl %edx, %esi -; i686-NEXT: movl 16(%esp,%ebp), %edx +; i686-NEXT: orl %ebx, %esi +; i686-NEXT: movl 16(%esp,%ebp), %ebx ; i686-NEXT: movl 28(%esp,%ebp), %ebp ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shldl %cl, %ebx, %ebp +; i686-NEXT: shldl %cl, %edx, %ebp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl %ebp, 12(%ecx) -; i686-NEXT: movl %edx, %ebx +; i686-NEXT: movl %ebx, %edx ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shll %cl, %ebx -; i686-NEXT: shldl %cl, %edx, %edi +; i686-NEXT: shll %cl, %edx +; i686-NEXT: shldl %cl, %ebx, %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl %edi, 4(%eax) -; i686-NEXT: movl %ebx, (%eax) +; i686-NEXT: movl %edx, (%eax) ; i686-NEXT: movl %esi, 8(%eax) ; i686-NEXT: addl $32, %esp ; i686-NEXT: popl %esi @@ -267,13 +267,13 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: pushl %ebx ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: subl $100, %esp +; i686-NEXT: subl $96, %esp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi +; i686-NEXT: movl {{[0-9]+}}(%esp), %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %edi +; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp ; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp @@ -282,7 +282,7 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp ; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -290,79 +290,81 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl %esi, %ecx +; i686-NEXT: movl %edi, %ecx ; i686-NEXT: andl $7, %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shrl $3, %esi -; i686-NEXT: andl $15, %esi -; i686-NEXT: movl 40(%esp,%esi), %eax +; i686-NEXT: shrl $3, %edi +; i686-NEXT: andl $15, %edi +; i686-NEXT: movl 36(%esp,%edi), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: shrl %cl, %eax ; i686-NEXT: notl %ecx -; i686-NEXT: movl 44(%esp,%esi), %edx +; i686-NEXT: movl 40(%esp,%edi), %edx ; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: addl %edx, %edx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: shll %cl, %edx ; i686-NEXT: orl %eax, %edx ; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 36(%esp,%esi), %eax +; i686-NEXT: movl 32(%esp,%edi), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) +; i686-NEXT: movl %ebx, %eax ; i686-NEXT: movl %ebx, %edx ; i686-NEXT: 
andl $7, %edx -; i686-NEXT: shrl $3, %ebx -; i686-NEXT: andl $15, %ebx -; i686-NEXT: movl 72(%esp,%ebx), %ebp -; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shrl $3, %eax +; i686-NEXT: andl $15, %eax +; i686-NEXT: movl 68(%esp,%eax), %ecx +; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %eax, %esi +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %ecx, %eax ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrl %cl, %ebp +; i686-NEXT: shrl %cl, %eax ; i686-NEXT: movl %edx, %ecx ; i686-NEXT: notl %ecx -; i686-NEXT: movl 76(%esp,%ebx), %eax -; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: leal (%eax,%eax), %edi +; i686-NEXT: movl 72(%esp,%esi), %ebp +; i686-NEXT: leal (%ebp,%ebp), %esi ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: orl %ebp, %edi -; i686-NEXT: movl 48(%esp,%esi), %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: movl 68(%esp,%ebx), %ecx -; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill -; i686-NEXT: movl 80(%esp,%ebx), %esi -; i686-NEXT: movl %edx, %ecx +; i686-NEXT: shll %cl, %esi +; i686-NEXT: orl %eax, %esi +; i686-NEXT: movl 44(%esp,%edi), %edi ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; i686-NEXT: shrdl %cl, %esi, %ebx -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; i686-NEXT: shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; i686-NEXT: shrl %cl, %ebp +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl 64(%esp,%ecx), %eax +; i686-NEXT: movl %eax, (%esp) # 4-byte Spill +; i686-NEXT: movl 76(%esp,%ecx), %eax ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill +; i686-NEXT: shrdl %cl, %eax, %ebp +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; i686-NEXT: shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shrl %cl, %edi ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrl %cl, %esi +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; i686-NEXT: shrdl %cl, %ebx, (%esp) # 4-byte Folded Spill +; i686-NEXT: shrl %cl, %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl %esi, 28(%ecx) -; i686-NEXT: movl %ebx, 24(%ecx) +; i686-NEXT: movl %eax, 28(%ecx) +; i686-NEXT: movl %ebp, 24(%ecx) ; i686-NEXT: movl (%esp), %eax # 4-byte Reload ; i686-NEXT: movl %eax, 16(%ecx) -; i686-NEXT: movl %ebp, 12(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 8(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, (%ecx) -; i686-NEXT: movl %edi, 20(%ecx) +; i686-NEXT: movl %edi, 12(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, 8(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 4-byte Reload +; i686-NEXT: movl %eax, (%ecx) +; i686-NEXT: movl %esi, 20(%ecx) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; i686-NEXT: movl %eax, 4(%ecx) -; i686-NEXT: addl $100, %esp +; i686-NEXT: addl $96, %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi ; i686-NEXT: popl %ebx @@ -407,25 +409,25 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $92, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %edx ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: sarl $31, %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; i686-NEXT: sarl $31, %ebp +; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -435,75 +437,76 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, %ebx -; i686-NEXT: andl $7, %ebx +; i686-NEXT: movl %edi, %eax +; i686-NEXT: andl $7, %eax ; i686-NEXT: shrl $3, %edi ; i686-NEXT: andl $15, %edi -; i686-NEXT: movl 32(%esp,%edi), %eax -; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shrl %cl, %eax -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: notl %ecx -; i686-NEXT: movl 36(%esp,%edi), %edx +; i686-NEXT: movl 32(%esp,%edi), %edx ; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: addl %edx, %edx +; i686-NEXT: movl %eax, %ecx +; i686-NEXT: shrl %cl, %edx +; i686-NEXT: movl %eax, %ecx +; i686-NEXT: notl %ecx +; i686-NEXT: movl 36(%esp,%edi), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: addl %esi, %esi ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shll %cl, %edx -; i686-NEXT: orl %eax, %edx -; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebp, %eax -; i686-NEXT: movl %ebp, %edx +; i686-NEXT: shll %cl, %esi +; i686-NEXT: orl %edx, %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: movl %ebx, %edx ; i686-NEXT: andl $7, %edx -; i686-NEXT: shrl $3, %eax -; i686-NEXT: andl $15, %eax -; i686-NEXT: movl 64(%esp,%eax), %ebp -; 
i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %eax, (%esp) # 4-byte Spill +; i686-NEXT: shrl $3, %ecx +; i686-NEXT: andl $15, %ecx +; i686-NEXT: movl 64(%esp,%ecx), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %ecx, %ebx +; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrl %cl, %ebp +; i686-NEXT: shrl %cl, %esi ; i686-NEXT: movl %edx, %ecx ; i686-NEXT: notl %ecx -; i686-NEXT: movl 68(%esp,%eax), %esi -; i686-NEXT: leal (%esi,%esi), %eax +; i686-NEXT: movl 68(%esp,%ebx), %ebp +; i686-NEXT: leal (%ebp,%ebp), %ebx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shll %cl, %eax -; i686-NEXT: orl %ebp, %eax +; i686-NEXT: shll %cl, %ebx +; i686-NEXT: orl %esi, %ebx ; i686-NEXT: movl 28(%esp,%edi), %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl 40(%esp,%edi), %edi -; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: movl %eax, %ecx ; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: movl (%esp), %ecx # 4-byte Reload -; i686-NEXT: movl 60(%esp,%ecx), %ebp -; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 72(%esp,%ecx), %ebp +; i686-NEXT: movl 60(%esp,%ecx), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 72(%esp,%ecx), %esi ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrdl %cl, %ebp, %esi -; i686-NEXT: movl %esi, (%esp) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: shrdl %cl, %esi, %ebp +; i686-NEXT: movl %ebp, (%esp) # 4-byte Spill +; i686-NEXT: movl %eax, %ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; i686-NEXT: shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: sarl %cl, %edi ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; i686-NEXT: shrdl %cl, %esi, %ebx -; i686-NEXT: movl %edx, %ecx -; i686-NEXT: sarl %cl, %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl %ebp, 28(%ecx) -; i686-NEXT: movl (%esp), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 24(%ecx) -; i686-NEXT: movl %ebx, 16(%ecx) -; i686-NEXT: movl %edi, 12(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 8(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, (%ecx) -; i686-NEXT: movl %eax, 20(%ecx) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: movl %eax, 4(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; i686-NEXT: shrdl %cl, %eax, %ebp +; i686-NEXT: movl %edx, %ecx +; i686-NEXT: sarl %cl, %esi +; i686-NEXT: movl {{[0-9]+}}(%esp), %eax +; i686-NEXT: movl %esi, 28(%eax) +; i686-NEXT: movl (%esp), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 24(%eax) +; i686-NEXT: movl %ebp, 16(%eax) +; i686-NEXT: movl %edi, 12(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 8(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, (%eax) +; i686-NEXT: movl %ebx, 20(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 4(%eax) ; i686-NEXT: addl $92, %esp ; 
i686-NEXT: popl %esi ; i686-NEXT: popl %edi @@ -570,18 +573,19 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ebp, %ecx -; i686-NEXT: shrl $3, %ebp -; i686-NEXT: andl $15, %ebp +; i686-NEXT: movl %ebp, %edi +; i686-NEXT: shrl $3, %edi +; i686-NEXT: andl $15, %edi ; i686-NEXT: leal {{[0-9]+}}(%esp), %eax -; i686-NEXT: subl %ebp, %eax +; i686-NEXT: subl %edi, %eax ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl 8(%eax), %edx -; i686-NEXT: movl %edx, (%esp) # 4-byte Spill +; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: andl $7, %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill ; i686-NEXT: shll %cl, %edx ; i686-NEXT: movl 4(%eax), %esi ; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -604,10 +608,10 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: andl $7, %ebx -; i686-NEXT: movl 8(%esi), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 8(%esi), %ebp +; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shll %cl, %edi +; i686-NEXT: shll %cl, %ebp ; i686-NEXT: movl 4(%esi), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: shrl %eax @@ -615,36 +619,38 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: notl %ecx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: shrl %cl, %eax -; i686-NEXT: orl %edi, %eax +; i686-NEXT: orl %ebp, %eax ; i686-NEXT: movl (%esi), %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl %esi, %edi -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ecx, %edi +; i686-NEXT: movl %esi, %ebp +; i686-NEXT: movl (%esp), %ecx # 4-byte Reload +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shll %cl, %ebp +; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl (%esp), %ecx # 4-byte Reload +; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: negl %ebp -; i686-NEXT: movl 64(%esp,%ebp), %esi -; i686-NEXT: movl %edi, %ecx +; i686-NEXT: negl %edi +; i686-NEXT: movl 64(%esp,%edi), %esi +; i686-NEXT: movl (%esp), %ecx # 4-byte Reload ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: movl (%esp), %edi # 4-byte Reload +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; i686-NEXT: shldl %cl, %edi, %esi ; i686-NEXT: movl %esi, (%esp) # 4-byte Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl %esi, %edi +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; i686-NEXT: movl %ebp, %edi ; i686-NEXT: movl %ebx, %ecx ; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; i686-NEXT: shldl %cl, %esi, %ebp +; i686-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; i686-NEXT: shldl %cl, %ebp, %esi ; i686-NEXT: negl %edx ; i686-NEXT: movl 96(%esp,%edx), %edx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; i686-NEXT: shldl %cl, %ebx, %edx ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl %edx, 28(%ecx) -; i686-NEXT: movl %ebp, 20(%ecx) +; i686-NEXT: movl %esi, 20(%ecx) ; i686-NEXT: movl %edi, 16(%ecx) ; i686-NEXT: movl (%esp), %edx # 4-byte Reload ; i686-NEXT: movl %edx, 12(%ecx) diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll index 0e4e706669300c..7822f58799cd5a 100644 --- a/llvm/test/CodeGen/X86/shift-i256.ll +++ b/llvm/test/CodeGen/X86/shift-i256.ll @@ -14,30 +14,30 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { ; CHECK-NEXT: subl $92, %esp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: sarl $31, %esi -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: sarl $31, %edx +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: andb $7, %al ; CHECK-NEXT: shrb $3, %cl @@ -231,7 +231,7 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: movb %ch, %cl ; CHECK-NEXT: shll %cl, %edx ; CHECK-NEXT: notb %cl -; CHECK-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: movb %cl, (%esp) # 1-byte Spill ; CHECK-NEXT: movl 64(%esp,%eax), %ebp ; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: shrl %ebp @@ -245,7 +245,7 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: movl 72(%esp,%eax), %ebx ; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: shrl %ebx -; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; CHECK-NEXT: movb (%esp), %cl # 1-byte Reload ; CHECK-NEXT: shrl %cl, %ebx ; CHECK-NEXT: orl %edx, %ebx ; CHECK-NEXT: movl 84(%esp,%eax), %esi @@ -255,7 +255,7 @@ define i256 
@shift2(i256 %c) nounwind ; CHECK-NEXT: movl 80(%esp,%eax), %edi ; CHECK-NEXT: movl %edi, %edx ; CHECK-NEXT: shrl %edx -; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; CHECK-NEXT: movb (%esp), %cl # 1-byte Reload ; CHECK-NEXT: shrl %cl, %edx ; CHECK-NEXT: orl %esi, %edx ; CHECK-NEXT: movb %ch, %cl @@ -263,25 +263,28 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; CHECK-NEXT: shldl %cl, %esi, %edi -; CHECK-NEXT: movl 60(%esp,%eax), %ebp -; CHECK-NEXT: movl 88(%esp,%eax), %esi -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: shldl %cl, %eax, %esi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %esi, 28(%eax) -; CHECK-NEXT: movl %edi, 20(%eax) +; CHECK-NEXT: movl 60(%esp,%eax), %esi +; CHECK-NEXT: movl %esi, (%esp) # 4-byte Spill +; CHECK-NEXT: movl 88(%esp,%eax), %eax ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; CHECK-NEXT: movl %esi, 12(%eax) -; CHECK-NEXT: movl %ebp, %esi -; CHECK-NEXT: shll %cl, %esi +; CHECK-NEXT: shldl %cl, %esi, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl %eax, 28(%esi) +; CHECK-NEXT: movl %edi, 20(%esi) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 12(%esi) +; CHECK-NEXT: movl (%esp), %eax # 4-byte Reload +; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movl (%esp), %ebp # 4-byte Reload ; CHECK-NEXT: shldl %cl, %ebp, %edi -; CHECK-NEXT: movl %edi, 4(%eax) -; CHECK-NEXT: movl %esi, (%eax) -; CHECK-NEXT: movl %edx, 24(%eax) -; CHECK-NEXT: movl %ebx, 16(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 8(%eax) +; CHECK-NEXT: movl %edi, 4(%esi) +; CHECK-NEXT: movl %eax, (%esi) +; CHECK-NEXT: movl %edx, 24(%esi) +; CHECK-NEXT: movl %ebx, 16(%esi) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 8(%esi) +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: addl $92, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 524ecf2aece7e7..8cc8b00b6c6598 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -18,20 +18,20 @@ define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; X86-SSE-LABEL: mul_2xi8: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movzwl (%edx,%eax), %edx +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx ; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx -; X86-SSE-NEXT: movd %ecx, %xmm1 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm1, (%esi,%eax,4) +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -178,10 +178,10 @@ define void @mul_8xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; X86-SSE-LABEL: mul_8xi8: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movl c, %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-SSE-NEXT: pxor %xmm2, %xmm2 @@ -191,8 +191,8 @@ define void @mul_8xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: movdqu %xmm1, 16(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -295,27 +295,27 @@ define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: movl c, %ecx -; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm3 -; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0 -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X86-SSE-NEXT: pmullw %xmm4, %xmm2 -; X86-SSE-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; X86-SSE-NEXT: pmullw %xmm3, %xmm0 +; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm0 +; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: movdqu %xmm0, 48(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm4, (%ecx,%eax,4) +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X86-SSE-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm3, %xmm4 +; X86-SSE-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm4, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm3, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -711,26 +711,26 @@ define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i6 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: movl c, %ecx -; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm2 -; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm3 -; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0 -; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE-NEXT: pmulhuw %xmm2, %xmm4 -; X86-SSE-NEXT: pmullw %xmm2, %xmm0 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE-NEXT: pmulhuw %xmm3, %xmm4 -; X86-SSE-NEXT: pmullw %xmm3, %xmm1 -; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm0 +; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm1 +; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm2 +; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm2, (%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -862,14 +862,14 @@ define void @mul_2xi8_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, ; X86-SSE-LABEL: mul_2xi8_sext: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movl c, %ecx -; X86-SSE-NEXT: movzwl (%esi,%eax), %esi -; X86-SSE-NEXT: movd %esi, %xmm0 -; X86-SSE-NEXT: movzwl (%edx,%eax), %edx -; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -877,7 +877,7 @@ define void @mul_2xi8_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -952,14 +952,14 @@ define void @mul_2xi8_sext_zext(ptr nocapture readonly %a, ptr nocapture readonl ; X86-SSE-LABEL: mul_2xi8_sext_zext: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movl c, %ecx -; X86-SSE-NEXT: movzwl (%esi,%eax), %esi -; X86-SSE-NEXT: movd %esi, %xmm0 -; X86-SSE-NEXT: movzwl (%edx,%eax), %edx -; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] @@ -967,7 +967,7 @@ define void @mul_2xi8_sext_zext(ptr nocapture readonly %a, ptr nocapture readonl ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; X86-SSE-NEXT: pmaddwd %xmm1, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1118,10 +1118,10 @@ define void @mul_2xi16_sext_zext(ptr nocapture readonly %a, ptr nocapture readon ; X86-SSE-LABEL: mul_2xi16_sext_zext: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: 
movl c, %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 @@ -1133,7 +1133,7 @@ define void @mul_2xi16_sext_zext(ptr nocapture readonly %a, ptr nocapture readon ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movq %xmm1, (%ecx,%eax,4) +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1208,26 +1208,26 @@ define void @mul_16xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly % ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: movl c, %ecx -; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm2 -; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm3 -; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0 -; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE-NEXT: pmulhw %xmm2, %xmm4 -; X86-SSE-NEXT: pmullw %xmm2, %xmm0 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE-NEXT: pmulhw %xmm3, %xmm4 -; X86-SSE-NEXT: pmullw %xmm3, %xmm1 -; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm0 +; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm1 +; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm2 +; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm2, (%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1978,98 +1978,97 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: pushl %ebx ; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: pushl %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: subl $8, %esp +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movzwl 16(%eax), %edx -; X86-SSE-NEXT: movl %edx, (%esp) 
# 4-byte Spill -; X86-SSE-NEXT: movdqa (%eax), %xmm3 -; X86-SSE-NEXT: movdqa (%ecx), %xmm0 -; X86-SSE-NEXT: movdqa 16(%ecx), %xmm1 -; X86-SSE-NEXT: pxor %xmm4, %xmm4 -; X86-SSE-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE-NEXT: pextrw $7, %xmm3, %eax -; X86-SSE-NEXT: pextrw $4, %xmm3, %edi -; X86-SSE-NEXT: pextrw $0, %xmm3, %ebp -; X86-SSE-NEXT: pextrw $1, %xmm3, %esi -; X86-SSE-NEXT: pextrw $3, %xmm3, %ebx -; X86-SSE-NEXT: movdqa %xmm3, %xmm5 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X86-SSE-NEXT: movd %xmm3, %ecx +; X86-SSE-NEXT: movzwl 16(%eax), %ecx +; X86-SSE-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE-NEXT: movdqa (%eax), %xmm1 +; X86-SSE-NEXT: movdqa (%esi), %xmm0 +; X86-SSE-NEXT: movdqa 16(%esi), %xmm3 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE-NEXT: pextrw $7, %xmm1, %eax +; X86-SSE-NEXT: pextrw $4, %xmm1, %ebp +; X86-SSE-NEXT: pextrw $0, %xmm1, %ebx +; X86-SSE-NEXT: pextrw $1, %xmm1, %edi +; X86-SSE-NEXT: pextrw $3, %xmm1, %ecx +; X86-SSE-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; X86-SSE-NEXT: movd %xmm2, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; X86-SSE-NEXT: movd %xmm4, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; X86-SSE-NEXT: movd %xmm4, %ecx +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; X86-SSE-NEXT: movd %xmm5, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; X86-SSE-NEXT: movd %xmm5, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm4 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X86-SSE-NEXT: movl %edi, %eax +; X86-SSE-NEXT: movd %edx, %xmm5 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; X86-SSE-NEXT: movl %ebp, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: divl 16(%edi) -; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; X86-SSE-NEXT: movd %xmm2, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE-NEXT: movd %xmm1, %ecx +; X86-SSE-NEXT: divl 16(%esi) +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; X86-SSE-NEXT: movd %xmm4, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; X86-SSE-NEXT: movd %xmm3, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X86-SSE-NEXT: movl %ebp, %eax +; X86-SSE-NEXT: movd %edx, %xmm3 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; X86-SSE-NEXT: movl %ebx, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl (%edi) -; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: pshufd 
{{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-SSE-NEXT: movd %xmm2, %ecx -; X86-SSE-NEXT: movl %esi, %eax +; X86-SSE-NEXT: divl (%esi) +; X86-SSE-NEXT: movd %edx, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; X86-SSE-NEXT: movd %xmm4, %ecx +; X86-SSE-NEXT: movl %edi, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X86-SSE-NEXT: movd %xmm2, %ecx -; X86-SSE-NEXT: movl %ebx, %eax +; X86-SSE-NEXT: movd %edx, %xmm4 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; X86-SSE-NEXT: movd %xmm4, %ecx +; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; X86-SSE-NEXT: movd %xmm4, %eax +; X86-SSE-NEXT: movd %edx, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X86-SSE-NEXT: movd %xmm1, %eax ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X86-SSE-NEXT: movd %xmm0, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; X86-SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl 32(%edi) -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm4 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: divl 32(%esi) +; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X86-SSE-NEXT: movl %eax, (%eax) +; X86-SSE-NEXT: movdqa %xmm2, (%eax) ; X86-SSE-NEXT: movdqa %xmm3, (%eax) -; X86-SSE-NEXT: movdqa %xmm0, (%eax) -; X86-SSE-NEXT: addl $4, %esp +; X86-SSE-NEXT: addl $8, %esp ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %edi ; X86-SSE-NEXT: popl %ebx @@ -2204,91 +2203,90 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-SSE-LABEL: PR34947: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movzwl 16(%rdi), %ecx -; X64-SSE-NEXT: movdqa (%rdi), %xmm3 +; X64-SSE-NEXT: movdqa (%rdi), %xmm1 ; X64-SSE-NEXT: movdqa (%rsi), %xmm0 -; X64-SSE-NEXT: movdqa 
16(%rsi), %xmm1 -; X64-SSE-NEXT: pxor %xmm4, %xmm4 -; X64-SSE-NEXT: movdqa %xmm3, %xmm2 -; X64-SSE-NEXT: pextrw $7, %xmm3, %eax -; X64-SSE-NEXT: pextrw $4, %xmm3, %r8d -; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d -; X64-SSE-NEXT: pextrw $1, %xmm3, %edi -; X64-SSE-NEXT: pextrw $3, %xmm3, %r9d -; X64-SSE-NEXT: movdqa %xmm3, %xmm5 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X64-SSE-NEXT: movd %xmm3, %r11d +; X64-SSE-NEXT: movdqa 16(%rsi), %xmm3 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE-NEXT: pextrw $7, %xmm1, %eax +; X64-SSE-NEXT: pextrw $4, %xmm1, %r10d +; X64-SSE-NEXT: pextrw $0, %xmm1, %r9d +; X64-SSE-NEXT: pextrw $1, %xmm1, %r8d +; X64-SSE-NEXT: pextrw $3, %xmm1, %edi +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; X64-SSE-NEXT: movd %xmm2, %r11d ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %r11d -; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; X64-SSE-NEXT: movd %xmm4, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; X64-SSE-NEXT: movd %xmm4, %r11d +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; X64-SSE-NEXT: movd %xmm5, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; X64-SSE-NEXT: movd %xmm5, %r11d ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %r11d -; X64-SSE-NEXT: movd %edx, %xmm4 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X64-SSE-NEXT: movl %r8d, %eax +; X64-SSE-NEXT: movd %edx, %xmm5 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; X64-SSE-NEXT: movl %r10d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 16(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; X64-SSE-NEXT: movd %xmm2, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-SSE-NEXT: movd %xmm1, %r8d +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; X64-SSE-NEXT: movd %xmm4, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; X64-SSE-NEXT: movd %xmm3, %r10d ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %r8d -; X64-SSE-NEXT: movd %edx, %xmm1 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X64-SSE-NEXT: movl %r10d, %eax +; X64-SSE-NEXT: divl %r10d +; X64-SSE-NEXT: movd %edx, %xmm3 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; X64-SSE-NEXT: movl %r9d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl (%rsi) -; X64-SSE-NEXT: movd %edx, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X64-SSE-NEXT: movd %xmm2, %r8d +; X64-SSE-NEXT: movd %edx, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; X64-SSE-NEXT: movd %xmm4, %r9d +; X64-SSE-NEXT: movl %r8d, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %r9d +; X64-SSE-NEXT: movd %edx, %xmm4 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; 
X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; X64-SSE-NEXT: movd %xmm4, %r8d ; X64-SSE-NEXT: movl %edi, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %r8d -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X64-SSE-NEXT: movd %xmm2, %edi -; X64-SSE-NEXT: movl %r9d, %eax -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %edi -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; X64-SSE-NEXT: movd %xmm4, %eax +; X64-SSE-NEXT: movd %edx, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X64-SSE-NEXT: movd %xmm1, %eax ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-SSE-NEXT: movd %xmm0, %edi ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi ; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; X64-SSE-NEXT: movl %ecx, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 32(%rsi) ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X64-SSE-NEXT: movl %eax, (%rax) +; X64-SSE-NEXT: movdqa %xmm2, (%rax) ; X64-SSE-NEXT: movdqa %xmm3, (%rax) -; X64-SSE-NEXT: movdqa %xmm1, (%rax) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: PR34947: diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll index 2d59422953eb3d..3c28e917a06b3a 100644 --- a/llvm/test/CodeGen/X86/smax.ll +++ b/llvm/test/CodeGen/X86/smax.ll @@ -158,17 +158,17 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %ebx, %edx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: sbbl %ebx, %ebp ; X86-NEXT: movl %edi, %ebp ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovll %ebx, %edx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovll %ebx, %esi ; X86-NEXT: cmovll 
{{[0-9]+}}(%esp), %edi ; X86-NEXT: cmovll %ebp, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -295,27 +295,27 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmovgl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: cmovgl %eax, %esi +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edx, %eax ; X86-NEXT: cmovgl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovgl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovgl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -359,32 +359,32 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebp, %eax -; X86-NEXT: cmovgl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebx, %eax -; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmovgl %eax, %edi +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: cmovgl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: cmovgl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovgl %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovgl %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmovgl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edx, %eax @@ -396,14 +396,14 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, 28(%ecx) ; 
X86-NEXT: movl %edx, 24(%ecx) +; X86-NEXT: movl %ebp, 20(%ecx) +; X86-NEXT: movl %ebx, 16(%ecx) +; X86-NEXT: movl %edi, 12(%ecx) +; X86-NEXT: movl %esi, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%ecx) +; X86-NEXT: movl %eax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -433,32 +433,32 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %bp, %ax -; X86-NEXT: cmovgl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %bx, %ax -; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovgl %eax, %edi +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %si, %ax ; X86-NEXT: cmovgl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %dx, %ax -; X86-NEXT: cmovgl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmovgl %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %cx, %ax -; X86-NEXT: cmovgl %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmovgl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %dx, %ax @@ -470,14 +470,14 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) ; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movw %bp, 10(%ecx) +; X86-NEXT: movw %bx, 8(%ecx) +; X86-NEXT: movw %di, 6(%ecx) +; X86-NEXT: movw %si, 4(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movw %ax, 2(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 8(%ecx) -; X86-NEXT: movw %si, 6(%ecx) -; X86-NEXT: movw %di, 4(%ecx) -; X86-NEXT: movw %bx, 2(%ecx) -; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -511,18 +511,19 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $40, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb %bl, %al -; X86-NEXT: cmovgl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb %dl, %al ; X86-NEXT: cmovgl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: cmpb %cl, %al ; X86-NEXT: cmovgl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -721,20 +722,20 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shrdl $28, %edi, %ecx +; X86-NEXT: shrdl $28, %edi, %esi ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %esi, %ecx +; X86-NEXT: cmpl %ecx, %esi ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: cmovll %ecx, %esi ; X86-NEXT: cmovll %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll index bde61d5738ed5c..c06d100b4b4551 100644 --- a/llvm/test/CodeGen/X86/smin.ll +++ b/llvm/test/CodeGen/X86/smin.ll @@ -154,29 +154,29 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %edx, %ebx +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: sbbl %esi, %ebp ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovll %ebx, %edx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovll %ebx, %esi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovll %edi, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ebp, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -296,27 +296,27 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovll %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edx, %eax ; X86-NEXT: cmovll %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovll %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -360,32 +360,32 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebp, %eax -; X86-NEXT: cmovll %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebx, %eax -; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: cmovll %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: cmovll %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovll %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmovll %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edx, %eax @@ -397,14 +397,14 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, 28(%ecx) ; X86-NEXT: movl %edx, 24(%ecx) +; X86-NEXT: movl %ebp, 20(%ecx) +; X86-NEXT: movl %ebx, 16(%ecx) +; X86-NEXT: movl %edi, 12(%ecx) +; X86-NEXT: movl %esi, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%ecx) +; X86-NEXT: movl %eax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: movl 
%ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -434,32 +434,32 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %bp, %ax -; X86-NEXT: cmovll %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %bx, %ax -; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %si, %ax ; X86-NEXT: cmovll %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %dx, %ax -; X86-NEXT: cmovll %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %cx, %ax -; X86-NEXT: cmovll %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmovll %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %dx, %ax @@ -471,14 +471,14 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) ; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movw %bp, 10(%ecx) +; X86-NEXT: movw %bx, 8(%ecx) +; X86-NEXT: movw %di, 6(%ecx) +; X86-NEXT: movw %si, 4(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movw %ax, 2(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 8(%ecx) -; X86-NEXT: movw %si, 6(%ecx) -; X86-NEXT: movw %di, 4(%ecx) -; X86-NEXT: movw %bx, 2(%ecx) -; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -512,18 +512,19 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $40, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb %bl, %al -; X86-NEXT: cmovll %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb %dl, %al ; X86-NEXT: cmovll %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %esi, %ecx ; 
X86-NEXT: cmpb %cl, %al ; X86-NEXT: cmovll %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -722,20 +723,20 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shrdl $28, %edi, %ecx +; X86-NEXT: shrdl $28, %edi, %esi ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: cmovll %ecx, %esi ; X86-NEXT: cmovll %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll index 0e17af441d649b..a39c036c10bc70 100644 --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -191,216 +191,219 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $188, %esp +; X86-NEXT: subl $192, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: andl $1, %ebp -; X86-NEXT: negl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $1, %ecx +; X86-NEXT: negl %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb %bl +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: addl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X86-NEXT: adcl %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl %edx, %edi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb %cl +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: addl %eax, %edi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, %ebp ; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: setb %al ; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; 
X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X86-NEXT: addl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X86-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload +; X86-NEXT: adcl %esi, %eax +; X86-NEXT: adcl $0, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -408,392 +411,374 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %al +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %ebp, %edi -; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: addl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 1-byte Folded Reload +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: setb %al -; 
X86-NEXT: addl %ebx, %esi -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: setb %al +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movzbl %al, %ebx +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movzbl %bl, %ebx -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edx, %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: setb %bl ; X86-NEXT: addl %eax, %ecx -; X86-NEXT: movzbl %bl, %esi -; X86-NEXT: adcl %edx, %esi +; X86-NEXT: movzbl %bl, %ebp +; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: adcl %esi, %eax +; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %edi, %edx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl %esi, %ebp ; X86-NEXT: movl %eax, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %al -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %esi, %eax +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: setb %bl +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %edi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: setb %al +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movzbl %al, %ebx +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl (%esp), %edi # 4-byte Reload ; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: movzbl %al, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: adcl %esi, %eax -; X86-NEXT: 
movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl %edx, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %eax, %eax -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %eax +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %edx, %eax -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl %edx, %esi ; X86-NEXT: setb %al -; X86-NEXT: addl %edi, %edx +; X86-NEXT: addl %edi, %esi ; X86-NEXT: movzbl %al, %ebp -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: addl %esi, %edx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %edx, %eax -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: setb %bl -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movzbl %bl, %ebx -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl %edi, %esi -; X86-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: addl %eax, %edx +; X86-NEXT: adcl %ecx, %edi +; 
X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl %ebx, %edi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %esi, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: adcl %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ebp ; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte 
Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %esi, %ecx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, %edi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: xorl %ebx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ebp -; X86-NEXT: xorl %edx, %eax -; X86-NEXT: orl %ebp, %eax -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: orl %ebx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %esi, %edx +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: orl %ebp, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: 
orl %eax, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: andl $1, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: xorl %edi, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: orl %edi, %eax ; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl %edx, 4(%eax) @@ -805,7 +790,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: movb %cl, 16(%eax) ; X86-NEXT: setne 20(%eax) -; X86-NEXT: addl $188, %esp +; X86-NEXT: addl $192, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -820,10 +805,10 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r9, %r15 +; X64-NEXT: movq %r9, %rbp ; X64-NEXT: movq %rcx, %r9 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rsi, %r12 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rsi, %r15 ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; X64-NEXT: andl $1, %r11d @@ -833,103 +818,101 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rdx, %rbp +; X64-NEXT: addq %rdx, %rdi ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rax, %rbp +; X64-NEXT: addq %rax, %rdi ; X64-NEXT: adcq %rdx, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movzbl %sil, %edi +; X64-NEXT: movzbl %sil, %r14d ; X64-NEXT: addq %rax, %rcx -; X64-NEXT: adcq %rdx, %rdi -; X64-NEXT: movq %r12, %rax +; X64-NEXT: adcq %rdx, %r14 +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r10, %r13 -; X64-NEXT: adcq $0, %rbx ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r10, %rbx +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %r13, %rax +; X64-NEXT: addq %rbx, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
X64-NEXT: adcq %rbx, %rsi +; X64-NEXT: adcq %r13, %rsi ; X64-NEXT: setb %r8b -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: addq %rsi, %rax ; X64-NEXT: movzbl %r8b, %edx ; X64-NEXT: adcq %rdx, %rbx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: adcq %rbp, %rbx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: adcq %rdi, %rbx ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: adcq $0, %rdi +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %rax, %r8 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %r13, %r14 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r13, %r10 ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %r15, %r14 +; X64-NEXT: addq %r15, %r10 ; X64-NEXT: adcq %r13, %rbp -; X64-NEXT: setb %al -; X64-NEXT: addq %r8, %rbp -; X64-NEXT: movzbl %al, %r12d +; X64-NEXT: setb %sil +; X64-NEXT: addq %rax, %rbp +; X64-NEXT: movzbl %sil, %r12d ; X64-NEXT: adcq %rdx, %r12 -; X64-NEXT: addq %r15, %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: addq %r15, %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %r10 ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq %rdi, %r12 +; X64-NEXT: adcq %r14, %r12 ; X64-NEXT: setb %cl ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %rax, %r10 -; X64-NEXT: adcq %rdx, %rdi -; X64-NEXT: setb %bl +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: adcq $0, %rbx ; X64-NEXT: addq %rax, %rdi -; X64-NEXT: movzbl %bl, %esi -; X64-NEXT: adcq %rdx, %rsi +; X64-NEXT: adcq %rdx, %rbx +; X64-NEXT: setb %r14b +; X64-NEXT: addq %rax, %rbx +; X64-NEXT: movzbl %r14b, %r14d +; X64-NEXT: adcq %rdx, %r14 ; X64-NEXT: addq %rax, %rbp -; X64-NEXT: adcq %r12, %r10 +; X64-NEXT: adcq %r12, %rdi ; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: addq %rax, %r8 +; X64-NEXT: addq %rax, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: addq %rbx, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: addq %r8, %r12 ; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: setb %al ; X64-NEXT: addq %rsi, %rcx @@ -937,59 +920,59 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: movq %r9, %rax ; X64-NEXT: imulq %r11 -; X64-NEXT: movq %rbx, %r11 -; X64-NEXT: addq %rax, %r11 -; 
X64-NEXT: movq %r8, %r12 -; X64-NEXT: adcq %rdx, %r12 -; X64-NEXT: addq %rcx, %r11 -; X64-NEXT: adcq %rsi, %r12 -; X64-NEXT: movq %r15, %r9 -; X64-NEXT: addq %r13, %r9 -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %r8, %r9 +; X64-NEXT: addq %rax, %r9 +; X64-NEXT: movq %r12, %r11 +; X64-NEXT: adcq %rdx, %r11 ; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: movq %r15, %rcx +; X64-NEXT: addq %r13, %rcx +; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: adcq %rsi, %r13 -; X64-NEXT: setb %bl -; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: movzbl %bl, %ecx -; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: adcq %r8, %r13 +; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; X64-NEXT: addq %rsi, %r13 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; X64-NEXT: adcq %r8, %rsi ; X64-NEXT: addq %r15, %rax -; X64-NEXT: adcq %r9, %rdx -; X64-NEXT: addq %r13, %rax ; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %r13, %rax +; X64-NEXT: adcq %rsi, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; X64-NEXT: adcq %r8, %r9 -; X64-NEXT: adcq %r11, %rax -; X64-NEXT: adcq %r12, %rdx +; X64-NEXT: adcq %r12, %rcx +; X64-NEXT: adcq %r9, %rax +; X64-NEXT: adcq %r11, %rdx ; X64-NEXT: addq %rbp, %r15 -; X64-NEXT: adcq %r10, %r9 -; X64-NEXT: adcq %rdi, %rax -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq %r14, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: xorq %rcx, %rdx -; X64-NEXT: xorq %rcx, %r9 -; X64-NEXT: orq %rdx, %r9 -; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: xorq %r15, %rcx -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: orq %r9, %rcx +; X64-NEXT: adcq %rdi, %rcx +; X64-NEXT: adcq %rbx, %rax +; X64-NEXT: adcq %r14, %rdx +; X64-NEXT: movq %r10, %rsi +; X64-NEXT: sarq $63, %rsi +; X64-NEXT: xorq %rsi, %rdx +; X64-NEXT: xorq %rsi, %rcx +; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: xorq %rsi, %rax +; X64-NEXT: xorq %r15, %rsi +; X64-NEXT: orq %rax, %rsi +; X64-NEXT: orq %rcx, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movl %eax, %esi -; X64-NEXT: andl $1, %esi -; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: negq %rdx -; X64-NEXT: xorq %rdx, %r14 -; X64-NEXT: xorq %rax, %rdx -; X64-NEXT: orq %r14, %rdx -; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: andl $1, %edx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: negq %rcx +; X64-NEXT: xorq %rcx, %r10 +; X64-NEXT: xorq %rax, %rcx +; X64-NEXT: orq %r10, %rcx +; X64-NEXT: orq %rsi, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, 8(%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, (%rax) -; X64-NEXT: movb %sil, 16(%rax) +; X64-NEXT: movb %dl, 16(%rax) ; X64-NEXT: setne 24(%rax) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll index 85c966c447fad6..b7a67f57bc6be7 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -60,73 +60,66 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: imull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: addl %edi, %edx ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl $0, %ebp -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: cmovnsl %esi, %ebp +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %esi, %ebx ; X86-NEXT: cmovnsl %edx, %edi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: sbbl $0, %edx +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: subl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebp, %edx -; X86-NEXT: cmovnsl %edi, %ecx -; X86-NEXT: testl %edx, %edx -; X86-NEXT: setg %ah -; X86-NEXT: sete (%esp) # 1-byte Folded Spill -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setae %al -; X86-NEXT: andb (%esp), %al # 1-byte Folded Reload -; X86-NEXT: orb %ah, %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shrdl $2, %ebx, %ebp -; X86-NEXT: shrdl $2, %ecx, %ebx -; X86-NEXT: testb %al, %al -; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: cmovel %ebx, %esi -; X86-NEXT: movl $-1, %edi -; X86-NEXT: cmovel %ebp, %edi -; X86-NEXT: cmpl $-1, %edx -; X86-NEXT: setl %dl +; X86-NEXT: cmovnsl %ebx, %esi +; X86-NEXT: cmovnsl %edi, %ebp +; X86-NEXT: testl %esi, %esi +; X86-NEXT: setg %bl +; X86-NEXT: sete %bh +; X86-NEXT: cmpl $2, %ebp +; X86-NEXT: setae %dl +; X86-NEXT: andb %bh, %dl +; X86-NEXT: orb %bl, %dl +; X86-NEXT: shrdl $2, %eax, %ecx +; X86-NEXT: shrdl $2, %ebp, %eax +; X86-NEXT: testb %dl, %dl +; X86-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF +; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovel %ecx, %edx +; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: setl %cl ; X86-NEXT: sete %al -; X86-NEXT: cmpl $-2, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: andb %al, %cl +; X86-NEXT: cmpl $-2, %ebp +; X86-NEXT: setb %ch +; X86-NEXT: andb %al, %ch ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: orb %dl, %cl -; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: orb %cl, %ch +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: addl $8, %esp +; X86-NEXT: cmovel %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl 
%edi ; X86-NEXT: popl %ebx @@ -331,15 +324,15 @@ define i32 @func4(i32 %x, i32 %y) nounwind { ; X86-LABEL: func4: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: sets %cl -; X86-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: imull %edx, %eax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: sets %dl +; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; X86-NEXT: imull %ecx, %eax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 0) @@ -369,76 +362,73 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $12, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 28 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: imull %edx, %ebx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: imull %edx, %edi ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %edi, %esi ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: imull %ecx, %edi ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: setb %bl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 
1-byte Folded Reload -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: sarl $31, %ebx ; X86-NEXT: xorl %ebx, %edx ; X86-NEXT: xorl %eax, %ebx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF ; X86-NEXT: orl %edx, %ebx ; X86-NEXT: notl %ecx ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: cmovel %esi, %edi +; X86-NEXT: cmovel %edi, %esi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %edi, %edx -; X86-NEXT: addl $12, %esp +; X86-NEXT: movl %esi, %edx +; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 @@ -475,19 +465,19 @@ define i4 @func6(i4 %x, i4 %y) nounwind { ; ; X86-LABEL: func6: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shlb $4, %cl -; X86-NEXT: sarb $4, %cl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shlb $4, %dl +; X86-NEXT: sarb $4, %dl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shlb $4, %al -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: movb %al, %ah -; X86-NEXT: xorb %cl, %ah -; X86-NEXT: sets %dl -; X86-NEXT: addl $127, %edx -; X86-NEXT: imulb %cl +; X86-NEXT: xorb %dl, %ah +; X86-NEXT: sets %cl +; X86-NEXT: addl $127, %ecx +; X86-NEXT: imulb %dl ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: cmovol %edx, %eax +; X86-NEXT: cmovol %ecx, %eax ; X86-NEXT: sarb $4, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl @@ -664,12 +654,12 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: cmovsl %edi, %edx ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setns {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: sets %bh +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: setg {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: sete %bl -; X86-NEXT: andb %bh, %bl -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload +; X86-NEXT: setg %bl +; X86-NEXT: sete %bh +; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload +; X86-NEXT: orb %bl, %bh ; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: movl $-1, %esi @@ -736,32 +726,32 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %edi, %edx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: sbbl $0, %ebp +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: sbbl $0, %edx ; X86-NEXT: testl %esi, %esi -; X86-NEXT: cmovnsl %ebx, %ebp -; X86-NEXT: cmovnsl %edi, %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: cmovnsl %ebx, %edx +; X86-NEXT: cmovnsl %edi, %ecx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: subl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl $0, %edi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebp, %ecx -; X86-NEXT: cmovnsl %edx, 
%esi +; X86-NEXT: cmovnsl %edx, %edi +; X86-NEXT: cmovnsl %ecx, %esi ; X86-NEXT: shrdl $31, %esi, %eax -; X86-NEXT: shrdl $31, %ecx, %esi -; X86-NEXT: cmpl $1073741824, %ecx # imm = 0x40000000 -; X86-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF -; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: shrdl $31, %edi, %esi +; X86-NEXT: cmpl $1073741824, %edi # imm = 0x40000000 +; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: cmovll %esi, %ecx ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl $-1073741824, %ecx # imm = 0xC0000000 +; X86-NEXT: cmpl $-1073741824, %edi # imm = 0xC0000000 ; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 -; X86-NEXT: cmovgel %edi, %edx +; X86-NEXT: cmovgel %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index abab313f4b12e7..55a9560c449249 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -14,60 +14,59 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X64-NEXT: .cfi_offset %rbx, -32 ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rdi, %r11 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: sarq $63, %rdx -; X64-NEXT: movq %rcx, %r9 -; X64-NEXT: imulq %rdx, %r9 -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: imulq %rdx, %rbx +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rdx -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rax, %rdi -; X64-NEXT: addq %r9, %rdi +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rax, %r10 +; X64-NEXT: addq %rbx, %r10 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: sarq $63, %rax ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: imulq %rsi, %r15 -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: addq %r15, %r9 -; X64-NEXT: addq %rax, %r9 -; X64-NEXT: addq %r14, %r10 -; X64-NEXT: adcq %rdi, %r9 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r15, %r14 +; X64-NEXT: addq %rax, %r14 +; X64-NEXT: addq %r11, %rbx +; X64-NEXT: adcq %r10, %r14 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r14, %r15 -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %r11, %rax +; X64-NEXT: addq %r10, %r15 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r15, %r11 -; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r15, %r10 +; X64-NEXT: adcq %r9, %rdi ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %ebx +; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: addq %r14, %rax -; X64-NEXT: adcq %rbx, %rdx -; X64-NEXT: addq %r10, %rax +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: adcq %r9, %rdx -; X64-NEXT: movq %r11, 8(%r8) -; X64-NEXT: sarq $63, %r11 -; X64-NEXT: xorq %r11, %rdx -; 
X64-NEXT: xorq %rax, %r11 -; X64-NEXT: orq %rdx, %r11 +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: adcq %r14, %rdx +; X64-NEXT: movq %r10, 8(%r8) +; X64-NEXT: sarq $63, %r10 +; X64-NEXT: xorq %r10, %rdx +; X64-NEXT: xorq %rax, %r10 +; X64-NEXT: orq %rdx, %r10 ; X64-NEXT: setne %al -; X64-NEXT: movq %rdi, (%r8) +; X64-NEXT: movq %r11, (%r8) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 @@ -89,163 +88,161 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %cl +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %cl +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setb 
{{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, %edi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb %cl +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull %eax, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %edi +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -253,7 +250,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edx, %ebp ; X86-NEXT: adcl $0, %esi @@ -272,18 +269,18 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl %ebp, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: addl %ecx, %esi ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %ebx, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload @@ -294,7 +291,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: xorl %ecx, %ebp ; X86-NEXT: orl %eax, %ebp ; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: xorl %edi, %ecx ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %ebp, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -340,218 +337,220 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: .cfi_offset %r14, -32 ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rcx, %r13 -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movq %rsi, %r10 ; X64-NEXT: movq %rdx, %rax -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r9, %rcx +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r14, %rbx -; X64-NEXT: adcq %rsi, %r12 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r8, %r14 +; X64-NEXT: adcq %rcx, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r9d -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r12, %rsi -; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r12, %r15 +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: movq %rdi, %r11 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r9, %r13 +; X64-NEXT: addq %r8, %r13 ; 
X64-NEXT: adcq $0, %r12 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r15 -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %r9, %rdi +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r12, %r9 -; X64-NEXT: setb %dil +; X64-NEXT: adcq %r12, %r8 +; X64-NEXT: setb %cl +; X64-NEXT: movq %r10, %r9 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r9, %rbp -; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: addq %r8, %rbp +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; X64-NEXT: addq %r11, %rbp -; X64-NEXT: adcq %rbx, %r13 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Folded Reload +; X64-NEXT: adcq %r14, %r13 +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r10, %r14 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %r11, %rcx +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %r14 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r9, %r10 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r8, %r9 ; X64-NEXT: adcq $0, %r11 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r10, %r15 -; X64-NEXT: adcq %r11, %rdi -; X64-NEXT: setb %r10b +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: adcq %r11, %r10 +; X64-NEXT: setb %cl ; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rdi, %r9 -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r10, %r8 +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r13, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: addq %rbp, %rdi +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r13, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq $0, %r8 ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq %rcx, %r11 -; X64-NEXT: setb %bl +; X64-NEXT: addq %r15, %r8 +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: setb %cl ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload ; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rsi +; 
X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rsi, %r9 +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: adcq %rsi, %r10 -; X64-NEXT: setb %cl -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: setb %sil +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %r10, %r15 +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: adcq %rax, %rbp +; X64-NEXT: addq %r8, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r11, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: addq %r9, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r11, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %rbp, %rdi -; X64-NEXT: sarq $63, %rdi -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r14, %r8 +; X64-NEXT: sarq $63, %r8 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r10, %r11 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: addq %r9, %r11 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: addq %r14, %r11 +; X64-NEXT: adcq %r9, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %rdi, %r8 -; X64-NEXT: imulq %r12, %r8 -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r8, %rdi +; X64-NEXT: imulq %r12, %rdi +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq {{[0-9]+}}(%rsp) -; X64-NEXT: addq %r8, %rdx +; X64-NEXT: addq %rdi, %rdx ; X64-NEXT: addq %rax, %rdx -; X64-NEXT: addq %rbx, %rax +; X64-NEXT: addq %r14, %rax ; X64-NEXT: adcq %r11, %rdx -; X64-NEXT: addq %r14, %r9 -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: adcq %rcx, %rsi -; X64-NEXT: addq %rax, %r9 -; X64-NEXT: adcq %rdx, %rsi +; X64-NEXT: addq %rbx, %rcx +; X64-NEXT: movzbl %sil, %r8d +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: adcq %rdx, %r8 ; X64-NEXT: sarq $63, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %rdx, 
%r10 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rax, %r14 -; X64-NEXT: adcq %rdx, %rdi -; X64-NEXT: setb %bl -; X64-NEXT: imulq %r12, %rbp +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rax, %rbx +; X64-NEXT: adcq %rdx, %r10 +; X64-NEXT: setb %r13b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload +; X64-NEXT: imulq %r12, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; X64-NEXT: mulq %r12 ; X64-NEXT: addq %rax, %rdx -; X64-NEXT: addq %rbp, %rdx -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: adcq %r14, %rdx -; X64-NEXT: addq %r10, %rdi -; X64-NEXT: movzbl %bl, %r10d -; X64-NEXT: adcq %r8, %r10 -; X64-NEXT: addq %rax, %rdi -; X64-NEXT: adcq %rdx, %r10 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload -; X64-NEXT: adcq %r11, %r14 -; X64-NEXT: adcq %r9, %rdi -; X64-NEXT: adcq %rsi, %r10 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload -; X64-NEXT: adcq %r15, %rdi -; X64-NEXT: adcq %r13, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload -; X64-NEXT: movq %rdx, %rax +; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq %rbx, %rdx +; X64-NEXT: addq %r9, %r10 +; X64-NEXT: movzbl %r13b, %r9d +; X64-NEXT: adcq %r14, %r9 +; X64-NEXT: addq %rax, %r10 +; X64-NEXT: adcq %rdx, %r9 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload +; X64-NEXT: adcq %r11, %rbx +; X64-NEXT: adcq %rcx, %r10 +; X64-NEXT: adcq %r8, %r9 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload +; X64-NEXT: adcq %r15, %r10 +; X64-NEXT: adcq %rbp, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: sarq $63, %rax +; X64-NEXT: xorq %rax, %r9 +; X64-NEXT: xorq %rax, %rbx +; X64-NEXT: orq %r9, %rbx ; X64-NEXT: xorq %rax, %r10 -; X64-NEXT: xorq %rax, %r14 -; X64-NEXT: orq %r10, %r14 -; X64-NEXT: xorq %rax, %rdi -; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: orq %rdi, %rax -; X64-NEXT: orq %r14, %rax +; X64-NEXT: xorq %rdi, %rax +; X64-NEXT: orq %r10, %rax +; X64-NEXT: orq %rbx, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq %rdx, 24(%rax) +; X64-NEXT: movq %rcx, 24(%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload ; X64-NEXT: movq %rcx, (%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload @@ -577,872 +576,876 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $156, %esp -; X86-NEXT: .cfi_def_cfa_offset 176 +; X86-NEXT: subl $160, %esp +; X86-NEXT: .cfi_def_cfa_offset 180 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 
; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: 
movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %ebp, %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %ebp ; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl %ebp, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi +; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movzbl %bl, %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl (%esp), %edx ## 4-byte Reload +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl %ebp, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload ; X86-NEXT: adcl %edi, %eax ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; 
X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, 
%ebp ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl %ebx, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %bl +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl 
%eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movl (%esp), %edi ## 4-byte Reload -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl (%esp), %esi ## 4-byte Reload +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %eax ; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, 
%eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, 
%edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movzbl %bl, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload +; X86-NEXT: adcl %edi, %eax ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sarl $31, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl %al, %eax +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull %esi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %ebp 
+; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %esi, %ecx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: movzbl %al, %edx -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %edi +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %edx, %esi ; X86-NEXT: setb %al -; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: addl %edx, %ecx ; X86-NEXT: adcl 
$0, %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull %ebx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ebx ; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl (%esp), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %edx -; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, %eax +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; 
X86-NEXT: addl %esi, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, %edi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %edx, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: setb %bl ; X86-NEXT: addl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: movzbl (%esp), %ebp ## 1-byte Folded Reload -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: movl %edi, %eax -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; 
X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edx, %ebp ; X86-NEXT: movl %edx, %esi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %edi +; X86-NEXT: addl %eax, %ebp ; X86-NEXT: adcl %edx, %esi -; X86-NEXT: setb %cl +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: addl %eax, %esi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %edi, (%esp) ## 4-byte Folded Spill -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: setb %al -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %esi, %edx ; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: setb %cl +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %edi +; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: addl %edx, %eax -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %edx, %eax +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: imull %ebp, %ebx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: imull %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), 
%edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: xorl %eax, %esi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 28(%eax) +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ebp +; X86-NEXT: orl %esi, %ebp +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, 28(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload @@ -1458,7 +1461,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, 24(%eax) ; X86-NEXT: setne %al -; X86-NEXT: addl $156, %esp +; X86-NEXT: addl $160, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening.ll b/llvm/test/CodeGen/X86/speculative-load-hardening.ll index 0c47fcddc43af2..ba657dbf4a2322 100644 --- a/llvm/test/CodeGen/X86/speculative-load-hardening.ll +++ b/llvm/test/CodeGen/X86/speculative-load-hardening.ll @@ -1088,25 +1088,25 @@ define void @test_deferred_hardening(ptr %ptr1, ptr %ptr2, i32 %x) nounwind spec ; X64-LFENCE-NEXT: pushq %r14 ; X64-LFENCE-NEXT: pushq %rbx ; X64-LFENCE-NEXT: pushq %rax -; X64-LFENCE-NEXT: movq %rsi, %r14 -; X64-LFENCE-NEXT: movq %rdi, %rbx +; X64-LFENCE-NEXT: movq %rsi, %rbx +; X64-LFENCE-NEXT: movq %rdi, %r14 ; X64-LFENCE-NEXT: movl (%rdi), %edi ; X64-LFENCE-NEXT: incl %edi ; X64-LFENCE-NEXT: imull %edx, %edi ; X64-LFENCE-NEXT: callq sink@PLT -; X64-LFENCE-NEXT: movl (%rbx), %eax -; X64-LFENCE-NEXT: movl (%r14), %ecx +; X64-LFENCE-NEXT: movl (%r14), %eax +; X64-LFENCE-NEXT: movl (%rbx), %ecx ; X64-LFENCE-NEXT: leal 1(%rax,%rcx), %edi ; X64-LFENCE-NEXT: callq sink@PLT -; X64-LFENCE-NEXT: movl (%rbx), %edi +; X64-LFENCE-NEXT: movl (%r14), %edi ; X64-LFENCE-NEXT: shll $7, %edi ; X64-LFENCE-NEXT: callq sink@PLT -; X64-LFENCE-NEXT: movswl (%rbx), %edi +; X64-LFENCE-NEXT: movswl (%r14), %edi ; X64-LFENCE-NEXT: notl %edi ; X64-LFENCE-NEXT: shrl $7, %edi ; X64-LFENCE-NEXT: orl $-65536, %edi # imm = 0xFFFF0000 ; X64-LFENCE-NEXT: callq sink@PLT -; X64-LFENCE-NEXT: movzwl (%rbx), %eax +; X64-LFENCE-NEXT: movzwl (%r14), %eax ; X64-LFENCE-NEXT: rolw $9, %ax ; X64-LFENCE-NEXT: movswl %ax, %edi ; X64-LFENCE-NEXT: negl %edi diff --git a/llvm/test/CodeGen/X86/split-vector-rem.ll b/llvm/test/CodeGen/X86/split-vector-rem.ll index e292e128d9bfac..a48e060e7e3950 100644 --- a/llvm/test/CodeGen/X86/split-vector-rem.ll +++ 
b/llvm/test/CodeGen/X86/split-vector-rem.ll @@ -141,29 +141,29 @@ define <8 x float> @qux(<8 x float> %t, <8 x float> %u) { ; CHECK-NEXT: subq $104, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 112 ; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: movaps %xmm2, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll index c0ad1a31c7d8dd..619b4cb365ea11 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -549,33 +549,32 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { define <16 x float> @v16f32_estimate(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_estimate: ; SSE: # %bb.0: -; SSE-NEXT: rsqrtps %xmm0, %xmm5 -; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: mulps %xmm4, %xmm6 -; SSE-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; SSE-NEXT: addps %xmm5, %xmm0 -; SSE-NEXT: mulps %xmm6, %xmm0 -; SSE-NEXT: rsqrtps %xmm1, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: mulps %xmm4, %xmm6 -; SSE-NEXT: addps %xmm5, %xmm1 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: rsqrtps %xmm2, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm2 -; SSE-NEXT: mulps %xmm6, %xmm2 -; SSE-NEXT: mulps %xmm4, %xmm6 -; SSE-NEXT: addps %xmm5, %xmm2 -; SSE-NEXT: mulps %xmm6, %xmm2 -; SSE-NEXT: rsqrtps %xmm3, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm4 -; SSE-NEXT: mulps %xmm6, %xmm3 -; SSE-NEXT: mulps %xmm6, %xmm3 -; SSE-NEXT: addps %xmm5, %xmm3 +; SSE-NEXT: rsqrtps %xmm0, %xmm4 
+; SSE-NEXT: movaps {{.*#+}} xmm5 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: mulps %xmm5, %xmm4 +; SSE-NEXT: movaps {{.*#+}} xmm6 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; SSE-NEXT: addps %xmm6, %xmm0 +; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm4 +; SSE-NEXT: mulps %xmm4, %xmm1 +; SSE-NEXT: mulps %xmm4, %xmm1 +; SSE-NEXT: mulps %xmm5, %xmm4 +; SSE-NEXT: addps %xmm6, %xmm1 +; SSE-NEXT: mulps %xmm4, %xmm1 +; SSE-NEXT: rsqrtps %xmm2, %xmm4 +; SSE-NEXT: mulps %xmm4, %xmm2 +; SSE-NEXT: mulps %xmm4, %xmm2 +; SSE-NEXT: mulps %xmm5, %xmm4 +; SSE-NEXT: addps %xmm6, %xmm2 +; SSE-NEXT: mulps %xmm4, %xmm2 +; SSE-NEXT: rsqrtps %xmm3, %xmm4 +; SSE-NEXT: mulps %xmm4, %xmm5 ; SSE-NEXT: mulps %xmm4, %xmm3 +; SSE-NEXT: mulps %xmm4, %xmm3 +; SSE-NEXT: addps %xmm6, %xmm3 +; SSE-NEXT: mulps %xmm5, %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: v16f32_estimate: diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll index 565946d342e935..e604c724cc9895 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -185,7 +185,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rsi -; SSE2-NEXT: movq %rsi, %xmm1 +; SSE2-NEXT: movq %rsi, %xmm0 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: imulq %r8 ; SSE2-NEXT: movq %rdx, %rax @@ -193,10 +193,10 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rdi -; SSE2-NEXT: movq %rdi, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movq %rdi, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [8589934591,8589934591] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rdx @@ -208,16 +208,16 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax ; SSE2-NEXT: addq %rcx, %rax ; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3] -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx @@ -240,7 +240,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE41-NEXT: addq %rdx, %rax ; SSE41-NEXT: leaq (%rax,%rax,8), %rax ; SSE41-NEXT: subq %rax, %rsi -; SSE41-NEXT: movq %rsi, %xmm1 +; SSE41-NEXT: movq %rsi, 
%xmm0 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: imulq %r8 ; SSE41-NEXT: movq %rdx, %rax @@ -248,10 +248,10 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE41-NEXT: addq %rdx, %rax ; SSE41-NEXT: leaq (%rax,%rax,8), %rax ; SSE41-NEXT: subq %rax, %rdi -; SSE41-NEXT: movq %rdi, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] -; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movq %rdi, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8589934591,8589934591] +; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: imulq %rdx @@ -263,14 +263,14 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE41-NEXT: leaq (%rdx,%rdx,8), %rax ; SSE41-NEXT: addq %rcx, %rax ; SSE41-NEXT: movq %rax, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: pextrb $8, %xmm0, %edx +; SSE41-NEXT: pxor %xmm0, %xmm2 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: pextrb $8, %xmm1, %edx ; SSE41-NEXT: pextrb $0, %xmm2, %ecx ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 0d28dfc68f5bf0..8e25fe20c7eb58 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2218,42 +2218,42 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-LABEL: pr51133: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movq %rdi, %rax -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm5 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: pand %xmm4, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm6 +; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm6 ; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: psrlw $8, %xmm5 +; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4 +; CHECK-SSE2-NEXT: punpckhbw 
{{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: psrlw $8, %xmm4 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 ; CHECK-SSE2-NEXT: psrlw $8, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2] -; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7 +; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm6 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2] +; CHECK-SSE2-NEXT: pminub %xmm6, %xmm4 +; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; CHECK-SSE2-NEXT: pandn %xmm6, %xmm4 ; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6 -; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 -; CHECK-SSE2-NEXT: pandn %xmm1, %xmm5 -; CHECK-SSE2-NEXT: por %xmm7, %xmm5 +; CHECK-SSE2-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE2-NEXT: pcmpgtb %xmm7, %xmm1 +; CHECK-SSE2-NEXT: pandn %xmm1, %xmm6 +; CHECK-SSE2-NEXT: por %xmm4, %xmm6 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pand %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm1 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm0 ; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 ; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 @@ -2267,9 +2267,9 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5] ; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3 -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm2 +; CHECK-SSE2-NEXT: pcmpeqb %xmm7, %xmm3 +; CHECK-SSE2-NEXT: pandn %xmm6, %xmm3 +; CHECK-SSE2-NEXT: pcmpeqb %xmm7, %xmm2 ; CHECK-SSE2-NEXT: pandn %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pmovmskb %xmm2, %ecx ; CHECK-SSE2-NEXT: pmovmskb %xmm3, %edx @@ -2303,13 +2303,13 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2] ; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7 -; CHECK-SSE41-NEXT: pxor %xmm0, %xmm7 +; CHECK-SSE41-NEXT: pcmpeqd %xmm6, %xmm6 +; CHECK-SSE41-NEXT: pxor %xmm0, %xmm6 ; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm6, %xmm6 -; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE41-NEXT: pcmpgtb %xmm7, %xmm1 ; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, 
%xmm1 +; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 ; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 @@ -2329,9 +2329,9 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5] ; CHECK-SSE41-NEXT: pmaxub %xmm0, %xmm4 ; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm4 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3 +; CHECK-SSE41-NEXT: pcmpeqb %xmm7, %xmm3 ; CHECK-SSE41-NEXT: pandn %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm2 +; CHECK-SSE41-NEXT: pcmpeqb %xmm7, %xmm2 ; CHECK-SSE41-NEXT: pandn %xmm4, %xmm2 ; CHECK-SSE41-NEXT: pmovmskb %xmm2, %ecx ; CHECK-SSE41-NEXT: pmovmskb %xmm3, %edx diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll index c8c1026bdaf3fb..bc2e82e76ca7d6 100644 --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -175,7 +175,6 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_power_of_two: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pextrw $1, %xmm0, %eax ; SSE-NEXT: leal 31(%rax), %ecx ; SSE-NEXT: testw %ax, %ax @@ -188,16 +187,16 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-NEXT: cmovnsl %ecx, %edx ; SSE-NEXT: andl $-64, %edx ; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm0 -; SSE-NEXT: pinsrw $1, %eax, %xmm0 -; SSE-NEXT: pextrw $2, %xmm1, %eax +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax ; SSE-NEXT: leal 7(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx ; SSE-NEXT: andl $-8, %ecx ; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm0 -; SSE-NEXT: pextrw $3, %xmm1, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 ; SSE-NEXT: shrl $16, %ecx @@ -209,7 +208,8 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-NEXT: addl %ecx, %edx ; SSE-NEXT: imull $95, %edx, %ecx ; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm0 +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_srem_power_of_two: @@ -257,32 +257,32 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %eax -; SSE-NEXT: imull $-19945, %eax, %eax # imm = 0xB217 -; SSE-NEXT: shrl $16, %eax -; SSE-NEXT: addl %ecx, %eax -; SSE-NEXT: movzwl %ax, %edx -; SSE-NEXT: movswl %dx, %eax -; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $4, %eax -; SSE-NEXT: addl %edx, %eax -; SSE-NEXT: leal (%rax,%rax,2), %edx -; SSE-NEXT: shll $3, %edx -; SSE-NEXT: subl %edx, %eax -; SSE-NEXT: addl %ecx, %eax -; SSE-NEXT: pextrw $1, %xmm0, %ecx +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: movzwl %cx, %ecx ; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $12827, %edx, %edx # imm = 
0x321B -; SSE-NEXT: movl %edx, %esi +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $4, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: leal (%rdx,%rdx,2), %ecx +; SSE-NEXT: shll $3, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B +; SSE-NEXT: movl %ecx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E -; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: sarl $23, %ecx +; SSE-NEXT: addl %esi, %ecx +; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; SSE-NEXT: subl %ecx, %eax ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pinsrw $2, %eax, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pinsrw $2, %edx, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 @@ -423,7 +423,6 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; SSE-LABEL: dont_fold_srem_i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movq %xmm1, %rcx ; SSE-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 ; SSE-NEXT: movq %rcx, %rax @@ -437,8 +436,8 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; SSE-NEXT: shlq $3, %rax ; SSE-NEXT: subq %rax, %rdx ; SSE-NEXT: addq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: pextrq $1, %xmm2, %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: pextrq $1, %xmm1, %rcx ; SSE-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 ; SSE-NEXT: movq %rcx, %rax ; SSE-NEXT: imulq %rdx @@ -448,8 +447,8 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; SSE-NEXT: addq %rax, %rdx ; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F ; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: pextrq $1, %xmm0, %rcx ; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 ; SSE-NEXT: movq %rcx, %rax @@ -462,6 +461,7 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; SSE-NEXT: subq %rax, %rcx ; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: dont_fold_srem_i64: diff --git a/llvm/test/CodeGen/X86/sse-intel-ocl.ll b/llvm/test/CodeGen/X86/sse-intel-ocl.ll index b2de7545ff5f51..02057c0062c5d2 100644 --- a/llvm/test/CodeGen/X86/sse-intel-ocl.ll +++ b/llvm/test/CodeGen/X86/sse-intel-ocl.ll @@ -92,16 +92,16 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { ; WIN32-NEXT: movl %esp, %ebp ; WIN32-NEXT: andl $-16, %esp ; WIN32-NEXT: subl $80, %esp -; WIN32-NEXT: movups 72(%ebp), %xmm6 +; WIN32-NEXT: movups 72(%ebp), %xmm4 ; WIN32-NEXT: movups 8(%ebp), %xmm3 -; WIN32-NEXT: movups 56(%ebp), %xmm7 -; WIN32-NEXT: movups 40(%ebp), %xmm5 -; WIN32-NEXT: movups 24(%ebp), %xmm4 +; WIN32-NEXT: movups 56(%ebp), %xmm5 +; WIN32-NEXT: movups 40(%ebp), %xmm6 +; WIN32-NEXT: movups 24(%ebp), %xmm7 ; WIN32-NEXT: movl %esp, %eax -; WIN32-NEXT: addps %xmm4, %xmm0 -; WIN32-NEXT: addps %xmm5, %xmm1 -; WIN32-NEXT: addps %xmm7, %xmm2 -; WIN32-NEXT: addps %xmm6, %xmm3 +; WIN32-NEXT: addps %xmm7, %xmm0 +; WIN32-NEXT: addps %xmm6, %xmm1 +; WIN32-NEXT: addps 
%xmm5, %xmm2 +; WIN32-NEXT: addps %xmm4, %xmm3 ; WIN32-NEXT: pushl %eax ; WIN32-NEXT: calll _func_float16_ptr ; WIN32-NEXT: addl $4, %esp @@ -164,20 +164,20 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { ; NOT_WIN-LABEL: testf16_regs: ; NOT_WIN: ## %bb.0: ; NOT_WIN-NEXT: subq $72, %rsp -; NOT_WIN-NEXT: movaps %xmm7, %xmm9 -; NOT_WIN-NEXT: movaps %xmm6, %xmm10 -; NOT_WIN-NEXT: movaps %xmm5, %xmm11 -; NOT_WIN-NEXT: movaps %xmm4, %xmm8 +; NOT_WIN-NEXT: movaps %xmm7, %xmm8 +; NOT_WIN-NEXT: movaps %xmm6, %xmm9 +; NOT_WIN-NEXT: movaps %xmm5, %xmm10 +; NOT_WIN-NEXT: movaps %xmm4, %xmm11 ; NOT_WIN-NEXT: addps %xmm4, %xmm0 ; NOT_WIN-NEXT: addps %xmm5, %xmm1 ; NOT_WIN-NEXT: addps %xmm6, %xmm2 ; NOT_WIN-NEXT: addps %xmm7, %xmm3 ; NOT_WIN-NEXT: movq %rsp, %rdi ; NOT_WIN-NEXT: callq _func_float16_ptr -; NOT_WIN-NEXT: addps %xmm9, %xmm3 -; NOT_WIN-NEXT: addps %xmm10, %xmm2 -; NOT_WIN-NEXT: addps %xmm11, %xmm1 -; NOT_WIN-NEXT: addps %xmm8, %xmm0 +; NOT_WIN-NEXT: addps %xmm8, %xmm3 +; NOT_WIN-NEXT: addps %xmm9, %xmm2 +; NOT_WIN-NEXT: addps %xmm10, %xmm1 +; NOT_WIN-NEXT: addps %xmm11, %xmm0 ; NOT_WIN-NEXT: addps (%rsp), %xmm0 ; NOT_WIN-NEXT: addps {{[0-9]+}}(%rsp), %xmm1 ; NOT_WIN-NEXT: addps {{[0-9]+}}(%rsp), %xmm2 diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll index 6f0293392eef2b..6b1fcdefdcdaf4 100644 --- a/llvm/test/CodeGen/X86/sse-regcall.ll +++ b/llvm/test/CodeGen/X86/sse-regcall.ll @@ -198,43 +198,42 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: subl $12, %esp ; WIN32-NEXT: movl %esi, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %ecx, %esi ; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: leal (%edx,%edi), %eax -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: subl %edi, %eax -; WIN32-NEXT: movl %ebp, %edx -; WIN32-NEXT: subl %ecx, %edx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: imull %edx, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %esi, %edx +; WIN32-NEXT: leal (%edx,%edi), %ecx +; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: subl %edi, %ecx +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: subl %esi, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull %eax, %edx -; WIN32-NEXT: addl %ebx, %edx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl (%esp), %edi # 4-byte Reload -; WIN32-NEXT: subl %ebx, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %edi, %eax -; WIN32-NEXT: addl %edx, %eax -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; WIN32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; WIN32-NEXT: imull %edi, %edx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: imull %ecx, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl (%esp), %ebx # 4-byte Reload +; WIN32-NEXT: subl %ebp, %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull %edx, %ebp +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: subl {{[0-9]+}}(%esp), 
%ecx +; WIN32-NEXT: imull %ebx, %ecx +; WIN32-NEXT: addl %edi, %ecx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: imull %edi, %eax ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; WIN32-NEXT: addl %esi, %ebp -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %ecx, %ebp -; WIN32-NEXT: addl %eax, %ebp -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: addl %esi, %eax +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: imull %ebp, %edx +; WIN32-NEXT: addl %edx, %eax +; WIN32-NEXT: addl %ecx, %eax ; WIN32-NEXT: addl $12, %esp ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp @@ -242,7 +241,6 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: -; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi @@ -255,36 +253,35 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi ; WIN64-NEXT: leal (%rdx,%rdi), %ebx -; WIN64-NEXT: movl %edx, %ebp -; WIN64-NEXT: subl %edi, %ebp -; WIN64-NEXT: leal (%rsi,%r8), %edx +; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx +; WIN64-NEXT: subl %edi, %edx +; WIN64-NEXT: leal (%rsi,%r8), %edi ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi -; WIN64-NEXT: leal (%r9,%r10), %edi -; WIN64-NEXT: movl %r9d, %r8d -; WIN64-NEXT: subl %r10d, %r8d -; WIN64-NEXT: movl %eax, %r9d -; WIN64-NEXT: subl %ecx, %r9d -; WIN64-NEXT: imull %r9d, %r8d -; WIN64-NEXT: leal (%r11,%r12), %r9d -; WIN64-NEXT: movl %r11d, %r10d -; WIN64-NEXT: subl %r12d, %r10d -; WIN64-NEXT: imull %ebp, %r10d -; WIN64-NEXT: addl %r8d, %r10d -; WIN64-NEXT: leal (%r14,%r15), %r8d -; WIN64-NEXT: movl %r14d, %r11d -; WIN64-NEXT: subl %r15d, %r11d -; WIN64-NEXT: imull %esi, %r11d -; WIN64-NEXT: addl %r10d, %r11d +; WIN64-NEXT: leal (%r9,%r10), %r8d +; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 +; WIN64-NEXT: subl %r10d, %r9d +; WIN64-NEXT: movl %eax, %r10d +; WIN64-NEXT: subl %ecx, %r10d +; WIN64-NEXT: imull %r10d, %r9d +; WIN64-NEXT: leal (%r11,%r12), %r10d +; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 +; WIN64-NEXT: subl %r12d, %r11d +; WIN64-NEXT: imull %edx, %r11d +; WIN64-NEXT: addl %r9d, %r11d +; WIN64-NEXT: leal (%r14,%r15), %edx +; WIN64-NEXT: movl %r14d, %r9d +; WIN64-NEXT: subl %r15d, %r9d +; WIN64-NEXT: imull %esi, %r9d +; WIN64-NEXT: addl %r11d, %r9d ; WIN64-NEXT: addl %ecx, %eax -; WIN64-NEXT: imull %edi, %eax -; WIN64-NEXT: imull %ebx, %r9d +; WIN64-NEXT: imull %r8d, %eax +; WIN64-NEXT: imull %ebx, %r10d +; WIN64-NEXT: addl %r10d, %eax +; WIN64-NEXT: imull %edi, %edx +; WIN64-NEXT: addl %edx, %eax ; WIN64-NEXT: addl %r9d, %eax -; WIN64-NEXT: imull %edx, %r8d -; WIN64-NEXT: addl %r8d, %eax -; WIN64-NEXT: addl %r11d, %eax ; WIN64-NEXT: popq %rbx -; WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; ; LINUXOSX-LABEL: testi32_inp: @@ -298,35 +295,35 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX-NEXT: 
leal (%rdx,%rdi), %r10d -; LINUXOSX-NEXT: movl %edx, %r11d -; LINUXOSX-NEXT: subl %edi, %r11d -; LINUXOSX-NEXT: leal (%rsi,%r8), %edx +; LINUXOSX-NEXT: # kill: def $edx killed $edx killed $rdx +; LINUXOSX-NEXT: subl %edi, %edx +; LINUXOSX-NEXT: leal (%rsi,%r8), %edi ; LINUXOSX-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX-NEXT: subl %r8d, %esi -; LINUXOSX-NEXT: leal (%r9,%r12), %edi -; LINUXOSX-NEXT: movl %r9d, %r8d -; LINUXOSX-NEXT: subl %r12d, %r8d -; LINUXOSX-NEXT: movl %eax, %r9d -; LINUXOSX-NEXT: subl %ecx, %r9d -; LINUXOSX-NEXT: imull %r9d, %r8d -; LINUXOSX-NEXT: leal (%r13,%r14), %r9d +; LINUXOSX-NEXT: leal (%r9,%r12), %r8d +; LINUXOSX-NEXT: # kill: def $r9d killed $r9d killed $r9 +; LINUXOSX-NEXT: subl %r12d, %r9d +; LINUXOSX-NEXT: movl %eax, %r11d +; LINUXOSX-NEXT: subl %ecx, %r11d +; LINUXOSX-NEXT: imull %r11d, %r9d +; LINUXOSX-NEXT: leal (%r13,%r14), %r11d ; LINUXOSX-NEXT: movl %r13d, %r12d ; LINUXOSX-NEXT: subl %r14d, %r12d -; LINUXOSX-NEXT: imull %r11d, %r12d -; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; LINUXOSX-NEXT: addl %r8d, %r12d -; LINUXOSX-NEXT: movl %r15d, %r8d -; LINUXOSX-NEXT: subl %r11d, %r8d -; LINUXOSX-NEXT: imull %esi, %r8d -; LINUXOSX-NEXT: addl %r12d, %r8d +; LINUXOSX-NEXT: imull %edx, %r12d +; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx +; LINUXOSX-NEXT: addl %r9d, %r12d +; LINUXOSX-NEXT: movl %r15d, %r9d +; LINUXOSX-NEXT: subl %edx, %r9d +; LINUXOSX-NEXT: imull %esi, %r9d +; LINUXOSX-NEXT: addl %r12d, %r9d ; LINUXOSX-NEXT: addl %ecx, %eax -; LINUXOSX-NEXT: imull %edi, %eax -; LINUXOSX-NEXT: imull %r10d, %r9d -; LINUXOSX-NEXT: addl %r9d, %eax -; LINUXOSX-NEXT: addl %r15d, %r11d -; LINUXOSX-NEXT: imull %edx, %r11d +; LINUXOSX-NEXT: imull %r8d, %eax +; LINUXOSX-NEXT: imull %r10d, %r11d ; LINUXOSX-NEXT: addl %r11d, %eax -; LINUXOSX-NEXT: addl %r8d, %eax +; LINUXOSX-NEXT: addl %r15d, %edx +; LINUXOSX-NEXT: imull %edi, %edx +; LINUXOSX-NEXT: addl %edx, %eax +; LINUXOSX-NEXT: addl %r9d, %eax ; LINUXOSX-NEXT: retq i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind { %x1 = sub i32 %a1, %a2 diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll index c8df7a233d7e3f..82102e63d0c278 100644 --- a/llvm/test/CodeGen/X86/sse-regcall4.ll +++ b/llvm/test/CodeGen/X86/sse-regcall4.ll @@ -197,44 +197,41 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: subl $8, %esp -; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %ecx, %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: leal (%eax,%esi), %ecx -; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: subl %esi, %ebx +; WIN32-NEXT: leal (%edi,%esi), %eax +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: subl %edx, %eax +; WIN32-NEXT: subl %esi, %eax +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: subl %edx, %esi ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: imull %eax, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: imull %esi, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: imull %eax, %esi ; WIN32-NEXT: addl %ebp, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl %ebp, %ebx ; WIN32-NEXT: subl {{[0-9]+}}(%esp), 
%ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %eax ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: imull %ebx, %eax ; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; WIN32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: imull %esi, %edi +; WIN32-NEXT: imull %esi, %ecx +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; WIN32-NEXT: addl %edi, %ecx ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: imull %ebp, %ecx -; WIN32-NEXT: addl %ecx, %edi -; WIN32-NEXT: addl %eax, %edi -; WIN32-NEXT: movl %edi, %ecx +; WIN32-NEXT: imull %ebp, %edx +; WIN32-NEXT: addl %edx, %ecx +; WIN32-NEXT: addl %eax, %ecx ; WIN32-NEXT: addl $8, %esp ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp @@ -242,7 +239,6 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: -; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi @@ -253,37 +249,36 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi ; WIN64-NEXT: leal (%rdx,%rdi), %ebx -; WIN64-NEXT: movl %edx, %ebp -; WIN64-NEXT: subl %edi, %ebp -; WIN64-NEXT: leal (%rsi,%r8), %edx +; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx +; WIN64-NEXT: subl %edi, %edx +; WIN64-NEXT: leal (%rsi,%r8), %edi ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi -; WIN64-NEXT: leal (%r9,%r11), %edi -; WIN64-NEXT: movl %r9d, %r8d -; WIN64-NEXT: subl %r11d, %r8d -; WIN64-NEXT: movl %eax, %r9d -; WIN64-NEXT: subl %ecx, %r9d -; WIN64-NEXT: imull %r9d, %r8d -; WIN64-NEXT: leal (%r12,%r14), %r9d -; WIN64-NEXT: movl %r12d, %r11d -; WIN64-NEXT: subl %r14d, %r11d -; WIN64-NEXT: imull %ebp, %r11d -; WIN64-NEXT: movl {{[0-9]+}}(%rsp), %r14d -; WIN64-NEXT: addl %r8d, %r11d -; WIN64-NEXT: movl %r15d, %r8d -; WIN64-NEXT: subl %r14d, %r8d -; WIN64-NEXT: imull %esi, %r8d -; WIN64-NEXT: addl %r11d, %r8d +; WIN64-NEXT: leal (%r9,%r11), %r8d +; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 +; WIN64-NEXT: subl %r11d, %r9d +; WIN64-NEXT: movl %eax, %r11d +; WIN64-NEXT: subl %ecx, %r11d +; WIN64-NEXT: imull %r11d, %r9d +; WIN64-NEXT: leal (%r12,%r14), %r11d +; WIN64-NEXT: # kill: def $r12d killed $r12d killed $r12 +; WIN64-NEXT: subl %r14d, %r12d +; WIN64-NEXT: imull %edx, %r12d +; WIN64-NEXT: movl {{[0-9]+}}(%rsp), %edx +; WIN64-NEXT: addl %r9d, %r12d +; WIN64-NEXT: movl %r15d, %r9d +; WIN64-NEXT: subl %edx, %r9d +; WIN64-NEXT: imull %esi, %r9d +; WIN64-NEXT: addl %r12d, %r9d ; WIN64-NEXT: addl %ecx, %eax -; WIN64-NEXT: imull %edi, %eax -; WIN64-NEXT: imull %ebx, %r9d +; WIN64-NEXT: imull %r8d, %eax +; WIN64-NEXT: imull %ebx, %r11d +; WIN64-NEXT: addl %r11d, %eax +; WIN64-NEXT: addl %r15d, %edx +; WIN64-NEXT: imull %edi, %edx +; WIN64-NEXT: addl %edx, %eax ; WIN64-NEXT: addl %r9d, %eax -; WIN64-NEXT: addl %r15d, %r14d -; WIN64-NEXT: imull %edx, %r14d -; 
WIN64-NEXT: addl %r14d, %eax -; WIN64-NEXT: addl %r8d, %eax ; WIN64-NEXT: popq %rbx -; WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; ; LINUXOSX-LABEL: testi32_inp: @@ -297,35 +292,35 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX-NEXT: movl %edx, %r11d -; LINUXOSX-NEXT: subl %edi, %r11d -; LINUXOSX-NEXT: leal (%rsi,%r8), %edx +; LINUXOSX-NEXT: # kill: def $edx killed $edx killed $rdx +; LINUXOSX-NEXT: subl %edi, %edx +; LINUXOSX-NEXT: leal (%rsi,%r8), %edi ; LINUXOSX-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX-NEXT: subl %r8d, %esi -; LINUXOSX-NEXT: leal (%r9,%r12), %edi -; LINUXOSX-NEXT: movl %r9d, %r8d -; LINUXOSX-NEXT: subl %r12d, %r8d -; LINUXOSX-NEXT: movl %eax, %r9d -; LINUXOSX-NEXT: subl %ecx, %r9d -; LINUXOSX-NEXT: imull %r9d, %r8d -; LINUXOSX-NEXT: leal (%r13,%r14), %r9d +; LINUXOSX-NEXT: leal (%r9,%r12), %r8d +; LINUXOSX-NEXT: # kill: def $r9d killed $r9d killed $r9 +; LINUXOSX-NEXT: subl %r12d, %r9d +; LINUXOSX-NEXT: movl %eax, %r11d +; LINUXOSX-NEXT: subl %ecx, %r11d +; LINUXOSX-NEXT: imull %r11d, %r9d +; LINUXOSX-NEXT: leal (%r13,%r14), %r11d ; LINUXOSX-NEXT: movl %r13d, %r12d ; LINUXOSX-NEXT: subl %r14d, %r12d -; LINUXOSX-NEXT: imull %r11d, %r12d -; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; LINUXOSX-NEXT: addl %r8d, %r12d -; LINUXOSX-NEXT: movl %r15d, %r8d -; LINUXOSX-NEXT: subl %r11d, %r8d -; LINUXOSX-NEXT: imull %esi, %r8d -; LINUXOSX-NEXT: addl %r12d, %r8d +; LINUXOSX-NEXT: imull %edx, %r12d +; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx +; LINUXOSX-NEXT: addl %r9d, %r12d +; LINUXOSX-NEXT: movl %r15d, %r9d +; LINUXOSX-NEXT: subl %edx, %r9d +; LINUXOSX-NEXT: imull %esi, %r9d +; LINUXOSX-NEXT: addl %r12d, %r9d ; LINUXOSX-NEXT: addl %ecx, %eax -; LINUXOSX-NEXT: imull %edi, %eax -; LINUXOSX-NEXT: imull %r10d, %r9d -; LINUXOSX-NEXT: addl %r9d, %eax -; LINUXOSX-NEXT: addl %r15d, %r11d -; LINUXOSX-NEXT: imull %edx, %r11d +; LINUXOSX-NEXT: imull %r8d, %eax +; LINUXOSX-NEXT: imull %r10d, %r11d ; LINUXOSX-NEXT: addl %r11d, %eax -; LINUXOSX-NEXT: addl %r8d, %eax +; LINUXOSX-NEXT: addl %r15d, %edx +; LINUXOSX-NEXT: imull %edi, %edx +; LINUXOSX-NEXT: addl %edx, %eax +; LINUXOSX-NEXT: addl %r9d, %eax ; LINUXOSX-NEXT: retq i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind { %x1 = sub i32 %a1, %a2 diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index eba390733794eb..2749a65a948329 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -3336,59 +3336,59 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] -; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: punpcklbw %xmm2, %xmm0 # encoding: [0x66,0x0f,0x60,0xc2] -; X86-SSE-NEXT: # xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-SSE-NEXT: punpcklwd %xmm1, %xmm0 # encoding: [0x66,0x0f,0x61,0xc1] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X86-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14] -; X86-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] -; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X86-SSE-NEXT: punpcklbw %xmm1, %xmm2 # encoding: [0x66,0x0f,0x60,0xd1] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x1c] ; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] +; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x1c] +; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] ; X86-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] -; X86-SSE-NEXT: punpcklbw %xmm3, %xmm1 # encoding: [0x66,0x0f,0x60,0xcb] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; X86-SSE-NEXT: punpcklwd %xmm2, %xmm1 # encoding: [0x66,0x0f,0x61,0xca] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: punpckldq %xmm0, %xmm1 # encoding: [0x66,0x0f,0x62,0xc8] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X86-SSE-NEXT: punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X86-SSE-NEXT: punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x24] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] -; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] 
-; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c] -; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] ; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X86-SSE-NEXT: punpcklwd %xmm3, %xmm2 # encoding: [0x66,0x0f,0x61,0xd3] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] ; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] ; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X86-SSE-NEXT: punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda] +; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] +; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] +; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x3c] ; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X86-SSE-NEXT: punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X86-SSE-NEXT: punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] @@ -3484,51 +3484,51 
@@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a ; X64-SSE-NEXT: movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0] ; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X64-SSE-NEXT: movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1] +; X64-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X64-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] ; X64-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X64-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] ; X64-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X64-SSE-NEXT: punpcklwd %xmm1, %xmm3 # encoding: [0x66,0x0f,0x61,0xd9] +; X64-SSE-NEXT: # xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X64-SSE-NEXT: punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda] +; X64-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] ; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] ; X64-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] ; X64-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] ; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X64-SSE-NEXT: punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb] -; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X64-SSE-NEXT: punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca] -; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] ; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] ; X64-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X64-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] ; X64-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] -; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] -; X64-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X64-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] -; X64-SSE-NEXT: # xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X64-SSE-NEXT: punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda] -; X64-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] +; X64-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] ; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] -; X64-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X64-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] -; X64-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X64-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X64-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x48] ; X64-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50] ; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X64-SSE-NEXT: punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4] ; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2] -; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3] -; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X64-SSE-NEXT: punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1] -; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] +; X64-SSE-NEXT: punpcklwd %xmm1, %xmm0 # encoding: [0x66,0x0f,0x61,0xc1] +; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2] +; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-SSE-NEXT: punpcklqdq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc3] +; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_set_epi8: @@ -3610,51 +3610,51 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a ; X32-SSE-NEXT: movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X32-SSE-NEXT: movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1] +; X32-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X32-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08] +; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: 
[0x67,0x0f,0xb6,0x44,0x24,0x10] ; X32-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X32-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] ; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08] +; X32-SSE-NEXT: punpcklwd %xmm1, %xmm3 # encoding: [0x66,0x0f,0x61,0xd9] +; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X32-SSE-NEXT: punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda] +; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x10] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20] ; X32-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] ; X32-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] ; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE-NEXT: punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb] -; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X32-SSE-NEXT: punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca] -; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30] ; X32-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X32-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] ; X32-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28] -; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30] -; X32-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X32-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] -; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE-NEXT: punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda] -; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X32-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] +; X32-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40] -; X32-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] 
-; X32-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] -; X32-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X32-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48] ; X32-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X32-SSE-NEXT: punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4] ; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-SSE-NEXT: punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2] -; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X32-SSE-NEXT: punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3] -; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE-NEXT: punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1] -; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] +; X32-SSE-NEXT: punpcklwd %xmm1, %xmm0 # encoding: [0x66,0x0f,0x61,0xc1] +; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE-NEXT: punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2] +; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE-NEXT: punpcklqdq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc3] +; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0] ; X32-SSE-NEXT: retq # encoding: [0xc3] ; ; X32-AVX1-LABEL: test_mm_set_epi8: @@ -3744,9 +3744,9 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x0c] -; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] ; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] +; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x14] ; X86-SSE-NEXT: movd %eax, %xmm5 # encoding: [0x66,0x0f,0x6e,0xe8] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x18] @@ -3757,18 +3757,18 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] ; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X86-SSE-NEXT: punpcklwd %xmm4, %xmm3 # encoding: [0x66,0x0f,0x61,0xdc] -; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-SSE-NEXT: punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda] -; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; 
X86-SSE-NEXT: punpcklwd %xmm3, %xmm4 # encoding: [0x66,0x0f,0x61,0xe3] +; X86-SSE-NEXT: # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; X86-SSE-NEXT: punpckldq %xmm2, %xmm4 # encoding: [0x66,0x0f,0x62,0xe2] +; X86-SSE-NEXT: # xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; X86-SSE-NEXT: punpcklwd %xmm5, %xmm6 # encoding: [0x66,0x0f,0x61,0xf5] ; X86-SSE-NEXT: # xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; X86-SSE-NEXT: punpcklwd %xmm7, %xmm0 # encoding: [0x66,0x0f,0x61,0xc7] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; X86-SSE-NEXT: punpckldq %xmm6, %xmm0 # encoding: [0x66,0x0f,0x62,0xc6] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; X86-SSE-NEXT: punpcklqdq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc3] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0] +; X86-SSE-NEXT: punpcklqdq %xmm4, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc4] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_set_epi16: @@ -4727,59 +4727,59 @@ define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 % ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] -; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: punpcklbw %xmm2, %xmm0 # encoding: [0x66,0x0f,0x60,0xc2] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-SSE-NEXT: punpcklwd %xmm1, %xmm0 # encoding: [0x66,0x0f,0x61,0xc1] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] +; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X86-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] -; X86-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c] -; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X86-SSE-NEXT: punpcklbw %xmm1, %xmm2 # encoding: [0x66,0x0f,0x60,0xd1] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] ; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] +; X86-SSE-NEXT: # xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] +; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x24] ; X86-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] -; X86-SSE-NEXT: punpcklbw %xmm3, %xmm1 # encoding: [0x66,0x0f,0x60,0xcb] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; X86-SSE-NEXT: punpcklwd %xmm2, %xmm1 # encoding: [0x66,0x0f,0x61,0xca] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: punpckldq %xmm0, %xmm1 # encoding: [0x66,0x0f,0x62,0xc8] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X86-SSE-NEXT: punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X86-SSE-NEXT: punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x1c] -; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] -; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] -; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] ; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X86-SSE-NEXT: punpcklwd %xmm3, %xmm2 # encoding: [0x66,0x0f,0x61,0xd3] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14] ; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] ; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X86-SSE-NEXT: punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda] +; X86-SSE-NEXT: # xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X86-SSE-NEXT: punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X86-SSE-NEXT: punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] @@ -4875,51 +4875,51 @@ define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 % ; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] ; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] +; X64-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X64-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] +; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] ; X64-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X64-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] ; X64-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] +; X64-SSE-NEXT: punpcklwd %xmm1, %xmm3 # encoding: [0x66,0x0f,0x61,0xd9] +; X64-SSE-NEXT: # xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X64-SSE-NEXT: punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda] +; X64-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] ; X64-SSE-NEXT: movd %eax, %xmm0 # 
encoding: [0x66,0x0f,0x6e,0xc0] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] +; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X64-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] ; X64-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] ; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X64-SSE-NEXT: punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb] -; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X64-SSE-NEXT: punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca] -; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X64-SSE-NEXT: movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1] ; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X64-SSE-NEXT: movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0] ; X64-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X64-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] ; X64-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-SSE-NEXT: movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1] -; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X64-SSE-NEXT: movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0] -; X64-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X64-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] -; X64-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X64-SSE-NEXT: punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda] -; X64-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] +; X64-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X64-SSE-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1] ; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X64-SSE-NEXT: movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2] -; X64-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X64-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] -; X64-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X64-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X64-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X64-SSE-NEXT: movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6] ; X64-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X64-SSE-NEXT: movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7] ; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X64-SSE-NEXT: punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4] ; X64-SSE-NEXT: # xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2] -; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3] -; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X64-SSE-NEXT: punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1] -; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] +; X64-SSE-NEXT: punpcklwd %xmm1, %xmm0 # encoding: [0x66,0x0f,0x61,0xc1] +; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2] +; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-SSE-NEXT: punpcklqdq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc3] +; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_setr_epi8: @@ -5001,51 +5001,51 @@ define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 % ; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28] +; X32-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X32-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20] +; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18] ; X32-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X32-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] ; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20] +; X32-SSE-NEXT: punpcklwd %xmm1, %xmm3 # encoding: [0x66,0x0f,0x61,0xd9] +; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X32-SSE-NEXT: punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda] +; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x10] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18] +; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08] ; X32-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] ; X32-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] ; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE-NEXT: punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb] -; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X32-SSE-NEXT: punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca] -; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; 
X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x10] +; X32-SSE-NEXT: movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X32-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08] +; X32-SSE-NEXT: movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0] ; X32-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X32-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] ; X32-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE-NEXT: movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1] -; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X32-SSE-NEXT: movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0] -; X32-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X32-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] -; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE-NEXT: punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda] -; X32-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X32-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] +; X32-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X32-SSE-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X32-SSE-NEXT: movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2] -; X32-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X32-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] -; X32-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] +; X32-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] +; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X32-SSE-NEXT: movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6] ; X32-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X32-SSE-NEXT: movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7] ; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X32-SSE-NEXT: punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4] ; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-SSE-NEXT: punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2] -; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X32-SSE-NEXT: punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3] -; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE-NEXT: punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1] -; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] +; X32-SSE-NEXT: punpcklwd %xmm1, %xmm0 # encoding: [0x66,0x0f,0x61,0xc1] +; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE-NEXT: punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2] +; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE-NEXT: punpcklqdq %xmm3, %xmm0 # encoding: 
[0x66,0x0f,0x6c,0xc3] +; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0] ; X32-SSE-NEXT: retq # encoding: [0xc3] ; ; X32-AVX1-LABEL: test_mm_setr_epi8: @@ -5135,9 +5135,9 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4 ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x1c] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x18] -; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x14] ; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x14] +; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] ; X86-SSE-NEXT: movd %eax, %xmm5 # encoding: [0x66,0x0f,0x6e,0xe8] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x0c] @@ -5148,18 +5148,18 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4 ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] ; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X86-SSE-NEXT: punpcklwd %xmm4, %xmm3 # encoding: [0x66,0x0f,0x61,0xdc] -; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-SSE-NEXT: punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda] -; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-SSE-NEXT: punpcklwd %xmm3, %xmm4 # encoding: [0x66,0x0f,0x61,0xe3] +; X86-SSE-NEXT: # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; X86-SSE-NEXT: punpckldq %xmm2, %xmm4 # encoding: [0x66,0x0f,0x62,0xe2] +; X86-SSE-NEXT: # xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; X86-SSE-NEXT: punpcklwd %xmm5, %xmm6 # encoding: [0x66,0x0f,0x61,0xf5] ; X86-SSE-NEXT: # xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; X86-SSE-NEXT: punpcklwd %xmm7, %xmm0 # encoding: [0x66,0x0f,0x61,0xc7] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; X86-SSE-NEXT: punpckldq %xmm6, %xmm0 # encoding: [0x66,0x0f,0x62,0xc6] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; X86-SSE-NEXT: punpcklqdq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc3] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0] +; X86-SSE-NEXT: punpcklqdq %xmm4, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc4] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_setr_epi16: diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll index e5ea911d4771a8..1e2990dea8d4cf 100644 --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -225,6 +225,7 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -236,25 +237,26 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel %ebx, %esi ; X86-NEXT: cmovel %ebx, %edi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: sarl %cl, %edx +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movl 
%esi, %ebx -; X86-NEXT: sarl $31, %ebx +; X86-NEXT: sarl %cl, %ebx +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: sarl $31, %ebp ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: shrdl %cl, %esi, %ebp +; X86-NEXT: cmovel %ebx, %ebp +; X86-NEXT: shrdl %cl, %esi, %edi ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %edx, %ebp -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmovnel %ebx, %edi +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sarl $31, %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: xorl $2147483647, %edx # imm = 0x7FFFFFFF -; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: orl %ebp, %edi ; X86-NEXT: notl %eax -; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmovel (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index f91758b861b4c4..b964f067629c9a 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -18,31 +18,31 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X64-NEXT: movdqa %xmm2, %xmm5 ; X64-NEXT: psrlq %xmm4, %xmm5 ; X64-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; X64-NEXT: movdqa %xmm0, %xmm6 -; X64-NEXT: psllq %xmm1, %xmm6 ; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psllq %xmm4, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm7 -; X64-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] -; X64-NEXT: psrlq %xmm1, %xmm6 +; X64-NEXT: psllq %xmm1, %xmm3 +; X64-NEXT: movdqa %xmm0, %xmm6 +; X64-NEXT: psllq %xmm4, %xmm6 +; X64-NEXT: movdqa %xmm6, %xmm7 +; X64-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; X64-NEXT: psrlq %xmm1, %xmm3 ; X64-NEXT: psrlq %xmm4, %xmm7 -; X64-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; X64-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] ; X64-NEXT: xorpd %xmm5, %xmm7 ; X64-NEXT: psubq %xmm5, %xmm7 ; X64-NEXT: pcmpeqd %xmm0, %xmm7 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,0,3,2] ; X64-NEXT: pand %xmm7, %xmm1 -; X64-NEXT: andpd %xmm1, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X64-NEXT: andpd %xmm1, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-NEXT: pand %xmm2, %xmm0 -; X64-NEXT: pxor %xmm5, %xmm5 -; X64-NEXT: pcmpgtd %xmm4, %xmm5 -; X64-NEXT: por %xmm2, %xmm5 +; X64-NEXT: pxor %xmm4, %xmm4 +; X64-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-NEXT: por %xmm2, %xmm4 ; X64-NEXT: pcmpeqd %xmm2, %xmm2 -; X64-NEXT: pxor %xmm5, %xmm2 +; X64-NEXT: pxor %xmm4, %xmm2 ; X64-NEXT: por %xmm0, %xmm2 ; X64-NEXT: pandn %xmm2, %xmm1 -; X64-NEXT: por %xmm3, %xmm1 +; X64-NEXT: por %xmm6, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; @@ -71,74 +71,78 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: subl $20, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edx, %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: shldl %cl, %edx, %ebx ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %eax, %edi +; X86-NEXT: cmovnel %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sarl %cl, %eax ; 
X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sarl %cl, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %eax +; X86-NEXT: sarl $31, %ebx ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovel %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: shldl %cl, %esi, %ebx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovnel %eax, %ebx +; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: movl $0, %edx ; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sarl %cl, %esi -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sarl %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: cmovel %eax, %edx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shrdl %cl, %edi, %eax ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: cmovnel (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %ebx, %edi +; X86-NEXT: shrdl %cl, %esi, %edi ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NEXT: xorl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %esi -; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: notl %eax +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi ; X86-NEXT: xorl %ebp, %edx ; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: xorl $2147483647, %ebx # imm = 0x7FFFFFFF ; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %ebp ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: cmovel %ebx, %esi +; X86-NEXT: cmovel %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 
12(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: movl %ebp, 8(%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload @@ -156,41 +160,41 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-LABEL: vec_v4i32: ; X64: # %bb.0: -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] -; X64-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,1,1,4,5,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X64-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; X64-NEXT: pslld $23, %xmm1 ; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-NEXT: cvttps2dq %xmm1, %xmm6 +; X64-NEXT: cvttps2dq %xmm1, %xmm5 ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: pmuludq %xmm6, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; X64-NEXT: pmuludq %xmm5, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X64-NEXT: pmuludq %xmm7, %xmm6 -; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; X64-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] -; X64-NEXT: movdqa %xmm2, %xmm7 -; X64-NEXT: psrad %xmm6, %xmm7 -; X64-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] -; X64-NEXT: movdqa %xmm1, %xmm6 -; X64-NEXT: psrad %xmm3, %xmm6 -; X64-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; X64-NEXT: movdqa %xmm2, %xmm3 -; X64-NEXT: psrad %xmm4, %xmm3 -; X64-NEXT: psrad %xmm5, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X64-NEXT: pmuludq %xmm7, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; X64-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] +; X64-NEXT: movdqa %xmm6, %xmm7 +; X64-NEXT: psrad %xmm5, %xmm7 +; X64-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] +; X64-NEXT: movdqa %xmm1, %xmm5 +; X64-NEXT: psrad %xmm2, %xmm5 +; X64-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; X64-NEXT: movdqa %xmm6, %xmm2 +; X64-NEXT: psrad %xmm3, %xmm2 +; X64-NEXT: psrad %xmm4, %xmm1 +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3] ; X64-NEXT: pcmpeqd %xmm0, %xmm1 -; X64-NEXT: pand %xmm1, %xmm2 -; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: pcmpgtd %xmm0, %xmm3 -; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-NEXT: pand %xmm1, %xmm6 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: por %xmm3, %xmm0 +; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: pandn %xmm0, %xmm1 -; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: por %xmm6, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; @@ -214,29 +218,29 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: shll %cl, %edi +; 
X86-NEXT: movl %edi, %ebp ; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %edx, %edx ; X86-NEXT: sets %bl ; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-NEXT: cmpl %ebp, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %edi, %ebp +; X86-NEXT: cmpl %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmovel %edi, %ebx +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: sarl %cl, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: sets %dl ; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF -; X86-NEXT: cmpl %eax, %edi -; X86-NEXT: cmovel %ebp, %edx +; X86-NEXT: cmpl %eax, %ebp +; X86-NEXT: cmovel %edi, %edx ; X86-NEXT: movl %esi, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %edi @@ -424,26 +428,13 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movswl %dx, %esi ; X86-NEXT: sarl %cl, %esi -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax -; X86-NEXT: sets %cl -; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovel %edx, %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi -; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: testw %ax, %ax ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF ; X86-NEXT: cmpw %si, %ax -; X86-NEXT: cmovel %edx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovel %edx, %ebx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %esi @@ -455,31 +446,44 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: addl $32767, %edx # imm = 0x7FFF ; X86-NEXT: cmpw %di, %ax ; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi -; X86-NEXT: sarl %cl, %edi +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movswl %di, %ebp +; X86-NEXT: sarl %cl, %ebp +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %al +; X86-NEXT: addl $32767, %eax # imm = 0x7FFF +; X86-NEXT: cmpw %bp, %si +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movswl %di, %ebp +; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movw %cx, 14(%eax) -; X86-NEXT: movw %dx, 12(%eax) -; X86-NEXT: movw %bx, 10(%eax) -; X86-NEXT: movw %bp, 8(%eax) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, 6(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, 2(%eax) -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, (%eax) +; X86-NEXT: cmpw %bp, %si +; X86-NEXT: cmovel %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movw %cx, 14(%esi) +; X86-NEXT: movw %ax, 12(%esi) +; X86-NEXT: movw %dx, 10(%esi) +; X86-NEXT: movw %bx, 8(%esi) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 6(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 4(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 2(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, (%esi) +; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -494,93 +498,93 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X64-LABEL: vec_v16i8: ; X64: # %bb.0: ; X64-NEXT: psllw $5, %xmm1 -; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pxor %xmm4, %xmm4 ; X64-NEXT: pcmpgtb %xmm1, %xmm4 -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: psllw $4, %xmm2 -; X64-NEXT: pand %xmm4, %xmm2 +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: psllw $4, %xmm3 +; X64-NEXT: pand %xmm4, %xmm3 ; X64-NEXT: pandn %xmm0, %xmm4 -; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-NEXT: por %xmm4, %xmm2 -; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-NEXT: por %xmm4, %xmm3 +; X64-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] +; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; X64-NEXT: paddb %xmm1, %xmm1 -; X64-NEXT: pxor %xmm6, %xmm6 -; X64-NEXT: pcmpgtb %xmm1, %xmm6 -; X64-NEXT: movdqa %xmm6, %xmm7 -; X64-NEXT: pandn %xmm2, %xmm7 -; X64-NEXT: psllw $2, %xmm2 -; X64-NEXT: pand %xmm6, %xmm2 -; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-NEXT: por %xmm7, %xmm2 +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtb %xmm1, %xmm5 +; X64-NEXT: movdqa %xmm5, %xmm7 +; X64-NEXT: pandn %xmm3, %xmm7 +; X64-NEXT: psllw $2, %xmm3 +; X64-NEXT: pand %xmm5, %xmm3 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-NEXT: por %xmm7, %xmm3 ; X64-NEXT: paddb %xmm1, %xmm1 -; X64-NEXT: pxor %xmm6, %xmm6 -; X64-NEXT: pcmpgtb %xmm1, %xmm6 -; X64-NEXT: movdqa %xmm6, %xmm1 -; X64-NEXT: pandn %xmm2, %xmm1 -; X64-NEXT: paddb %xmm2, %xmm2 -; X64-NEXT: pand %xmm6, %xmm2 -; X64-NEXT: por %xmm1, %xmm2 -; X64-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtb %xmm1, %xmm5 +; X64-NEXT: movdqa %xmm5, %xmm1 +; X64-NEXT: pandn %xmm3, %xmm1 +; X64-NEXT: paddb %xmm3, %xmm3 +; X64-NEXT: pand %xmm5, %xmm3 +; X64-NEXT: por %xmm1, %xmm3 +; X64-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; 
X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: pcmpgtw %xmm4, %xmm1 +; X64-NEXT: pcmpgtw %xmm6, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm7 -; X64-NEXT: pandn %xmm6, %xmm7 -; X64-NEXT: psraw $4, %xmm6 -; X64-NEXT: pand %xmm1, %xmm6 -; X64-NEXT: por %xmm7, %xmm6 -; X64-NEXT: paddw %xmm4, %xmm4 +; X64-NEXT: pandn %xmm5, %xmm7 +; X64-NEXT: psraw $4, %xmm5 +; X64-NEXT: pand %xmm1, %xmm5 +; X64-NEXT: por %xmm7, %xmm5 +; X64-NEXT: paddw %xmm6, %xmm6 ; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: pcmpgtw %xmm4, %xmm1 +; X64-NEXT: pcmpgtw %xmm6, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm7 -; X64-NEXT: pandn %xmm6, %xmm7 -; X64-NEXT: psraw $2, %xmm6 -; X64-NEXT: pand %xmm1, %xmm6 -; X64-NEXT: por %xmm7, %xmm6 -; X64-NEXT: paddw %xmm4, %xmm4 +; X64-NEXT: pandn %xmm5, %xmm7 +; X64-NEXT: psraw $2, %xmm5 +; X64-NEXT: pand %xmm1, %xmm5 +; X64-NEXT: por %xmm7, %xmm5 +; X64-NEXT: paddw %xmm6, %xmm6 ; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: pcmpgtw %xmm4, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: pandn %xmm6, %xmm4 -; X64-NEXT: psraw $1, %xmm6 -; X64-NEXT: pand %xmm1, %xmm6 -; X64-NEXT: por %xmm4, %xmm6 -; X64-NEXT: psrlw $8, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: pxor %xmm4, %xmm4 -; X64-NEXT: pcmpgtw %xmm5, %xmm4 -; X64-NEXT: movdqa %xmm4, %xmm7 +; X64-NEXT: pcmpgtw %xmm6, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm6 +; X64-NEXT: pandn %xmm5, %xmm6 +; X64-NEXT: psraw $1, %xmm5 +; X64-NEXT: pand %xmm1, %xmm5 +; X64-NEXT: por %xmm6, %xmm5 +; X64-NEXT: psrlw $8, %xmm5 +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; X64-NEXT: pxor %xmm6, %xmm6 +; X64-NEXT: pcmpgtw %xmm4, %xmm6 +; X64-NEXT: movdqa %xmm6, %xmm7 ; X64-NEXT: pandn %xmm1, %xmm7 ; X64-NEXT: psraw $4, %xmm1 -; X64-NEXT: pand %xmm4, %xmm1 +; X64-NEXT: pand %xmm6, %xmm1 ; X64-NEXT: por %xmm7, %xmm1 -; X64-NEXT: paddw %xmm5, %xmm5 -; X64-NEXT: pxor %xmm4, %xmm4 -; X64-NEXT: pcmpgtw %xmm5, %xmm4 -; X64-NEXT: movdqa %xmm4, %xmm7 +; X64-NEXT: paddw %xmm4, %xmm4 +; X64-NEXT: pxor %xmm6, %xmm6 +; X64-NEXT: pcmpgtw %xmm4, %xmm6 +; X64-NEXT: movdqa %xmm6, %xmm7 ; X64-NEXT: pandn %xmm1, %xmm7 ; X64-NEXT: psraw $2, %xmm1 -; X64-NEXT: pand %xmm4, %xmm1 +; X64-NEXT: pand %xmm6, %xmm1 ; X64-NEXT: por %xmm7, %xmm1 -; X64-NEXT: paddw %xmm5, %xmm5 -; X64-NEXT: pxor %xmm4, %xmm4 -; X64-NEXT: pcmpgtw %xmm5, %xmm4 -; X64-NEXT: movdqa %xmm4, %xmm5 -; X64-NEXT: pandn %xmm1, %xmm5 +; X64-NEXT: paddw %xmm4, %xmm4 +; X64-NEXT: pxor %xmm6, %xmm6 +; X64-NEXT: pcmpgtw %xmm4, %xmm6 +; X64-NEXT: movdqa %xmm6, %xmm4 +; X64-NEXT: pandn %xmm1, %xmm4 ; X64-NEXT: psraw $1, %xmm1 -; X64-NEXT: pand %xmm4, %xmm1 -; X64-NEXT: por %xmm5, %xmm1 +; X64-NEXT: pand %xmm6, %xmm1 +; X64-NEXT: por %xmm4, %xmm1 ; X64-NEXT: psrlw $8, %xmm1 -; X64-NEXT: packuswb %xmm6, %xmm1 +; X64-NEXT: packuswb %xmm5, %xmm1 ; X64-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-NEXT: pand %xmm1, %xmm2 -; X64-NEXT: pcmpgtb %xmm0, %xmm3 -; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-NEXT: pand %xmm1, %xmm3 +; X64-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: por %xmm3, %xmm0 +; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: pandn %xmm0, %xmm1 -; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: por %xmm3, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; @@ -633,49 +637,37 
@@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $44, %esp -; X86-NEXT: movb {{[0-9]+}}(%esp), %dh -; X86-NEXT: movb {{[0-9]+}}(%esp), %dl -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movb %ch, %bh +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: movb %dh, %bh ; X86-NEXT: shlb %cl, %bh ; X86-NEXT: movzbl %bh, %esi ; X86-NEXT: sarb %cl, %bh ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testb %ch, %ch +; X86-NEXT: testb %dh, %dh ; X86-NEXT: sets %al ; X86-NEXT: addl $127, %eax -; X86-NEXT: cmpb %bh, %ch +; X86-NEXT: cmpb %bh, %dh ; X86-NEXT: cmovel %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movb %dl, %cl ; X86-NEXT: shlb %cl, %al ; X86-NEXT: movzbl %al, %esi ; X86-NEXT: sarb %cl, %al -; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: testb %bl, %bl -; X86-NEXT: sets %cl -; X86-NEXT: addl $127, %ecx +; X86-NEXT: sets %dl +; X86-NEXT: addl $127, %edx ; X86-NEXT: cmpb %al, %bl -; X86-NEXT: cmovel %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %dh, %al -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shlb %cl, %al -; X86-NEXT: movzbl %al, %esi -; X86-NEXT: sarb %cl, %al -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testb %dh, %dh -; X86-NEXT: sets %cl -; X86-NEXT: addl $127, %ecx -; X86-NEXT: cmpb %al, %dh -; X86-NEXT: cmovel %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movb %ah, %al -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: shlb %cl, %al ; X86-NEXT: movzbl %al, %esi ; X86-NEXT: sarb %cl, %al @@ -684,6 +676,18 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: sets %dl ; X86-NEXT: addl $127, %edx ; X86-NEXT: cmpb %al, %ah +; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movb %ch, %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: shlb %cl, %al +; X86-NEXT: movzbl %al, %esi +; X86-NEXT: sarb %cl, %al +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: testb %ch, %ch +; X86-NEXT: sets %dl +; X86-NEXT: addl $127, %edx +; X86-NEXT: cmpb %al, %ch ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovel %esi, %edx diff --git a/llvm/test/CodeGen/X86/ssub_sat.ll b/llvm/test/CodeGen/X86/ssub_sat.ll index 8ecc8b39ac4683..1c5cdde3d49e7e 100644 --- a/llvm/test/CodeGen/X86/ssub_sat.ll +++ b/llvm/test/CodeGen/X86/ssub_sat.ll @@ -13,13 +13,13 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; X86-LABEL: func: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: setns %cl -; X86-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: subl %edx, %eax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setns %dl +; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; X86-NEXT: subl %ecx, 
%eax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: func: @@ -72,13 +72,13 @@ define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind { ; X86-LABEL: func16: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpw %dx, %ax -; X86-NEXT: setns %cl -; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: subw %dx, %ax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: setns %dl +; X86-NEXT: addl $32767, %edx # imm = 0x7FFF +; X86-NEXT: subw %cx, %ax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -100,14 +100,14 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind { ; X86-LABEL: func8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb %dl, %al -; X86-NEXT: setns %cl -; X86-NEXT: addl $127, %ecx -; X86-NEXT: subb %dl, %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: setns %dl +; X86-NEXT: addl $127, %edx +; X86-NEXT: subb %cl, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -164,30 +164,30 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: setns %bl +; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setns %al -; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: subl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmovol %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: setns %al -; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: subl %esi, %edx +; X86-NEXT: cmovol %ebx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: setns %bl +; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-NEXT: subl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovol %eax, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edi, %esi -; X86-NEXT: setns %al -; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: subl %edi, %esi +; X86-NEXT: cmovol %ebx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: setns %bl +; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-NEXT: subl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovol %eax, %esi +; X86-NEXT: cmovol %ebx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpl %eax, %edi diff --git a/llvm/test/CodeGen/X86/ssub_sat_plus.ll b/llvm/test/CodeGen/X86/ssub_sat_plus.ll index 5baf7a1dac74c8..15b2d72cbea3e9 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_plus.ll @@ -12,14 +12,14 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind 
{ ; X86-LABEL: func32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: setns %cl -; X86-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: subl %edx, %eax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setns %dl +; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: func32: @@ -75,14 +75,14 @@ define signext i16 @func16(i16 signext %x, i16 signext %y, i16 signext %z) nounw ; X86-LABEL: func16: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imulw {{[0-9]+}}(%esp), %dx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpw %dx, %ax -; X86-NEXT: setns %cl -; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: subw %dx, %ax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imulw {{[0-9]+}}(%esp), %cx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: setns %dl +; X86-NEXT: addl $32767, %edx # imm = 0x7FFF +; X86-NEXT: subw %cx, %ax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -105,16 +105,16 @@ define signext i16 @func16(i16 signext %x, i16 signext %y, i16 signext %z) nounw define signext i8 @func8(i8 signext %x, i8 signext %y, i8 signext %z) nounwind { ; X86-LABEL: func8: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mulb {{[0-9]+}}(%esp) -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb %al, %dl -; X86-NEXT: setns %cl -; X86-NEXT: addl $127, %ecx -; X86-NEXT: subb %al, %dl -; X86-NEXT: movzbl %dl, %eax -; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: setns %dl +; X86-NEXT: addl $127, %edx +; X86-NEXT: subb %al, %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: cmovol %edx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index d99d5aaa875365..42dbc7474549d1 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -1063,27 +1063,27 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm3, %xmm11 ; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: movdqa %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm1, %xmm8 ; SSE41-NEXT: pxor %xmm12, %xmm12 -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: psubd %xmm4, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: psubd %xmm4, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm9, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm8 -; SSE41-NEXT: movdqa %xmm9, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: psubd %xmm5, %xmm1 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm5 -; SSE41-NEXT: pcmpgtd 
%xmm1, %xmm9 -; SSE41-NEXT: pxor %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE41-NEXT: pxor %xmm5, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm10, %xmm2 ; SSE41-NEXT: psubd %xmm6, %xmm2 @@ -1105,7 +1105,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE41-NEXT: pxor %xmm4, %xmm5 ; SSE41-NEXT: movdqa %xmm11, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: movaps %xmm8, %xmm0 +; SSE41-NEXT: movaps %xmm9, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i32: @@ -1494,11 +1494,11 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-NEXT: pand %xmm0, %xmm5 ; SSE41-NEXT: por %xmm2, %xmm5 ; SSE41-NEXT: pxor %xmm8, %xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm8, %xmm2 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1521,9 +1521,9 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-NEXT: por %xmm3, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1946,28 +1946,28 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; ; AVX1-LABEL: v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsubq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm7 +; AVX1-NEXT: vpsubq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm7 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX1-NEXT: vxorpd %ymm0, %ymm6, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm6 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vxorpd %ymm4, %ymm2, 
%ymm2 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm7 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpsubq %xmm2, %xmm7, %xmm2 @@ -1977,10 +1977,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX1-NEXT: vxorpd %ymm1, %ymm6, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm6 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vxorpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm6, %ymm1 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll index b5b9ce95a46bac..ce15abfa671637 100644 --- a/llvm/test/CodeGen/X86/stack-clash-large.ll +++ b/llvm/test/CodeGen/X86/stack-clash-large.ll @@ -141,16 +141,16 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i ; CHECK-X86-NEXT: .cfi_offset %edx, -12 ; CHECK-X86-NEXT: .cfi_offset %esi, -8 ; CHECK-X86-NEXT: movl 72056(%esp), %eax -; CHECK-X86-NEXT: movl 72048(%esp), %edx -; CHECK-X86-NEXT: movl 72040(%esp), %ecx +; CHECK-X86-NEXT: movl 72048(%esp), %ecx +; CHECK-X86-NEXT: movl 72040(%esp), %edx ; CHECK-X86-NEXT: movl 72032(%esp), %esi ; CHECK-X86-NEXT: addl 72036(%esp), %esi -; CHECK-X86-NEXT: addl 72044(%esp), %ecx -; CHECK-X86-NEXT: addl %esi, %ecx -; CHECK-X86-NEXT: addl 72052(%esp), %edx +; CHECK-X86-NEXT: addl 72044(%esp), %edx +; CHECK-X86-NEXT: addl %esi, %edx +; CHECK-X86-NEXT: addl 72052(%esp), %ecx ; CHECK-X86-NEXT: addl 72060(%esp), %eax -; CHECK-X86-NEXT: addl %edx, %eax ; CHECK-X86-NEXT: addl %ecx, %eax +; CHECK-X86-NEXT: addl %edx, %eax ; CHECK-X86-NEXT: movl %eax, 392(%esp) ; CHECK-X86-NEXT: movl %eax, 28792(%esp) ; CHECK-X86-NEXT: addl $72012, %esp # imm = 0x1194C diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll index 787a33aa49b20e..7da634314b6889 100644 --- a/llvm/test/CodeGen/X86/statepoint-live-in.ll +++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll @@ -257,8 +257,6 @@ define void @test8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %r10d -; CHECK-NEXT: movl %r8d, %r9d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -284,7 +282,7 @@ define void @test8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r14d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: callq _bar ## 72-byte Folded Reload ; CHECK-NEXT: Ltmp9: @@ -432,8 +430,7 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: 
movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -446,57 +443,50 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %edx, %r14d +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %r8d, %r15d +; CHECK-NEXT: movl %r8d, %r14d ; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _bar ## 160-byte Folded Reload ; CHECK-NEXT: Ltmp13: ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload +; CHECK-NEXT: addq %rcx, %r14 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r14 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Folded Reload -; CHECK-NEXT: addq %r14, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r15 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: addq %r15, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r12 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r12 -; CHECK-NEXT: addq %rbx, %r12 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbp +; CHECK-NEXT: addq %r14, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp -; CHECK-NEXT: addq %r12, %rbp -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r13 +; CHECK-NEXT: addq %r15, %rbp ; CHECK-NEXT: movl 
{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -504,17 +494,25 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: addq %rbp, %r13 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: addq %rax, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: addq %rax, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: addq %rax, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: addq %r13, %rcx -; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq %r13, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %r12, %rbx +; CHECK-NEXT: movq %rbx, %rax ; CHECK-NEXT: addq $168, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/statepoint-ra.ll b/llvm/test/CodeGen/X86/statepoint-ra.ll index 4e57648820c4b3..9a555403a908c0 100644 --- a/llvm/test/CodeGen/X86/statepoint-ra.ll +++ b/llvm/test/CodeGen/X86/statepoint-ra.ll @@ -65,10 +65,10 @@ declare token @llvm.experimental.gc.statepoint.p0(i64 , i32 , ptr, i32 , i32 , . ;CHECK: bb.0.bb: ;CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) ;CHECK: liveins: $rdi, $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7 -;CHECK: %55:fr64 = COPY $xmm7 +;CHECK: %48:fr64 = COPY $xmm7 ;CHECK: %10:fr64 = COPY $xmm6 ;CHECK: %45:fr64 = COPY $xmm5 -;CHECK: %52:fr64 = COPY $xmm4 +;CHECK: %56:fr64 = COPY $xmm4 ;CHECK: %59:fr64 = COPY $xmm3 ;CHECK: %6:fr64 = COPY $xmm2 ;CHECK: %64:fr64 = COPY $xmm1 @@ -80,19 +80,19 @@ declare token @llvm.experimental.gc.statepoint.p0(i64 , i32 , ptr, i32 , i32 , . 
;CHECK: %77:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16) ;CHECK: MOV64mr %stack.0, 1, $noreg, 0, $noreg, %3 :: (store (s64) into %stack.0) ;CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp -;CHECK: STATEPOINT 2882400000, 0, 0, target-flags(x86-plt) @blam, 2, 9, 2, 0, 2, 59, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 26, 2, 0, 2, 0, 1, 8, %stack.0, 0, 2, 4, %68, 2, 7, 2, 0, 2, 4, %64, 2, 7, 2, 0, 2, 4, %6, 2, 7, 2, 0, 2, 4, %59, 2, 7, 2, 0, 2, 4, %52, 2, 7, 2, 0, 2, 4, %45, 2, 7, 2, 0, 2, 4, %10, 2, 7, 2, 0, 2, 4, %55, 2, 7, 2, 0, 2, 4, %77, 2, 7, 2, 0, 2, 4, %72, 2, 7, 2, 0, 2, 4, %14, 2, 7, 2, 0, 2, 4, %82, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, 1, 8, %stack.0, 0, 2, 0, 2, 1, 0, 0, csr_64_mostregs, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0) +;CHECK: STATEPOINT 2882400000, 0, 0, target-flags(x86-plt) @blam, 2, 9, 2, 0, 2, 59, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 26, 2, 0, 2, 0, 1, 8, %stack.0, 0, 2, 4, %68, 2, 7, 2, 0, 2, 4, %64, 2, 7, 2, 0, 2, 4, %6, 2, 7, 2, 0, 2, 4, %59, 2, 7, 2, 0, 2, 4, %56, 2, 7, 2, 0, 2, 4, %45, 2, 7, 2, 0, 2, 4, %10, 2, 7, 2, 0, 2, 4, %48, 2, 7, 2, 0, 2, 4, %77, 2, 7, 2, 0, 2, 4, %72, 2, 7, 2, 0, 2, 4, %14, 2, 7, 2, 0, 2, 4, %82, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, 1, 8, %stack.0, 0, 2, 0, 2, 1, 0, 0, csr_64_mostregs, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0) ;CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ;CHECK: %17:gr32 = MOV32r0 implicit-def dead $eflags ;CHECK: TEST8rr %17.sub_8bit, %17.sub_8bit, implicit-def $eflags ;CHECK: MOVSDmr %stack.1, 1, $noreg, 0, $noreg, %45 :: (store (s64) into %stack.1) -;CHECK: MOVSDmr %stack.2, 1, $noreg, 0, $noreg, %52 :: (store (s64) into %stack.2) +;CHECK: MOVSDmr %stack.3, 1, $noreg, 0, $noreg, %56 :: (store (s64) into %stack.3) ;CHECK: MOVSDmr %stack.5, 1, $noreg, 0, $noreg, %64 :: (store (s64) into %stack.5) ;CHECK: MOVSDmr %stack.6, 1, $noreg, 0, $noreg, %68 :: (store (s64) into %stack.6) ;CHECK: JCC_1 %bb.2, 4, implicit killed $eflags ;CHECK: bb.1: ;CHECK: successors: %bb.3(0x80000000) ;CHECK: %60:fr64 = MOVSDrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s64) from constant-pool) -;CHECK: MOVSDmr %stack.3, 1, $noreg, 0, $noreg, %60 :: (store (s64) into %stack.3) +;CHECK: MOVSDmr %stack.2, 1, $noreg, 0, $noreg, %60 :: (store (s64) into %stack.2) ;CHECK: MOVSDmr %stack.4, 1, $noreg, 0, $noreg, %60 :: (store (s64) into %stack.4) ;CHECK: MOVSDmr %stack.7, 1, $noreg, 0, $noreg, %60 :: (store (s64) into %stack.7) ;CHECK: JMP_1 %bb.3 @@ -104,15 +104,15 @@ declare token @llvm.experimental.gc.statepoint.p0(i64 , i32 , ptr, i32 , i32 , . 
;CHECK: $xmm0 = COPY %68 ;CHECK: $xmm1 = COPY %64 ;CHECK: $xmm2 = COPY %6 -;CHECK: $xmm3 = COPY %52 +;CHECK: $xmm3 = COPY %56 ;CHECK: $xmm4 = COPY %45 ;CHECK: $xmm5 = COPY %10 ;CHECK: $xmm6 = COPY %77 ;CHECK: $xmm7 = COPY %72 -;CHECK: MOVSDmr %stack.3, 1, $noreg, 0, $noreg, %55 :: (store (s64) into %stack.3) +;CHECK: MOVSDmr %stack.2, 1, $noreg, 0, $noreg, %48 :: (store (s64) into %stack.2) ;CHECK: MOVSDmr %stack.4, 1, $noreg, 0, $noreg, %59 :: (store (s64) into %stack.4) ;CHECK: MOVSDmr %stack.7, 1, $noreg, 0, $noreg, %82 :: (store (s64) into %stack.7) -;CHECK: STATEPOINT 2, 5, 9, undef %22:gr64, $rdi, $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, 2, 0, 2, 0, 2, 59, 2, 0, 2, 2, 2, 0, 2, 70, 2, 0, 2, 26, 2, 0, 2, 0, 2, 0, 2, 4, 1, 8, %stack.6, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.5, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.4, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.1, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.0, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, 2, 0, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $eax :: (load (s64) from %stack.1), (load (s64) from %stack.2), (load (s64) from %stack.3), (load (s64) from %stack.4), (load (s64) from %stack.5), (load (s64) from %stack.6), (load (s64) from %fixed-stack.2), (load (s64) from %fixed-stack.3, align 16), (load (s64) from %fixed-stack.0) +;CHECK: STATEPOINT 2, 5, 9, undef %22:gr64, $rdi, $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, 2, 0, 2, 0, 2, 59, 2, 0, 2, 2, 2, 0, 2, 70, 2, 0, 2, 26, 2, 0, 2, 0, 2, 0, 2, 4, 1, 8, %stack.6, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.5, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.4, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.1, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.0, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, 2, 0, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $eax :: (load (s64) from %stack.1), (load (s64) from %stack.2), (load (s64) from %stack.3), (load (s64) from %stack.4), (load (s64) from %stack.5), (load (s64) from %stack.6), (load (s64) from %fixed-stack.2), (load (s64) from %fixed-stack.3, align 16), (load (s64) from %fixed-stack.0) ;CHECK: ADJCALLSTACKUP64 8, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ;CHECK: bb.3.bb15: ;CHECK: successors: %bb.7(0x7ffff800), %bb.4(0x00000800) @@ -125,7 +125,7 @@ declare token @llvm.experimental.gc.statepoint.p0(i64 , i32 , ptr, i32 , i32 , . 
;CHECK: EH_LABEL ;CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ;CHECK: $edx = MOV32r0 implicit-def dead $eflags -;CHECK: STATEPOINT 1, 16, 3, undef %29:gr64, undef $edi, undef $rsi, $edx, 2, 0, 2, 0, 2, 105, 2, 0, 2, 2, 2, 0, 2, 97, 2, 0, 2, 26, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 2, 2, 3, 2, 0, 2, 20, 2, 0, 2, 0, 2, 4278124286, 2, 4, 1, 8, %stack.6, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.5, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.4, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.1, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.7, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, 2, 4278124286, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $eax :: (load (s64) from %stack.1), (load (s64) from %stack.2), (load (s64) from %stack.3), (load (s64) from %stack.4), (load (s64) from %stack.5), (load (s64) from %stack.6), (load (s64) from %fixed-stack.2), (load (s64) from %fixed-stack.3, align 16), (load (s64) from %stack.7) +;CHECK: STATEPOINT 1, 16, 3, undef %29:gr64, undef $edi, undef $rsi, $edx, 2, 0, 2, 0, 2, 105, 2, 0, 2, 2, 2, 0, 2, 97, 2, 0, 2, 26, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 2, 2, 3, 2, 0, 2, 20, 2, 0, 2, 0, 2, 4278124286, 2, 4, 1, 8, %stack.6, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.5, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.4, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.1, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.7, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, 2, 4278124286, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $eax :: (load (s64) from %stack.1), (load (s64) from %stack.2), (load (s64) from %stack.3), (load (s64) from %stack.4), (load (s64) from %stack.5), (load (s64) from %stack.6), (load (s64) from %fixed-stack.2), (load (s64) from %fixed-stack.3, align 16), (load (s64) from %stack.7) ;CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ;CHECK: EH_LABEL ;CHECK: JMP_1 %bb.5 @@ -137,15 +137,15 @@ declare token @llvm.experimental.gc.statepoint.p0(i64 , i32 , ptr, i32 , i32 , . 
;CHECK: $xmm0 = MOVSDrm_alt %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) ;CHECK: $xmm1 = MOVSDrm_alt %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) ;CHECK: $xmm2 = MOVSDrm_alt %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) -;CHECK: $xmm3 = MOVSDrm_alt %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2) +;CHECK: $xmm3 = MOVSDrm_alt %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) ;CHECK: $xmm4 = MOVSDrm_alt %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1) -;CHECK: $xmm5 = MOVSDrm_alt %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) +;CHECK: $xmm5 = MOVSDrm_alt %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2) ;CHECK: %80:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16) ;CHECK: $xmm6 = COPY %80 ;CHECK: $esi = MOV32ri 51 ;CHECK: %75:fr64 = MOVSDrm_alt %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2) ;CHECK: $xmm7 = COPY %75 -;CHECK: STATEPOINT 2, 5, 10, undef %36:gr64, undef $rdi, $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, killed $esi, 2, 0, 2, 0, 2, 105, 2, 0, 2, 2, 2, 0, 2, 97, 2, 0, 2, 26, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 2, 2, 2, 2, 46, 2, 0, 2, 20, 2, 0, 2, 0, 2, 4278124286, 2, 4, 1, 8, %stack.6, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.5, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.4, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.1, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.7, 0, 2, 7, 2, 0, 2, 3, 2, 51, 2, 1, 2, 4278124286, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp :: (load (s64) from %stack.1), (load (s64) from %stack.2), (load (s64) from %stack.3), (load (s64) from %stack.4), (load (s64) from %stack.5), (load (s64) from %stack.6), (load (s64) from %fixed-stack.2), (load (s64) from %fixed-stack.3, align 16), (load (s64) from %stack.7) +;CHECK: STATEPOINT 2, 5, 10, undef %36:gr64, undef $rdi, $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, killed $esi, 2, 0, 2, 0, 2, 105, 2, 0, 2, 2, 2, 0, 2, 97, 2, 0, 2, 26, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 2, 2, 2, 2, 46, 2, 0, 2, 20, 2, 0, 2, 0, 2, 4278124286, 2, 4, 1, 8, %stack.6, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.5, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.4, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.1, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.3, 0, 2, 7, 2, 0, 2, 4, 1, 8, %fixed-stack.2, 0, 2, 7, 2, 0, 2, 4, 1, 8, %stack.7, 0, 2, 7, 2, 0, 2, 3, 2, 51, 2, 1, 2, 4278124286, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp :: (load (s64) from %stack.1), (load (s64) from %stack.2), (load (s64) from %stack.3), (load (s64) from %stack.4), (load (s64) from %stack.5), (load (s64) from %stack.6), (load (s64) from %fixed-stack.2), (load (s64) from %fixed-stack.3, align 16), (load (s64) 
from %stack.7) ;CHECK: ADJCALLSTACKUP64 8, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ;CHECK: bb.6.bb23 (landing-pad): ;CHECK: liveins: $rax, $rdx diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll index 5c26e29dce45ed..71c6b59557b106 100644 --- a/llvm/test/CodeGen/X86/statepoint-regs.ll +++ b/llvm/test/CodeGen/X86/statepoint-regs.ll @@ -320,16 +320,16 @@ define void @test8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: subq $136, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: subq $152, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 208 ; CHECK-NEXT: .cfi_offset %rbx, -56 ; CHECK-NEXT: .cfi_offset %r12, -48 ; CHECK-NEXT: .cfi_offset %r13, -40 ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl %r8d, (%rsp) ## 4-byte Spill +; CHECK-NEXT: movl %r9d, %ebx +; CHECK-NEXT: movl %r8d, %ebp ; CHECK-NEXT: movl %ecx, %r14d ; CHECK-NEXT: movl %edx, %r15d ; CHECK-NEXT: movl %esi, %r12d @@ -366,11 +366,13 @@ define void @test8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp -; CHECK-NEXT: callq _bar ## 132-byte Folded Reload +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: callq _bar ## 144-byte Folded Reload ; CHECK-NEXT: Ltmp10: -; CHECK-NEXT: addq $136, %rsp +; CHECK-NEXT: addq $152, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 ; CHECK-NEXT: popq %r13 @@ -544,8 +546,7 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -558,57 +559,50 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %edx, %r14d +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %r8d, %r15d +; CHECK-NEXT: movl %r8d, %r14d ; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; 
CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _bar ## 160-byte Folded Reload ; CHECK-NEXT: Ltmp14: ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload +; CHECK-NEXT: addq %rcx, %r14 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r14 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Folded Reload -; CHECK-NEXT: addq %r14, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r15 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: addq %r15, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r12 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r12 -; CHECK-NEXT: addq %rbx, %r12 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbp +; CHECK-NEXT: addq %r14, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp -; CHECK-NEXT: addq %r12, %rbp -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r13 +; CHECK-NEXT: addq %r15, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -616,17 +610,25 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: addq %rbp, %r13 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: addq %rax, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: addq %rax, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: addq %rax, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: addq %r13, %rcx -; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq %r13, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %r12, %rbx +; CHECK-NEXT: movq %rbx, %rax ; 
CHECK-NEXT: addq $168, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll index 0594f2fbc0a35f..944fd718acc506 100644 --- a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll +++ b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll @@ -34,87 +34,90 @@ define i32 @test_spill( ; CHECK-VREG-NEXT: [[MOV64rm11]]:gr64, [[MOV64rm10]]:gr64, [[MOV64rm9]]:gr64, [[MOV64rm8]]:gr64, [[MOV64rm7]]:gr64, [[MOV64rm6]]:gr64, [[MOV64rm5]]:gr64, [[MOV64rm4]]:gr64, [[MOV64rm3]]:gr64, [[MOV64rm2]]:gr64, [[MOV64rm1]]:gr64, [[MOV64rm]]:gr64, [[COPY]]:gr64, [[COPY1]]:gr64, [[COPY2]]:gr64, [[COPY3]]:gr64, [[COPY4]]:gr64, [[COPY5]]:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, [[MOV64rm11]](tied-def 0), [[MOV64rm10]](tied-def 1), [[MOV64rm9]](tied-def 2), [[MOV64rm8]](tied-def 3), [[MOV64rm7]](tied-def 4), [[MOV64rm6]](tied-def 5), [[MOV64rm5]](tied-def 6), [[MOV64rm4]](tied-def 7), [[MOV64rm3]](tied-def 8), [[MOV64rm2]](tied-def 9), [[MOV64rm1]](tied-def 10), [[MOV64rm]](tied-def 11), [[COPY]](tied-def 12), [[COPY1]](tied-def 13), [[COPY2]](tied-def 14), [[COPY3]](tied-def 15), [[COPY4]](tied-def 16), [[COPY5]](tied-def 17), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp ; CHECK-VREG-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY5]], 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm]], [[COPY4]], 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[COPY3]], 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[COPY2]], 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[COPY1]], 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[COPY]], 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm]], 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm1]], 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm2]], 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm3]], 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm4]], 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1) - ; CHECK-VREG-NEXT: 
[[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm5]], 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm6]], 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm7]], 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm8]], 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm9]], 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm10]], 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1) - ; CHECK-VREG-NEXT: [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm11]], 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1) - ; CHECK-VREG-NEXT: $eax = COPY [[ADD32rm1]] + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY4]], 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY3]], 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY2]], 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY1]], 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm]], 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm1]], 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm2]], 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm3]], 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm4]], 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm5]], 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm6]], 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm7]], 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) + ; 
CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm8]], 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm9]], 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm10]], 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1) + ; CHECK-VREG-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm11]], 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1) + ; CHECK-VREG-NEXT: $eax = COPY [[MOV32rm]] ; CHECK-VREG-NEXT: RET 0, killed $eax + ; ; CHECK-PREG-LABEL: name: test_spill ; CHECK-PREG: bb.0 (%ir-block.0): ; CHECK-PREG-NEXT: liveins: $rcx, $rdi, $rdx, $rsi, $r8, $r9 ; CHECK-PREG-NEXT: {{ $}} - ; CHECK-PREG-NEXT: MOV64mr %stack.2, 1, $noreg, 0, $noreg, $r9 :: (store (s64) into %stack.2) - ; CHECK-PREG-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.6) - ; CHECK-PREG-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.9) - ; CHECK-PREG-NEXT: MOV64mr %stack.10, 1, $noreg, 0, $noreg, $rdx :: (store (s64) into %stack.10) - ; CHECK-PREG-NEXT: MOV64mr %stack.11, 1, $noreg, 0, $noreg, $rsi :: (store (s64) into %stack.11) - ; CHECK-PREG-NEXT: renamable $rbp = COPY $rdi + ; CHECK-PREG-NEXT: renamable $rbx = COPY $r9 + ; CHECK-PREG-NEXT: renamable $r14 = COPY $r8 + ; CHECK-PREG-NEXT: renamable $r15 = COPY $rcx + ; CHECK-PREG-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, $rdx :: (store (s64) into %stack.9) + ; CHECK-PREG-NEXT: MOV64mr %stack.10, 1, $noreg, 0, $noreg, $rsi :: (store (s64) into %stack.10) + ; CHECK-PREG-NEXT: MOV64mr %stack.11, 1, $noreg, 0, $noreg, $rdi :: (store (s64) into %stack.11) ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.11, align 16) - ; CHECK-PREG-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.8) + ; CHECK-PREG-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0) ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.10) - ; CHECK-PREG-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.7) + ; CHECK-PREG-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.1) ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.9, align 16) - ; CHECK-PREG-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.5) + ; CHECK-PREG-NEXT: MOV64mr %stack.2, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.2) ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.8) - ; CHECK-PREG-NEXT: MOV64mr %stack.4, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.4) - ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.7, align 16) ; CHECK-PREG-NEXT: MOV64mr %stack.3, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.3) + ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.7, 1, $noreg, 0, $noreg :: (load (s64) from 
%fixed-stack.7, align 16) + ; CHECK-PREG-NEXT: MOV64mr %stack.4, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.4) ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.6) - ; CHECK-PREG-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.1) + ; CHECK-PREG-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.5) ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.5, align 16) - ; CHECK-PREG-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0) - ; CHECK-PREG-NEXT: renamable $r13 = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4) + ; CHECK-PREG-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.6) + ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4) + ; CHECK-PREG-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.7) ; CHECK-PREG-NEXT: renamable $r12 = MOV64rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16) - ; CHECK-PREG-NEXT: renamable $r15 = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2) - ; CHECK-PREG-NEXT: renamable $rbx = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16) - ; CHECK-PREG-NEXT: renamable $r14 = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0) + ; CHECK-PREG-NEXT: renamable $r13 = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2) + ; CHECK-PREG-NEXT: renamable $rbp = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16) + ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0) + ; CHECK-PREG-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.8) ; CHECK-PREG-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-PREG-NEXT: renamable $r14, renamable $rbx, renamable $r15, renamable $r12, renamable $r13, renamable $rbp = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, killed renamable $r14(tied-def 0), killed renamable $rbx(tied-def 1), killed renamable $r15(tied-def 2), killed renamable $r12(tied-def 3), killed renamable $r13(tied-def 4), 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.7, 0, 1, 8, %stack.8, 0, 1, 8, %stack.2, 0, 1, 8, %stack.6, 0, 1, 8, %stack.9, 0, 1, 8, %stack.10, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11) + ; CHECK-PREG-NEXT: renamable $rbp, renamable $r13, 
renamable $r12, renamable $rbx, renamable $r14, renamable $r15 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, 1, 8, %stack.8, 0, killed renamable $rbp(tied-def 0), killed renamable $r13(tied-def 1), killed renamable $r12(tied-def 2), 1, 8, %stack.7, 0, 1, 8, %stack.6, 0, 1, 8, %stack.5, 0, 1, 8, %stack.4, 0, 1, 8, %stack.3, 0, 1, 8, %stack.2, 0, 1, 8, %stack.1, 0, 1, 8, %stack.0, 0, killed renamable $rbx(tied-def 3), killed renamable $r14(tied-def 4), killed renamable $r15(tied-def 5), 1, 8, %stack.9, 0, 1, 8, %stack.10, 0, 1, 8, %stack.11, 0, 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11) ; CHECK-PREG-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-PREG-NEXT: renamable $eax = MOV32rm killed renamable $rbp, 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from 
%stack.5) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1) - ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r13, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) + ; CHECK-PREG-NEXT: renamable $eax = MOV32rm killed renamable $rax, 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = 
MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r13, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbp, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1) + ; CHECK-PREG-NEXT: renamable $rcx = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1) ; CHECK-PREG-NEXT: RET 0, $eax ptr addrspace(1) %arg00, ptr addrspace(1) %arg01, ptr addrspace(1) %arg02, ptr addrspace(1) %arg03, ptr addrspace(1) %arg04, ptr addrspace(1) %arg05, ptr addrspace(1) %arg06, ptr addrspace(1) %arg07, ptr addrspace(1) %arg08, ptr addrspace(1) %arg09, ptr addrspace(1) %arg10, ptr addrspace(1) %arg11, diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll index 
870912bb6bb1be..7d1a6171c844a9 100644 --- a/llvm/test/CodeGen/X86/sttni.ll +++ b/llvm/test/CodeGen/X86/sttni.ll @@ -1110,15 +1110,15 @@ entry: define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %iptr, ptr %fptr) nounwind { ; X86-LABEL: pcmpistr_index_flag: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X86-NEXT: setb %al -; X86-NEXT: movl %ecx, (%esi) -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: popl %esi +; X86-NEXT: setb %bl +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_index_flag: @@ -1140,13 +1140,13 @@ entry: define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %fptr) nounwind { ; X86-LABEL: pcmpistr_mask_flag: ; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 -; X86-NEXT: setb %al -; X86-NEXT: movdqa %xmm0, (%edx) -; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: setb %dl +; X86-NEXT: movdqa %xmm0, (%ecx) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_mask_flag: diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index e104d9d2b4c157..dd73db0be23e78 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -803,24 +803,24 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(ptr %p0, ptr %p1) { define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) { ; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,0,2,0] -; X86-AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm3 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,2,0] +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0] ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0] ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7 -; X86-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5 -; X86-AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; X86-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 ; X86-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 ; X86-AVX1-NEXT: vmovdqu %xmm0, ga4+16 -; X86-AVX1-NEXT: vmovdqu %xmm3, ga4 +; X86-AVX1-NEXT: vmovdqu %xmm4, ga4 ; X86-AVX1-NEXT: vmovups %ymm2, gb4+32 ; X86-AVX1-NEXT: vmovups %ymm1, gb4 ; X86-AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index e0f438eb7cc8f7..5c193fd793ab3d 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -1288,34 +1288,34 @@ define void @vec256_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, 
ptr %out.vec.p define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec256_v4i16: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movzwl 6(%rdi), %r8d +; SCALAR-NEXT: movzwl 6(%rdi), %eax ; SCALAR-NEXT: movzwl 2(%rdi), %ecx -; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: movl (%rdi), %r8d ; SCALAR-NEXT: movl 4(%rdi), %edi -; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: notl %r8d ; SCALAR-NEXT: notl %ecx ; SCALAR-NEXT: notl %edi -; SCALAR-NEXT: notl %r8d -; SCALAR-NEXT: movw %r8w, 6(%rsi) +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movw %ax, 6(%rsi) ; SCALAR-NEXT: movw %di, 4(%rsi) ; SCALAR-NEXT: movw %cx, 2(%rsi) -; SCALAR-NEXT: movw %ax, (%rsi) -; SCALAR-NEXT: movw %r8w, 6(%rdx) +; SCALAR-NEXT: movw %r8w, (%rsi) +; SCALAR-NEXT: movw %ax, 6(%rdx) ; SCALAR-NEXT: movw %di, 4(%rdx) ; SCALAR-NEXT: movw %cx, 2(%rdx) -; SCALAR-NEXT: movw %ax, (%rdx) -; SCALAR-NEXT: movw %r8w, 14(%rdx) +; SCALAR-NEXT: movw %r8w, (%rdx) +; SCALAR-NEXT: movw %ax, 14(%rdx) ; SCALAR-NEXT: movw %di, 12(%rdx) ; SCALAR-NEXT: movw %cx, 10(%rdx) -; SCALAR-NEXT: movw %ax, 8(%rdx) -; SCALAR-NEXT: movw %r8w, 22(%rdx) +; SCALAR-NEXT: movw %r8w, 8(%rdx) +; SCALAR-NEXT: movw %ax, 22(%rdx) ; SCALAR-NEXT: movw %di, 20(%rdx) ; SCALAR-NEXT: movw %cx, 18(%rdx) -; SCALAR-NEXT: movw %ax, 16(%rdx) -; SCALAR-NEXT: movw %r8w, 30(%rdx) +; SCALAR-NEXT: movw %r8w, 16(%rdx) +; SCALAR-NEXT: movw %ax, 30(%rdx) ; SCALAR-NEXT: movw %di, 28(%rdx) ; SCALAR-NEXT: movw %cx, 26(%rdx) -; SCALAR-NEXT: movw %ax, 24(%rdx) +; SCALAR-NEXT: movw %r8w, 24(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec256_v4i16: @@ -1659,123 +1659,108 @@ define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 13(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 12(%rdi), %r15d -; SCALAR-NEXT: movzbl 11(%rdi), %eax -; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 10(%rdi), %ebp -; SCALAR-NEXT: movzbl 9(%rdi), %r14d -; SCALAR-NEXT: movzbl 8(%rdi), %eax -; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 7(%rdi), %r12d +; SCALAR-NEXT: movzbl 12(%rdi), %r12d +; SCALAR-NEXT: movzbl 11(%rdi), %r15d +; SCALAR-NEXT: movzbl 10(%rdi), %r14d +; SCALAR-NEXT: movzbl 9(%rdi), %ebp +; SCALAR-NEXT: movzbl 8(%rdi), %ebx +; SCALAR-NEXT: movzbl 7(%rdi), %r11d ; SCALAR-NEXT: movzbl 6(%rdi), %r10d ; SCALAR-NEXT: movzbl 5(%rdi), %r9d -; SCALAR-NEXT: movzbl 4(%rdi), %ebx -; SCALAR-NEXT: movzbl 3(%rdi), %r8d -; SCALAR-NEXT: movzbl 2(%rdi), %ecx +; SCALAR-NEXT: movzbl 4(%rdi), %r13d +; SCALAR-NEXT: movzbl 3(%rdi), %eax +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 2(%rdi), %r8d ; SCALAR-NEXT: movzbl (%rdi), %eax -; SCALAR-NEXT: movzbl 1(%rdi), %r13d +; SCALAR-NEXT: movzbl 1(%rdi), %ecx ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %bl +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r10b -; SCALAR-NEXT: movb %r10b, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %r12b -; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload ; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movl %r14d, %r10d -; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: notb %bl ; SCALAR-NEXT: notb %bpl -; SCALAR-NEXT: movl %ebp, %r14d +; SCALAR-NEXT: notb %r14b +; SCALAR-NEXT: notb %r15b +; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movl %r15d, %edi -; SCALAR-NEXT: notb %dil -; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: notb %bpl -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r15b -; SCALAR-NEXT: movb %r15b, 15(%rsi) -; SCALAR-NEXT: movb %bpl, 14(%rsi) -; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movl %r9d, %eax -; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %r9b, 13(%rsi) -; SCALAR-NEXT: movb %dil, 12(%rsi) -; SCALAR-NEXT: movb %r8b, 11(%rsi) -; SCALAR-NEXT: movb %r14b, 10(%rsi) -; SCALAR-NEXT: movb %r10b, 9(%rsi) -; SCALAR-NEXT: movl %r10d, %r8d -; SCALAR-NEXT: movb %r11b, 8(%rsi) -; SCALAR-NEXT: movl %r11d, %r9d -; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %r12b, 7(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 6(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 5(%rsi) -; SCALAR-NEXT: movb %bl, 4(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 3(%rsi) +; SCALAR-NEXT: notb %cl +; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %cl, 14(%rsi) +; SCALAR-NEXT: movb %r8b, 13(%rsi) +; SCALAR-NEXT: movb %r12b, 12(%rsi) +; SCALAR-NEXT: movb %r15b, 11(%rsi) +; SCALAR-NEXT: movb %r14b, 10(%rsi) +; SCALAR-NEXT: movb %bpl, 9(%rsi) +; SCALAR-NEXT: movb %bl, 8(%rsi) +; SCALAR-NEXT: movb %r11b, 7(%rsi) +; SCALAR-NEXT: movb %r10b, 6(%rsi) +; SCALAR-NEXT: movb %r9b, 5(%rsi) +; SCALAR-NEXT: movl %r13d, %r8d +; SCALAR-NEXT: movb %r13b, 4(%rsi) +; SCALAR-NEXT: movb %dil, 3(%rsi) +; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; SCALAR-NEXT: movb %cl, 2(%rsi) -; SCALAR-NEXT: movb %r13b, 1(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, (%rsi) -; SCALAR-NEXT: movb %r15b, 15(%rdx) -; SCALAR-NEXT: movl %r15d, %r11d -; SCALAR-NEXT: movb %bpl, 14(%rdx) -; SCALAR-NEXT: movb %al, 13(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; 
SCALAR-NEXT: movb %r13b, (%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 15(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 14(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 13(%rdx) ; SCALAR-NEXT: movb %r12b, 12(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r15b, 11(%rdx) ; SCALAR-NEXT: movb %r14b, 10(%rdx) -; SCALAR-NEXT: movb %r8b, 9(%rdx) -; SCALAR-NEXT: movb %r9b, 8(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r9b, 7(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 6(%rdx) -; SCALAR-NEXT: movb %dil, 5(%rdx) -; SCALAR-NEXT: movb %bl, 4(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 3(%rdx) +; SCALAR-NEXT: movb %bpl, 9(%rdx) +; SCALAR-NEXT: movb %bl, 8(%rdx) +; SCALAR-NEXT: movb %r11b, 7(%rdx) +; SCALAR-NEXT: movb %r10b, 6(%rdx) +; SCALAR-NEXT: movb %r9b, 5(%rdx) +; SCALAR-NEXT: movb %r8b, 4(%rdx) +; SCALAR-NEXT: movb %dil, 3(%rdx) ; SCALAR-NEXT: movb %cl, 2(%rdx) -; SCALAR-NEXT: movb %r13b, 1(%rdx) -; SCALAR-NEXT: movl %r10d, %edi -; SCALAR-NEXT: movb %r10b, (%rdx) -; SCALAR-NEXT: movb %r11b, 31(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 30(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 29(%rdx) +; SCALAR-NEXT: movl %ecx, %edi +; SCALAR-NEXT: movb %al, 1(%rdx) +; SCALAR-NEXT: movl %eax, %ecx +; SCALAR-NEXT: movl %r13d, %eax +; SCALAR-NEXT: movb %r13b, (%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r13b, 31(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r13b, 30(%rdx) +; SCALAR-NEXT: movb %sil, 29(%rdx) ; SCALAR-NEXT: movb %r12b, 28(%rdx) ; SCALAR-NEXT: movb %r15b, 27(%rdx) ; SCALAR-NEXT: movb %r14b, 26(%rdx) -; SCALAR-NEXT: movb %r8b, 25(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 24(%rdx) -; SCALAR-NEXT: movb %r9b, 23(%rdx) -; SCALAR-NEXT: movb %al, 22(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 21(%rdx) -; SCALAR-NEXT: movb %bl, 20(%rdx) +; SCALAR-NEXT: movb %bpl, 25(%rdx) +; SCALAR-NEXT: movb %bl, 24(%rdx) +; SCALAR-NEXT: movb %r11b, 23(%rdx) +; SCALAR-NEXT: movb %r10b, 22(%rdx) +; SCALAR-NEXT: movb %r9b, 21(%rdx) +; SCALAR-NEXT: movb %r8b, 20(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 19(%rdx) -; SCALAR-NEXT: movb %cl, 18(%rdx) -; SCALAR-NEXT: movb %r13b, 17(%rdx) -; SCALAR-NEXT: movb %dil, 16(%rdx) +; SCALAR-NEXT: movb %dil, 18(%rdx) +; SCALAR-NEXT: movb %cl, 17(%rdx) +; SCALAR-NEXT: movb %al, 16(%rdx) ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: popq %r12 ; SCALAR-NEXT: popq %r13 @@ -2013,36 +1998,36 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p define void @vec384_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v2i16: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movzwl 2(%rdi), %ecx -; SCALAR-NEXT: movl (%rdi), %eax -; SCALAR-NEXT: notl %eax 
+; SCALAR-NEXT: movzwl 2(%rdi), %eax +; SCALAR-NEXT: movl (%rdi), %ecx ; SCALAR-NEXT: notl %ecx -; SCALAR-NEXT: movw %cx, 2(%rsi) -; SCALAR-NEXT: movw %ax, (%rsi) -; SCALAR-NEXT: movw %cx, 2(%rdx) -; SCALAR-NEXT: movw %ax, (%rdx) -; SCALAR-NEXT: movw %cx, 6(%rdx) -; SCALAR-NEXT: movw %ax, 4(%rdx) -; SCALAR-NEXT: movw %cx, 10(%rdx) -; SCALAR-NEXT: movw %ax, 8(%rdx) -; SCALAR-NEXT: movw %cx, 14(%rdx) -; SCALAR-NEXT: movw %ax, 12(%rdx) -; SCALAR-NEXT: movw %cx, 18(%rdx) -; SCALAR-NEXT: movw %ax, 16(%rdx) -; SCALAR-NEXT: movw %cx, 22(%rdx) -; SCALAR-NEXT: movw %ax, 20(%rdx) -; SCALAR-NEXT: movw %cx, 26(%rdx) -; SCALAR-NEXT: movw %ax, 24(%rdx) -; SCALAR-NEXT: movw %cx, 30(%rdx) -; SCALAR-NEXT: movw %ax, 28(%rdx) -; SCALAR-NEXT: movw %cx, 34(%rdx) -; SCALAR-NEXT: movw %ax, 32(%rdx) -; SCALAR-NEXT: movw %cx, 38(%rdx) -; SCALAR-NEXT: movw %ax, 36(%rdx) -; SCALAR-NEXT: movw %cx, 42(%rdx) -; SCALAR-NEXT: movw %ax, 40(%rdx) -; SCALAR-NEXT: movw %cx, 46(%rdx) -; SCALAR-NEXT: movw %ax, 44(%rdx) +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %cx, (%rsi) +; SCALAR-NEXT: movw %ax, 2(%rdx) +; SCALAR-NEXT: movw %cx, (%rdx) +; SCALAR-NEXT: movw %ax, 6(%rdx) +; SCALAR-NEXT: movw %cx, 4(%rdx) +; SCALAR-NEXT: movw %ax, 10(%rdx) +; SCALAR-NEXT: movw %cx, 8(%rdx) +; SCALAR-NEXT: movw %ax, 14(%rdx) +; SCALAR-NEXT: movw %cx, 12(%rdx) +; SCALAR-NEXT: movw %ax, 18(%rdx) +; SCALAR-NEXT: movw %cx, 16(%rdx) +; SCALAR-NEXT: movw %ax, 22(%rdx) +; SCALAR-NEXT: movw %cx, 20(%rdx) +; SCALAR-NEXT: movw %ax, 26(%rdx) +; SCALAR-NEXT: movw %cx, 24(%rdx) +; SCALAR-NEXT: movw %ax, 30(%rdx) +; SCALAR-NEXT: movw %cx, 28(%rdx) +; SCALAR-NEXT: movw %ax, 34(%rdx) +; SCALAR-NEXT: movw %cx, 32(%rdx) +; SCALAR-NEXT: movw %ax, 38(%rdx) +; SCALAR-NEXT: movw %cx, 36(%rdx) +; SCALAR-NEXT: movw %ax, 42(%rdx) +; SCALAR-NEXT: movw %cx, 40(%rdx) +; SCALAR-NEXT: movw %ax, 46(%rdx) +; SCALAR-NEXT: movw %cx, 44(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v2i16: @@ -3794,56 +3779,56 @@ define void @vec384_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v6i8: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %rax -; SCALAR-NEXT: shrq $40, %rax -; SCALAR-NEXT: movq %rdi, %rcx -; SCALAR-NEXT: shrq $32, %rcx -; SCALAR-NEXT: movl %edi, %r8d +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: movq %rax, %rcx +; SCALAR-NEXT: shrq $40, %rcx +; SCALAR-NEXT: movq %rax, %rdi +; SCALAR-NEXT: shrq $32, %rdi +; SCALAR-NEXT: movl %eax, %r8d ; SCALAR-NEXT: shrl $24, %r8d -; SCALAR-NEXT: movl %edi, %r9d +; SCALAR-NEXT: movl %eax, %r9d ; SCALAR-NEXT: shrl $16, %r9d -; SCALAR-NEXT: movl %edi, %r10d +; SCALAR-NEXT: movl %eax, %r10d ; SCALAR-NEXT: shrl $8, %r10d -; SCALAR-NEXT: notb %dil -; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: movzbl %al, %eax ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d ; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %edi, %r10d +; SCALAR-NEXT: orl %eax, %r10d ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %edi +; SCALAR-NEXT: movzbl %r9b, %eax ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d ; SCALAR-NEXT: shll $8, %r8d -; SCALAR-NEXT: orl %edi, %r8d +; SCALAR-NEXT: orl %eax, %r8d +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %eax ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movzbl %cl, %ecx -; SCALAR-NEXT: notb %al -; SCALAR-NEXT: movzbl %al, %eax -; SCALAR-NEXT: shll $8, %eax -; SCALAR-NEXT: orl %ecx, %eax -; SCALAR-NEXT: movw %ax, 4(%rsi) +; SCALAR-NEXT: shll $8, %ecx +; SCALAR-NEXT: orl %eax, %ecx +; SCALAR-NEXT: movw %cx, 4(%rsi) ; SCALAR-NEXT: shll $16, %r8d -; SCALAR-NEXT: movzwl %r10w, %ecx -; SCALAR-NEXT: orl %r8d, %ecx -; SCALAR-NEXT: movl %ecx, (%rsi) -; SCALAR-NEXT: movw %ax, 4(%rdx) -; SCALAR-NEXT: movl %ecx, (%rdx) -; SCALAR-NEXT: movw %ax, 12(%rdx) -; SCALAR-NEXT: movl %ecx, 8(%rdx) -; SCALAR-NEXT: movw %ax, 20(%rdx) -; SCALAR-NEXT: movl %ecx, 16(%rdx) -; SCALAR-NEXT: movw %ax, 28(%rdx) -; SCALAR-NEXT: movl %ecx, 24(%rdx) -; SCALAR-NEXT: movw %ax, 36(%rdx) -; SCALAR-NEXT: movl %ecx, 32(%rdx) -; SCALAR-NEXT: movw %ax, 44(%rdx) -; SCALAR-NEXT: movl %ecx, 40(%rdx) -; SCALAR-NEXT: movw %ax, 52(%rdx) -; SCALAR-NEXT: movl %ecx, 48(%rdx) -; SCALAR-NEXT: movw %ax, 60(%rdx) -; SCALAR-NEXT: movl %ecx, 56(%rdx) +; SCALAR-NEXT: movzwl %r10w, %eax +; SCALAR-NEXT: orl %r8d, %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movw %cx, 4(%rdx) +; SCALAR-NEXT: movl %eax, (%rdx) +; SCALAR-NEXT: movw %cx, 12(%rdx) +; SCALAR-NEXT: movl %eax, 8(%rdx) +; SCALAR-NEXT: movw %cx, 20(%rdx) +; SCALAR-NEXT: movl %eax, 16(%rdx) +; SCALAR-NEXT: movw %cx, 28(%rdx) +; SCALAR-NEXT: movl %eax, 24(%rdx) +; SCALAR-NEXT: movw %cx, 36(%rdx) +; SCALAR-NEXT: movl %eax, 32(%rdx) +; SCALAR-NEXT: movw %cx, 44(%rdx) +; SCALAR-NEXT: movl %eax, 40(%rdx) +; SCALAR-NEXT: movw %cx, 52(%rdx) +; SCALAR-NEXT: movl %eax, 48(%rdx) +; SCALAR-NEXT: movw %cx, 60(%rdx) +; SCALAR-NEXT: movl %eax, 56(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-ONLY-LABEL: vec384_v6i8: @@ -4940,146 +4925,152 @@ define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: pushq %r13 ; SCALAR-NEXT: pushq %r12 ; SCALAR-NEXT: pushq %rbx -; SCALAR-NEXT: movzbl 15(%rdi), %eax -; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 15(%rdi), %r13d ; SCALAR-NEXT: movzbl 14(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 13(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 12(%rdi), %r11d -; SCALAR-NEXT: movzbl 11(%rdi), %r13d -; SCALAR-NEXT: movzbl 10(%rdi), %r12d -; SCALAR-NEXT: movzbl 9(%rdi), %ebp -; SCALAR-NEXT: movzbl 8(%rdi), %r14d -; SCALAR-NEXT: movzbl 7(%rdi), %ebx -; SCALAR-NEXT: movzbl 6(%rdi), %r10d -; SCALAR-NEXT: movzbl 5(%rdi), %r15d -; SCALAR-NEXT: movzbl 4(%rdi), %r9d -; SCALAR-NEXT: movzbl 3(%rdi), %r8d -; SCALAR-NEXT: movzbl 2(%rdi), %ecx +; SCALAR-NEXT: movzbl 12(%rdi), %ebx +; SCALAR-NEXT: movzbl 11(%rdi), %eax +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 10(%rdi), %eax +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 9(%rdi), %r12d +; SCALAR-NEXT: movzbl 8(%rdi), %eax +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 7(%rdi), %r11d +; SCALAR-NEXT: movzbl 6(%rdi), %r14d +; SCALAR-NEXT: movzbl 5(%rdi), %ebp +; SCALAR-NEXT: movzbl 4(%rdi), %r15d +; SCALAR-NEXT: movzbl 3(%rdi), %r9d +; SCALAR-NEXT: movzbl 2(%rdi), %r8d ; SCALAR-NEXT: movzbl (%rdi), %eax -; SCALAR-NEXT: movzbl 1(%rdi), %edi +; SCALAR-NEXT: movzbl 1(%rdi), %ecx ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %dil -; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movl %r15d, %r9d -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: notb %r10b -; SCALAR-NEXT: notb %bl -; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %r14b +; SCALAR-NEXT: notb %r15b ; SCALAR-NEXT: notb %bpl -; SCALAR-NEXT: movl %ebp, %r15d +; SCALAR-NEXT: notb %r14b +; SCALAR-NEXT: notb %r11b +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SCALAR-NEXT: notb %r8b +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %r13b -; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: notb %dil ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movb %r8b, 15(%rsi) -; SCALAR-NEXT: movb %cl, 14(%rsi) -; SCALAR-NEXT: movl %edi, %eax -; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %dil, 13(%rsi) -; SCALAR-NEXT: movb %r11b, 12(%rsi) -; SCALAR-NEXT: movl %r11d, %ebp +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: notb %bl +; SCALAR-NEXT: movb %bl, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: notb %r13b +; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movb %r13b, 15(%rsi) +; SCALAR-NEXT: movb %r10b, 14(%rsi) +; SCALAR-NEXT: movb %r9b, 13(%rsi) +; SCALAR-NEXT: movb %bl, 12(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movb %cl, 10(%rsi) +; SCALAR-NEXT: movb %r12b, 9(%rsi) +; SCALAR-NEXT: movb %r8b, 8(%rsi) +; SCALAR-NEXT: movb %r11b, 7(%rsi) +; SCALAR-NEXT: movl %r11d, %r13d ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %r13b, 11(%rsi) -; SCALAR-NEXT: movb %r12b, 10(%rsi) -; SCALAR-NEXT: movb %r15b, 9(%rsi) -; SCALAR-NEXT: movb %r14b, 8(%rsi) -; SCALAR-NEXT: movb %bl, 7(%rsi) -; SCALAR-NEXT: movb %r10b, 6(%rsi) -; SCALAR-NEXT: movl %r10d, %ebx -; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %r9b, 5(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 4(%rsi) +; SCALAR-NEXT: movl %r14d, %ebx +; SCALAR-NEXT: movb %r14b, 6(%rsi) +; SCALAR-NEXT: movl %ebp, %ecx +; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movb %bpl, 5(%rsi) +; SCALAR-NEXT: movl %r15d, %r8d +; SCALAR-NEXT: movb %r15b, 4(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r12b, 3(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 2(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r13b, 1(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, (%rsi) -; SCALAR-NEXT: movb %r8b, 15(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 14(%rdx) -; SCALAR-NEXT: movb %al, 13(%rdx) -; SCALAR-NEXT: movb %bpl, 12(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 2(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 1(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r15b, (%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r11b, 15(%rdx) +; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movb %r10b, 14(%rdx) +; SCALAR-NEXT: movl %r9d, %esi +; SCALAR-NEXT: movb %r9b, 13(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r9b, 12(%rdx) ; SCALAR-NEXT: movb %al, 11(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 10(%rdx) -; SCALAR-NEXT: movb %r15b, 9(%rdx) -; SCALAR-NEXT: movb %r14b, 8(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 7(%rdx) +; SCALAR-NEXT: movb %bpl, 9(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r14b, 8(%rdx) +; SCALAR-NEXT: movb %r13b, 7(%rdx) ; SCALAR-NEXT: movb %bl, 6(%rdx) -; SCALAR-NEXT: movb %r9b, 
5(%rdx) -; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %r11b, 4(%rdx) +; SCALAR-NEXT: movb %cl, 5(%rdx) +; SCALAR-NEXT: movb %r8b, 4(%rdx) ; SCALAR-NEXT: movb %r12b, 3(%rdx) -; SCALAR-NEXT: movb %cl, 2(%rdx) -; SCALAR-NEXT: movl %r13d, %ebx -; SCALAR-NEXT: movb %r13b, 1(%rdx) -; SCALAR-NEXT: movl %r10d, %esi -; SCALAR-NEXT: movb %r10b, (%rdx) -; SCALAR-NEXT: movb %r8b, 31(%rdx) -; SCALAR-NEXT: movb %dil, 30(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 29(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 28(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r13b, 27(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r12b, 26(%rdx) -; SCALAR-NEXT: movb %r15b, 25(%rdx) +; SCALAR-NEXT: movb %r13b, 2(%rdx) +; SCALAR-NEXT: movb %dil, 1(%rdx) +; SCALAR-NEXT: movl %r15d, %ecx +; SCALAR-NEXT: movb %r15b, (%rdx) +; SCALAR-NEXT: movb %r11b, 31(%rdx) +; SCALAR-NEXT: movb %r10b, 30(%rdx) +; SCALAR-NEXT: movb %sil, 29(%rdx) +; SCALAR-NEXT: movb %r9b, 28(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 27(%rdx) +; SCALAR-NEXT: movb %al, 26(%rdx) +; SCALAR-NEXT: movl %ebp, %r15d +; SCALAR-NEXT: movb %bpl, 25(%rdx) ; SCALAR-NEXT: movb %r14b, 24(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload ; SCALAR-NEXT: movb %bpl, 23(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 22(%rdx) -; SCALAR-NEXT: movb %r9b, 21(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r9b, 20(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 19(%rdx) -; SCALAR-NEXT: movb %cl, 18(%rdx) -; SCALAR-NEXT: movb %bl, 17(%rdx) -; SCALAR-NEXT: movb %sil, 16(%rdx) -; SCALAR-NEXT: movb %r8b, 47(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 46(%rdx) -; SCALAR-NEXT: movb %al, 45(%rdx) -; SCALAR-NEXT: movb %r11b, 44(%rdx) -; SCALAR-NEXT: movb %r13b, 43(%rdx) -; SCALAR-NEXT: movb %r12b, 42(%rdx) +; SCALAR-NEXT: movb %bl, 22(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r11b, 21(%rdx) +; SCALAR-NEXT: movb %r8b, 20(%rdx) +; SCALAR-NEXT: movl %r12d, %r9d +; SCALAR-NEXT: movb %r12b, 19(%rdx) +; SCALAR-NEXT: movb %r13b, 18(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r12b, 17(%rdx) +; SCALAR-NEXT: movb %cl, 16(%rdx) +; SCALAR-NEXT: movl %ecx, %eax +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 47(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 46(%rdx) +; SCALAR-NEXT: movb %sil, 45(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 44(%rdx) +; SCALAR-NEXT: movb %dil, 43(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 42(%rdx) ; SCALAR-NEXT: movb %r15b, 41(%rdx) ; SCALAR-NEXT: movb %r14b, 40(%rdx) ; SCALAR-NEXT: movb %bpl, 39(%rdx) -; SCALAR-NEXT: movb %r10b, 38(%rdx) -; SCALAR-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 37(%rdx) -; SCALAR-NEXT: movb %r9b, 36(%rdx) -; SCALAR-NEXT: movb %dil, 35(%rdx) -; SCALAR-NEXT: movb %cl, 34(%rdx) -; SCALAR-NEXT: movb %bl, 33(%rdx) -; SCALAR-NEXT: movb %sil, 32(%rdx) +; SCALAR-NEXT: movb %bl, 38(%rdx) +; SCALAR-NEXT: movb %r11b, 37(%rdx) +; SCALAR-NEXT: movb %r8b, 36(%rdx) +; SCALAR-NEXT: movb %r9b, 35(%rdx) +; SCALAR-NEXT: movb %r13b, 34(%rdx) +; SCALAR-NEXT: movb %r12b, 33(%rdx) +; SCALAR-NEXT: movb %al, 32(%rdx) ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: popq %r12 ; SCALAR-NEXT: popq %r13 @@ -5123,13 +5114,13 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-LABEL: vec384_v24i8: ; SCALAR: # %bb.0: ; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %r8 -; SCALAR-NEXT: shrq $40, %r8 -; SCALAR-NEXT: movq %rdi, %r9 +; SCALAR-NEXT: movq 8(%rdi), %r8 +; SCALAR-NEXT: movq 16(%rdi), %rcx +; SCALAR-NEXT: movq %rcx, %rdi +; SCALAR-NEXT: shrq $40, %rdi +; SCALAR-NEXT: movq %rcx, %r9 ; SCALAR-NEXT: shrq $56, %r9 -; SCALAR-NEXT: movq %rdi, %r10 +; SCALAR-NEXT: movq %rcx, %r10 ; SCALAR-NEXT: shrq $48, %r10 ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d @@ -5137,20 +5128,20 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: shll $8, %r9d ; SCALAR-NEXT: orl %r10d, %r9d -; SCALAR-NEXT: movq %rdi, %r10 +; SCALAR-NEXT: movq %rcx, %r10 ; SCALAR-NEXT: shrq $32, %r10 ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d -; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: shll $8, %r8d -; SCALAR-NEXT: orl %r10d, %r8d -; SCALAR-NEXT: movl %edi, %r10d +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: orl %r10d, %edi +; SCALAR-NEXT: movl %ecx, %r10d ; SCALAR-NEXT: shrl $24, %r10d ; SCALAR-NEXT: shll $16, %r9d -; SCALAR-NEXT: movzwl %r8w, %r8d -; SCALAR-NEXT: orl %r9d, %r8d -; SCALAR-NEXT: movl %edi, %r9d +; SCALAR-NEXT: movzwl %di, %edi +; SCALAR-NEXT: orl %r9d, %edi +; SCALAR-NEXT: movl %ecx, %r9d ; SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d @@ -5158,96 +5149,96 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movzbl %r10b, %r10d ; SCALAR-NEXT: shll $8, %r10d ; SCALAR-NEXT: orl %r9d, %r10d -; SCALAR-NEXT: movl %edi, %r9d +; SCALAR-NEXT: movl %ecx, %r9d ; SCALAR-NEXT: shrl $8, %r9d -; SCALAR-NEXT: notb %dil -; SCALAR-NEXT: movzbl %dil, %edi -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r11d -; SCALAR-NEXT: shll $8, %r11d -; SCALAR-NEXT: orl %edi, %r11d -; SCALAR-NEXT: movq %rcx, %r9 -; SCALAR-NEXT: shrq $40, %r9 -; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r11w, %edi -; SCALAR-NEXT: orl %r10d, %edi -; SCALAR-NEXT: movq %rcx, %r10 -; SCALAR-NEXT: shrq $56, %r10 -; SCALAR-NEXT: shlq $32, %r8 -; SCALAR-NEXT: orq %r8, %rdi -; SCALAR-NEXT: movq %rcx, %r8 -; SCALAR-NEXT: shrq $48, %r8 -; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r10b -; SCALAR-NEXT: movzbl %r10b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r8d, %r10d -; SCALAR-NEXT: movq %rcx, %r8 -; SCALAR-NEXT: shrq $32, %r8 -; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: notb %cl +; SCALAR-NEXT: movzbl %cl, %ecx ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r8d, %r9d -; SCALAR-NEXT: movl %ecx, %r11d -; SCALAR-NEXT: shrl $24, %r11d +; SCALAR-NEXT: orl %ecx, %r9d +; SCALAR-NEXT: movq %r8, %r11 +; SCALAR-NEXT: shrq $40, %r11 ; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r9w, %r8d -; SCALAR-NEXT: orl %r10d, %r8d -; SCALAR-NEXT: movl %ecx, %r9d -; SCALAR-NEXT: shrl $16, %r9d +; SCALAR-NEXT: movzwl %r9w, %ecx +; SCALAR-NEXT: orl %r10d, %ecx +; SCALAR-NEXT: movq %r8, %r9 +; SCALAR-NEXT: shrq $56, %r9 +; SCALAR-NEXT: shlq $32, %rdi +; SCALAR-NEXT: orq %rdi, %rcx +; SCALAR-NEXT: movq %r8, %rdi +; SCALAR-NEXT: shrq $48, %rdi +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %edi, %r9d +; SCALAR-NEXT: movq %r8, %rdi +; SCALAR-NEXT: shrq $32, %rdi +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi ; SCALAR-NEXT: notb %r11b ; SCALAR-NEXT: movzbl %r11b, %r10d ; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: orl %edi, %r10d +; SCALAR-NEXT: movl %r8d, %edi +; SCALAR-NEXT: shrl $24, %edi +; SCALAR-NEXT: shll $16, %r9d +; SCALAR-NEXT: movzwl %r10w, %r10d ; SCALAR-NEXT: orl %r9d, %r10d -; SCALAR-NEXT: movl %ecx, %r9d -; SCALAR-NEXT: shrl $8, %r9d -; SCALAR-NEXT: notb %cl -; SCALAR-NEXT: movzbl %cl, %ecx +; SCALAR-NEXT: movl %r8d, %r9d +; SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r11d +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %r11d ; SCALAR-NEXT: shll $8, %r11d -; SCALAR-NEXT: orl %ecx, %r11d -; SCALAR-NEXT: movq %rax, %r9 -; SCALAR-NEXT: shrq $40, %r9 -; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r11w, %ecx -; SCALAR-NEXT: orl %r10d, %ecx -; SCALAR-NEXT: movq %rax, %r10 -; SCALAR-NEXT: shrq $56, %r10 -; SCALAR-NEXT: shlq $32, %r8 -; SCALAR-NEXT: orq %r8, %rcx -; SCALAR-NEXT: movq %rax, %r8 -; SCALAR-NEXT: shrq $48, %r8 +; SCALAR-NEXT: orl %r9d, %r11d +; SCALAR-NEXT: movl %r8d, %edi +; SCALAR-NEXT: shrl $8, %edi ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: orl %r8d, %edi +; SCALAR-NEXT: movq %rax, %r8 +; SCALAR-NEXT: shrq $40, %r8 +; SCALAR-NEXT: shll $16, %r11d +; SCALAR-NEXT: movzwl %di, %edi +; SCALAR-NEXT: orl %r11d, %edi +; SCALAR-NEXT: movq %rax, 
%r9 +; SCALAR-NEXT: shrq $56, %r9 +; SCALAR-NEXT: shlq $32, %r10 +; SCALAR-NEXT: orq %r10, %rdi +; SCALAR-NEXT: movq %rax, %r10 +; SCALAR-NEXT: shrq $48, %r10 ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r8d, %r10d -; SCALAR-NEXT: movq %rax, %r8 -; SCALAR-NEXT: shrq $32, %r8 -; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movzbl %r8b, %r8d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r8d, %r9d -; SCALAR-NEXT: movl %eax, %r11d -; SCALAR-NEXT: shrl $24, %r11d -; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r9w, %r8d +; SCALAR-NEXT: orl %r10d, %r9d +; SCALAR-NEXT: movq %rax, %r10 +; SCALAR-NEXT: shrq $32, %r10 +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: notb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $8, %r8d ; SCALAR-NEXT: orl %r10d, %r8d +; SCALAR-NEXT: movl %eax, %r10d +; SCALAR-NEXT: shrl $24, %r10d +; SCALAR-NEXT: shll $16, %r9d +; SCALAR-NEXT: movzwl %r8w, %r8d +; SCALAR-NEXT: orl %r9d, %r8d ; SCALAR-NEXT: movl %eax, %r9d ; SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movzbl %r11b, %r10d +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d ; SCALAR-NEXT: shll $8, %r10d ; SCALAR-NEXT: orl %r9d, %r10d ; SCALAR-NEXT: movl %eax, %r9d @@ -5264,13 +5255,13 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: orq %r8, %rax ; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) ; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, 16(%rdx) +; SCALAR-NEXT: movq %rcx, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) ; SCALAR-NEXT: movq %rax, 32(%rdx) ; SCALAR-NEXT: retq ; @@ -7019,171 +7010,146 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 13(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 12(%rdi), %r10d +; SCALAR-NEXT: movzbl 12(%rdi), %ecx ; SCALAR-NEXT: movzbl 11(%rdi), %r13d ; SCALAR-NEXT: movzbl 10(%rdi), %r12d ; SCALAR-NEXT: movzbl 9(%rdi), %r15d ; SCALAR-NEXT: movzbl 8(%rdi), %r14d ; SCALAR-NEXT: movzbl 7(%rdi), %ebp -; SCALAR-NEXT: movzbl 6(%rdi), %r11d -; SCALAR-NEXT: movzbl 5(%rdi), %ebx -; SCALAR-NEXT: movzbl 4(%rdi), %r9d -; SCALAR-NEXT: movzbl 3(%rdi), %r8d -; SCALAR-NEXT: movzbl 2(%rdi), %ecx +; SCALAR-NEXT: movzbl 6(%rdi), %ebx +; SCALAR-NEXT: movzbl 5(%rdi), %r11d +; SCALAR-NEXT: movzbl 4(%rdi), %r10d +; SCALAR-NEXT: movzbl 3(%rdi), %r9d +; SCALAR-NEXT: movzbl 2(%rdi), %r8d ; SCALAR-NEXT: movzbl (%rdi), %eax ; SCALAR-NEXT: movzbl 1(%rdi), %edi ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %dil ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %cl -; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movl %ebx, %r9d -; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movl %r11d, %ebx +; SCALAR-NEXT: notb %bl ; SCALAR-NEXT: notb %bpl ; SCALAR-NEXT: notb %r14b ; SCALAR-NEXT: notb %r15b -; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r12b -; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r13b -; SCALAR-NEXT: notb %r10b -; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movl %ecx, %edi ; SCALAR-NEXT: notb %dil ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movb %r11b, 15(%rsi) -; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %r8b, 14(%rsi) ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movl %edi, %eax -; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %dil, 13(%rsi) -; SCALAR-NEXT: movb %r10b, 12(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: notb %cl +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %cl, 14(%rsi) +; SCALAR-NEXT: movb %r8b, 13(%rsi) +; SCALAR-NEXT: movl %edi, %r8d +; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movb %dil, 12(%rsi) ; SCALAR-NEXT: movb %r13b, 11(%rsi) ; SCALAR-NEXT: movb %r12b, 10(%rsi) ; SCALAR-NEXT: movb %r15b, 9(%rsi) ; SCALAR-NEXT: movb %r14b, 8(%rsi) -; SCALAR-NEXT: movl %r14d, %r12d ; SCALAR-NEXT: movb %bpl, 7(%rsi) -; SCALAR-NEXT: movl %ebp, %r14d -; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movb %bl, 6(%rsi) -; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %r9b, 5(%rsi) -; SCALAR-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 4(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 3(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 2(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 1(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, (%rsi) -; SCALAR-NEXT: movb %r11b, 15(%rdx) -; SCALAR-NEXT: movb %r8b, 14(%rdx) -; SCALAR-NEXT: movb %al, 13(%rdx) +; SCALAR-NEXT: movb %r11b, 5(%rsi) +; SCALAR-NEXT: movb %r10b, 4(%rsi) +; SCALAR-NEXT: movb %r9b, 3(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 12(%rdx) -; SCALAR-NEXT: movb %r13b, 11(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r15b, 10(%rdx) +; SCALAR-NEXT: movb %al, 2(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 1(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, (%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 15(%rdx) +; SCALAR-NEXT: movb %cl, 14(%rdx) +; SCALAR-NEXT: movl %ecx, %edi ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 9(%rdx) -; SCALAR-NEXT: movb %r12b, 8(%rdx) -; SCALAR-NEXT: movb %r14b, 7(%rdx) +; SCALAR-NEXT: movb %sil, 13(%rdx) +; SCALAR-NEXT: movb %r8b, 12(%rdx) +; SCALAR-NEXT: movb %r13b, 11(%rdx) +; SCALAR-NEXT: movb %r12b, 10(%rdx) +; SCALAR-NEXT: movb %r15b, 9(%rdx) +; SCALAR-NEXT: movb %r14b, 8(%rdx) +; SCALAR-NEXT: movb %bpl, 7(%rdx) ; SCALAR-NEXT: movb %bl, 6(%rdx) -; SCALAR-NEXT: movb %r9b, 5(%rdx) -; SCALAR-NEXT: movl %r9d, %r11d -; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 4(%rdx) -; SCALAR-NEXT: movb %bpl, 3(%rdx) -; SCALAR-NEXT: movb %dil, 2(%rdx) -; SCALAR-NEXT: movb %cl, 1(%rdx) -; SCALAR-NEXT: movl %ecx, %r14d -; SCALAR-NEXT: movl %r10d, %esi -; SCALAR-NEXT: movb %r10b, (%rdx) +; SCALAR-NEXT: movb %r11b, 5(%rdx) +; SCALAR-NEXT: movb %r10b, 4(%rdx) +; SCALAR-NEXT: movb %r9b, 3(%rdx) +; SCALAR-NEXT: movb %al, 2(%rdx) +; SCALAR-NEXT: movl %eax, %r8d +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 1(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, (%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; SCALAR-NEXT: movb %cl, 31(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r9b, 30(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 29(%rdx) -; SCALAR-NEXT: movb %al, 28(%rdx) -; SCALAR-NEXT: movl %eax, %r10d +; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movb %dil, 30(%rdx) +; SCALAR-NEXT: movb %sil, 29(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 28(%rdx) ; SCALAR-NEXT: movb %r13b, 27(%rdx) -; SCALAR-NEXT: movb %r15b, 26(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 
1-byte Folded Reload +; SCALAR-NEXT: movb %r12b, 26(%rdx) ; SCALAR-NEXT: movb %r15b, 25(%rdx) -; SCALAR-NEXT: movl %r12d, %ebp -; SCALAR-NEXT: movb %r12b, 24(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 23(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 22(%rdx) +; SCALAR-NEXT: movb %r14b, 24(%rdx) +; SCALAR-NEXT: movb %bpl, 23(%rdx) +; SCALAR-NEXT: movb %bl, 22(%rdx) ; SCALAR-NEXT: movb %r11b, 21(%rdx) -; SCALAR-NEXT: movb %r8b, 20(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 19(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 20(%rdx) +; SCALAR-NEXT: movb %r9b, 19(%rdx) ; SCALAR-NEXT: movb %r8b, 18(%rdx) -; SCALAR-NEXT: movb %r14b, 17(%rdx) -; SCALAR-NEXT: movb %sil, 16(%rdx) -; SCALAR-NEXT: movl %esi, %r11d +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 17(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 16(%rdx) ; SCALAR-NEXT: movb %cl, 47(%rdx) -; SCALAR-NEXT: movb %r9b, 46(%rdx) -; SCALAR-NEXT: movb %dil, 45(%rdx) -; SCALAR-NEXT: movb %r10b, 44(%rdx) +; SCALAR-NEXT: movb %dil, 46(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 45(%rdx) +; SCALAR-NEXT: movb %sil, 44(%rdx) ; SCALAR-NEXT: movb %r13b, 43(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r12b, 42(%rdx) ; SCALAR-NEXT: movb %r15b, 41(%rdx) -; SCALAR-NEXT: movl %ebp, %r14d -; SCALAR-NEXT: movb %bpl, 40(%rdx) -; SCALAR-NEXT: movl %ebx, %ebp -; SCALAR-NEXT: movb %bl, 39(%rdx) -; SCALAR-NEXT: movl %eax, %ebx -; SCALAR-NEXT: movb %al, 38(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 37(%rdx) +; SCALAR-NEXT: movb %r14b, 40(%rdx) +; SCALAR-NEXT: movb %bpl, 39(%rdx) +; SCALAR-NEXT: movb %bl, 38(%rdx) +; SCALAR-NEXT: movb %r11b, 37(%rdx) +; SCALAR-NEXT: movb %r10b, 36(%rdx) +; SCALAR-NEXT: movb %r9b, 35(%rdx) +; SCALAR-NEXT: movb %r8b, 34(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 36(%rdx) +; SCALAR-NEXT: movb %al, 33(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 32(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 35(%rdx) -; SCALAR-NEXT: movb %r8b, 34(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r9b, 33(%rdx) -; SCALAR-NEXT: movb %r11b, 32(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 63(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 62(%rdx) -; SCALAR-NEXT: movb %dil, 61(%rdx) -; SCALAR-NEXT: movb %r10b, 60(%rdx) +; SCALAR-NEXT: movb %sil, 63(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 62(%rdx) +; SCALAR-NEXT: movb %cl, 61(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 60(%rdx) ; SCALAR-NEXT: movb %r13b, 59(%rdx) ; SCALAR-NEXT: movb %r12b, 58(%rdx) ; SCALAR-NEXT: movb %r15b, 57(%rdx) ; 
SCALAR-NEXT: movb %r14b, 56(%rdx) ; SCALAR-NEXT: movb %bpl, 55(%rdx) ; SCALAR-NEXT: movb %bl, 54(%rdx) -; SCALAR-NEXT: movb %cl, 53(%rdx) -; SCALAR-NEXT: movb %al, 52(%rdx) -; SCALAR-NEXT: movb %sil, 51(%rdx) +; SCALAR-NEXT: movb %r11b, 53(%rdx) +; SCALAR-NEXT: movb %r10b, 52(%rdx) +; SCALAR-NEXT: movb %r9b, 51(%rdx) ; SCALAR-NEXT: movb %r8b, 50(%rdx) -; SCALAR-NEXT: movb %r9b, 49(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 48(%rdx) +; SCALAR-NEXT: movb %al, 49(%rdx) +; SCALAR-NEXT: movb %dil, 48(%rdx) ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: popq %r12 ; SCALAR-NEXT: popq %r13 @@ -7242,114 +7208,106 @@ define void @vec512_v16i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SCALAR-NEXT: movzwl 26(%rdi), %eax ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SCALAR-NEXT: movl 24(%rdi), %r13d -; SCALAR-NEXT: movzwl 22(%rdi), %r12d -; SCALAR-NEXT: movl 20(%rdi), %r15d -; SCALAR-NEXT: movzwl 18(%rdi), %r14d -; SCALAR-NEXT: movl 16(%rdi), %ebx -; SCALAR-NEXT: movzwl 14(%rdi), %r11d -; SCALAR-NEXT: movl 12(%rdi), %r10d -; SCALAR-NEXT: movzwl 10(%rdi), %r9d +; SCALAR-NEXT: movl 24(%rdi), %r15d +; SCALAR-NEXT: movzwl 22(%rdi), %r14d +; SCALAR-NEXT: movl 20(%rdi), %ebp +; SCALAR-NEXT: movzwl 18(%rdi), %ebx +; SCALAR-NEXT: movl 16(%rdi), %r11d +; SCALAR-NEXT: movzwl 14(%rdi), %r10d +; SCALAR-NEXT: movl 12(%rdi), %r9d +; SCALAR-NEXT: movzwl 10(%rdi), %r12d ; SCALAR-NEXT: movl 8(%rdi), %r8d -; SCALAR-NEXT: movzwl 6(%rdi), %ecx -; SCALAR-NEXT: movzwl 2(%rdi), %ebp -; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: movzwl 6(%rdi), %eax +; SCALAR-NEXT: movzwl 2(%rdi), %r13d +; SCALAR-NEXT: movl (%rdi), %ecx ; SCALAR-NEXT: movl 4(%rdi), %edi -; SCALAR-NEXT: notl %eax -; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SCALAR-NEXT: notl %ebp -; SCALAR-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SCALAR-NEXT: notl %edi -; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SCALAR-NEXT: notl %ecx ; SCALAR-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: notl %r13d +; SCALAR-NEXT: notl %edi +; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SCALAR-NEXT: notl %r8d ; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: notl %r12d ; SCALAR-NEXT: notl %r9d -; SCALAR-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SCALAR-NEXT: movl %r10d, %edi -; SCALAR-NEXT: notl %edi -; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: notl %r10d ; SCALAR-NEXT: notl %r11d -; SCALAR-NEXT: movl %r11d, %r9d ; SCALAR-NEXT: notl %ebx -; SCALAR-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: notl %ebp ; SCALAR-NEXT: notl %r14d ; SCALAR-NEXT: notl %r15d -; SCALAR-NEXT: notl %r12d -; SCALAR-NEXT: notl %r13d -; SCALAR-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload -; SCALAR-NEXT: notl %r10d -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload -; SCALAR-NEXT: notl %r11d -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; SCALAR-NEXT: notl %r8d -; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SCALAR-NEXT: movw %r8w, 30(%rsi) -; SCALAR-NEXT: movw %r11w, 28(%rsi) -; SCALAR-NEXT: 
movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SCALAR-NEXT: movw %r10w, 26(%rsi) -; SCALAR-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SCALAR-NEXT: movw %r13w, 24(%rsi) -; SCALAR-NEXT: movw %r12w, 22(%rsi) -; SCALAR-NEXT: movw %r15w, 20(%rsi) -; SCALAR-NEXT: movw %r14w, 18(%rsi) -; SCALAR-NEXT: movw %bx, 16(%rsi) -; SCALAR-NEXT: movw %r9w, 14(%rsi) -; SCALAR-NEXT: movw %di, 12(%rsi) -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Reload -; SCALAR-NEXT: movw %bp, 10(%rsi) ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload -; SCALAR-NEXT: movw %di, 8(%rsi) +; SCALAR-NEXT: notl %edi +; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; SCALAR-NEXT: movw %cx, 6(%rsi) +; SCALAR-NEXT: notl %ecx +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: movw %ax, 30(%rsi) +; SCALAR-NEXT: movw %cx, 28(%rsi) +; SCALAR-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: movw %di, 26(%rsi) +; SCALAR-NEXT: movw %r15w, 24(%rsi) +; SCALAR-NEXT: movw %r14w, 22(%rsi) +; SCALAR-NEXT: movw %bp, 20(%rsi) +; SCALAR-NEXT: movw %bx, 18(%rsi) +; SCALAR-NEXT: movw %r11w, 16(%rsi) +; SCALAR-NEXT: movw %r10w, 14(%rsi) +; SCALAR-NEXT: movw %r9w, 12(%rsi) +; SCALAR-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: movw %r12w, 10(%rsi) +; SCALAR-NEXT: movw %r8w, 8(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movw %ax, 6(%rsi) ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload ; SCALAR-NEXT: movw %r8w, 4(%rsi) -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; SCALAR-NEXT: movw %ax, 2(%rsi) -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload -; SCALAR-NEXT: movw %bx, (%rsi) -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload -; SCALAR-NEXT: movw %r13w, 30(%rdx) -; SCALAR-NEXT: movw %r11w, 28(%rdx) -; SCALAR-NEXT: movw %r10w, 26(%rdx) +; SCALAR-NEXT: movw %r13w, 2(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movw %di, (%rsi) ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; SCALAR-NEXT: movw %si, 24(%rdx) -; SCALAR-NEXT: movw %r12w, 22(%rdx) -; SCALAR-NEXT: movw %r15w, 20(%rdx) -; SCALAR-NEXT: movw %r14w, 18(%rdx) -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload +; SCALAR-NEXT: movw %si, 30(%rdx) +; SCALAR-NEXT: movw %cx, 28(%rdx) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; SCALAR-NEXT: movw %cx, 26(%rdx) +; SCALAR-NEXT: movw %r15w, 24(%rdx) +; SCALAR-NEXT: movw %r14w, 22(%rdx) +; SCALAR-NEXT: movw %bp, 20(%rdx) +; SCALAR-NEXT: movw %bx, 18(%rdx) ; SCALAR-NEXT: movw %r11w, 16(%rdx) -; SCALAR-NEXT: movw %r9w, 14(%rdx) -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload -; SCALAR-NEXT: movw %r10w, 12(%rdx) -; SCALAR-NEXT: movw %bp, 10(%rdx) -; SCALAR-NEXT: movw %di, 8(%rdx) -; SCALAR-NEXT: movw %cx, 6(%rdx) +; SCALAR-NEXT: movw %r10w, 14(%rdx) +; SCALAR-NEXT: movw %r9w, 12(%rdx) +; SCALAR-NEXT: movw %r12w, 10(%rdx) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; SCALAR-NEXT: movw %si, 8(%rdx) +; SCALAR-NEXT: movw %ax, 6(%rdx) ; SCALAR-NEXT: movw %r8w, 4(%rdx) -; SCALAR-NEXT: movw %ax, 2(%rdx) -; SCALAR-NEXT: movl %ebx, %esi -; SCALAR-NEXT: movw %si, (%rdx) -; 
SCALAR-NEXT: movw %r13w, 62(%rdx) -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload -; SCALAR-NEXT: movw %bx, 60(%rdx) -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload -; SCALAR-NEXT: movw %bx, 58(%rdx) -; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload -; SCALAR-NEXT: movw %bx, 56(%rdx) -; SCALAR-NEXT: movw %r12w, 54(%rdx) -; SCALAR-NEXT: movw %r15w, 52(%rdx) -; SCALAR-NEXT: movw %r14w, 50(%rdx) +; SCALAR-NEXT: movw %r13w, 2(%rdx) +; SCALAR-NEXT: movl %edi, %eax +; SCALAR-NEXT: movw %ax, (%rdx) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movw %di, 62(%rdx) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movw %di, 60(%rdx) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movw %di, 58(%rdx) +; SCALAR-NEXT: movw %r15w, 56(%rdx) +; SCALAR-NEXT: movw %r14w, 54(%rdx) +; SCALAR-NEXT: movw %bp, 52(%rdx) +; SCALAR-NEXT: movw %bx, 50(%rdx) ; SCALAR-NEXT: movw %r11w, 48(%rdx) -; SCALAR-NEXT: movw %r9w, 46(%rdx) -; SCALAR-NEXT: movw %r10w, 44(%rdx) -; SCALAR-NEXT: movw %bp, 42(%rdx) -; SCALAR-NEXT: movw %di, 40(%rdx) -; SCALAR-NEXT: movw %cx, 38(%rdx) +; SCALAR-NEXT: movw %r10w, 46(%rdx) +; SCALAR-NEXT: movw %r9w, 44(%rdx) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movw %di, 42(%rdx) +; SCALAR-NEXT: movw %si, 40(%rdx) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; SCALAR-NEXT: movw %si, 38(%rdx) ; SCALAR-NEXT: movw %r8w, 36(%rdx) -; SCALAR-NEXT: movw %ax, 34(%rdx) -; SCALAR-NEXT: movw %si, 32(%rdx) +; SCALAR-NEXT: movw %r13w, 34(%rdx) +; SCALAR-NEXT: movw %ax, 32(%rdx) ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: popq %r12 ; SCALAR-NEXT: popq %r13 @@ -7419,9 +7377,9 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 13(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 12(%rdi), %r13d -; SCALAR-NEXT: movzbl 11(%rdi), %eax +; SCALAR-NEXT: movzbl 12(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 11(%rdi), %r13d ; SCALAR-NEXT: movzbl 10(%rdi), %r12d ; SCALAR-NEXT: movzbl 9(%rdi), %r15d ; SCALAR-NEXT: movzbl 8(%rdi), %r14d @@ -7455,15 +7413,15 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: notb %bl +; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r8b +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill ; SCALAR-NEXT: movzbl 17(%rdi), %eax ; SCALAR-NEXT: notb %al @@ -7477,21 +7435,21 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movzbl 20(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 21(%rdi), %ebp -; SCALAR-NEXT: notb %bpl -; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 22(%rdi), %ebx -; SCALAR-NEXT: notb %bl -; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 23(%rdi), %r10d +; SCALAR-NEXT: movzbl 21(%rdi), %r11d +; SCALAR-NEXT: notb %r11b +; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 22(%rdi), %r10d ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 24(%rdi), %r9d +; SCALAR-NEXT: movzbl 23(%rdi), %r9d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 25(%rdi), %ecx -; SCALAR-NEXT: notb %cl -; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 24(%rdi), %r8d +; SCALAR-NEXT: notb %r8b +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 25(%rdi), %ebp +; SCALAR-NEXT: notb %bpl +; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 26(%rdi), %r14d ; SCALAR-NEXT: notb %r14b ; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill @@ -7516,53 +7474,52 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %r12b, 28(%rsi) ; SCALAR-NEXT: movb %r15b, 27(%rsi) ; SCALAR-NEXT: movb %r14b, 26(%rsi) -; SCALAR-NEXT: movb %cl, 25(%rsi) -; SCALAR-NEXT: movb %r9b, 24(%rsi) -; SCALAR-NEXT: movb %r10b, 23(%rsi) -; SCALAR-NEXT: movb %bl, 22(%rsi) -; SCALAR-NEXT: movb %bpl, 21(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 20(%rsi) +; SCALAR-NEXT: movb %bpl, 25(%rsi) +; SCALAR-NEXT: movb %r8b, 24(%rsi) +; SCALAR-NEXT: movb %r9b, 23(%rsi) +; SCALAR-NEXT: movb %r10b, 22(%rsi) +; SCALAR-NEXT: movb %r11b, 21(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 20(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 19(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 18(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 17(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 16(%rsi) -; SCALAR-NEXT: movb %r8b, 15(%rsi) -; SCALAR-NEXT: movl %r8d, %r14d -; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 14(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r9b, 16(%rsi) +; SCALAR-NEXT: movb %cl, 15(%rsi) +; SCALAR-NEXT: movl %ecx, %ebp ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 12(%rsi) -; SCALAR-NEXT: movb %r11b, 11(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 10(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 9(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 8(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 7(%rsi) +; SCALAR-NEXT: movb %al, 14(%rsi) +; SCALAR-NEXT: movb %bl, 13(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 12(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r13b, 6(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 5(%rsi) +; SCALAR-NEXT: movb %r13b, 11(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r11b, 10(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r12b, 4(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r9b, 3(%rsi) +; SCALAR-NEXT: movb %r12b, 9(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 8(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 7(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r15b, 2(%rsi) +; SCALAR-NEXT: movb %r15b, 6(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; 
SCALAR-NEXT: movb %cl, 5(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r8b, 4(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r8b, 3(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r14b, 2(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r8b, 1(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, (%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, (%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 31(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload @@ -7585,84 +7542,85 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %sil, 22(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 21(%rdx) -; SCALAR-NEXT: movb %bpl, 20(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 20(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 19(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 18(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 17(%rdx) -; SCALAR-NEXT: movb %cl, 16(%rdx) -; SCALAR-NEXT: movb %r14b, 15(%rdx) -; SCALAR-NEXT: movb %bl, 14(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 13(%rdx) -; SCALAR-NEXT: movb %al, 12(%rdx) +; SCALAR-NEXT: movb %r9b, 16(%rdx) +; SCALAR-NEXT: movb %bpl, 15(%rdx) +; SCALAR-NEXT: movb %al, 14(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 11(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 10(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r14b, 9(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 8(%rdx) -; SCALAR-NEXT: movb %r11b, 7(%rdx) -; SCALAR-NEXT: movb %r13b, 6(%rdx) -; SCALAR-NEXT: movb %r10b, 5(%rdx) -; SCALAR-NEXT: movb %r12b, 4(%rdx) -; SCALAR-NEXT: movb %r9b, 3(%rdx) -; SCALAR-NEXT: movb %r15b, 2(%rdx) -; SCALAR-NEXT: movb %r8b, 1(%rdx) -; SCALAR-NEXT: movb %dil, (%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 63(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 62(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 61(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 60(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 59(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 58(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 57(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), 
%eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 56(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 55(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 54(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 53(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 52(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 51(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 50(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 49(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 48(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 47(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 46(%rdx) -; SCALAR-NEXT: movb %cl, 45(%rdx) +; SCALAR-NEXT: movb %sil, 13(%rdx) +; SCALAR-NEXT: movb %bl, 12(%rdx) +; SCALAR-NEXT: movb %r13b, 11(%rdx) +; SCALAR-NEXT: movb %r11b, 10(%rdx) +; SCALAR-NEXT: movb %r12b, 9(%rdx) +; SCALAR-NEXT: movb %dil, 8(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r9b, 7(%rdx) +; SCALAR-NEXT: movb %r15b, 6(%rdx) +; SCALAR-NEXT: movb %cl, 5(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 4(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 44(%rdx) -; SCALAR-NEXT: movb %sil, 43(%rdx) -; SCALAR-NEXT: movb %bl, 42(%rdx) -; SCALAR-NEXT: movb %r14b, 41(%rdx) -; SCALAR-NEXT: movb %bpl, 40(%rdx) -; SCALAR-NEXT: movb %r11b, 39(%rdx) -; SCALAR-NEXT: movb %r13b, 38(%rdx) -; SCALAR-NEXT: movb %r10b, 37(%rdx) -; SCALAR-NEXT: movb %r12b, 36(%rdx) -; SCALAR-NEXT: movb %r9b, 35(%rdx) -; SCALAR-NEXT: movb %r15b, 34(%rdx) +; SCALAR-NEXT: movb %al, 3(%rdx) +; SCALAR-NEXT: movb %r14b, 2(%rdx) +; SCALAR-NEXT: movb %r8b, 1(%rdx) +; SCALAR-NEXT: movb %r10b, (%rdx) +; SCALAR-NEXT: movl %r10d, %edi +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 63(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 62(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 61(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 60(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 59(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 58(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 57(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 56(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 55(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 54(%rdx) +; SCALAR-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 53(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 52(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 51(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 50(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 49(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 48(%rdx) +; SCALAR-NEXT: movb %bpl, 47(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 46(%rdx) +; SCALAR-NEXT: movb %sil, 45(%rdx) +; SCALAR-NEXT: movb %bl, 44(%rdx) +; SCALAR-NEXT: movb %r13b, 43(%rdx) +; SCALAR-NEXT: movb %r11b, 42(%rdx) +; SCALAR-NEXT: movb %r12b, 41(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 40(%rdx) +; SCALAR-NEXT: movb %r9b, 39(%rdx) +; SCALAR-NEXT: movb %r15b, 38(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 37(%rdx) +; SCALAR-NEXT: movb %cl, 36(%rdx) +; SCALAR-NEXT: movb %al, 35(%rdx) +; SCALAR-NEXT: movb %r14b, 34(%rdx) ; SCALAR-NEXT: movb %r8b, 33(%rdx) ; SCALAR-NEXT: movb %dil, 32(%rdx) ; SCALAR-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll index 6d77e04504e2d9..995ae4688bff97 100644 --- a/llvm/test/CodeGen/X86/swifterror.ll +++ b/llvm/test/CodeGen/X86/swifterror.ll @@ -1566,11 +1566,11 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-APPLE-NEXT: .cfi_offset %r14, -32 ; CHECK-APPLE-NEXT: .cfi_offset %r15, -24 ; CHECK-APPLE-NEXT: .cfi_offset %rbp, -16 -; CHECK-APPLE-NEXT: movq %r12, %rbx -; CHECK-APPLE-NEXT: movq %r13, (%rsp) ## 8-byte Spill +; CHECK-APPLE-NEXT: movq %r12, (%rsp) ## 8-byte Spill +; CHECK-APPLE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-APPLE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-APPLE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-APPLE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-APPLE-NEXT: movq %rcx, %rbx ; CHECK-APPLE-NEXT: movq %rdx, %r14 ; CHECK-APPLE-NEXT: movq %rsi, %r15 ; CHECK-APPLE-NEXT: movq %rdi, %rbp @@ -1587,16 +1587,16 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-APPLE-NEXT: movq %rbp, %rdi ; CHECK-APPLE-NEXT: movq %r15, %rsi ; CHECK-APPLE-NEXT: movq %r14, %rdx -; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; CHECK-APPLE-NEXT: movq %rbx, %rcx ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload -; CHECK-APPLE-NEXT: movq (%rsp), %r13 ## 8-byte Reload -; CHECK-APPLE-NEXT: movq %rbx, %r12 +; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 ## 8-byte Reload +; CHECK-APPLE-NEXT: movq (%rsp), %r12 ## 8-byte Reload ; CHECK-APPLE-NEXT: callq _params_and_return_in_reg2 -; CHECK-APPLE-NEXT: movq %rax, %r14 -; CHECK-APPLE-NEXT: movq %rdx, %r15 -; CHECK-APPLE-NEXT: movq %rcx, %rbp -; CHECK-APPLE-NEXT: movq %r8, %rbx +; CHECK-APPLE-NEXT: movq %rax, %rbx +; CHECK-APPLE-NEXT: movq %rdx, %r14 +; CHECK-APPLE-NEXT: movq %rcx, %r15 +; CHECK-APPLE-NEXT: movq 
%r8, %rbp ; CHECK-APPLE-NEXT: movq %r12, (%rsp) ## 8-byte Spill ; CHECK-APPLE-NEXT: movl $1, %edi ; CHECK-APPLE-NEXT: movl $2, %esi @@ -1607,10 +1607,10 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-APPLE-NEXT: xorl %r13d, %r13d ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload ; CHECK-APPLE-NEXT: callq _params_in_reg2 -; CHECK-APPLE-NEXT: movq %r14, %rax -; CHECK-APPLE-NEXT: movq %r15, %rdx -; CHECK-APPLE-NEXT: movq %rbp, %rcx -; CHECK-APPLE-NEXT: movq %rbx, %r8 +; CHECK-APPLE-NEXT: movq %rbx, %rax +; CHECK-APPLE-NEXT: movq %r14, %rdx +; CHECK-APPLE-NEXT: movq %r15, %rcx +; CHECK-APPLE-NEXT: movq %rbp, %r8 ; CHECK-APPLE-NEXT: movq (%rsp), %r12 ## 8-byte Reload ; CHECK-APPLE-NEXT: addq $48, %rsp ; CHECK-APPLE-NEXT: popq %rbx @@ -1708,11 +1708,11 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: .cfi_offset %edi, -16 ; CHECK-i386-NEXT: .cfi_offset %ebx, -12 ; CHECK-i386-NEXT: .cfi_offset %ebp, -8 -; CHECK-i386-NEXT: movl 148(%esp), %esi +; CHECK-i386-NEXT: movl 144(%esp), %esi ; CHECK-i386-NEXT: movl $0, 64(%esp) -; CHECK-i386-NEXT: movl 192(%esp), %ebx -; CHECK-i386-NEXT: movl 196(%esp), %ebp -; CHECK-i386-NEXT: movl 200(%esp), %edi +; CHECK-i386-NEXT: movl 192(%esp), %edi +; CHECK-i386-NEXT: movl 196(%esp), %ebx +; CHECK-i386-NEXT: movl 200(%esp), %ebp ; CHECK-i386-NEXT: leal 64(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 52(%esp) ; CHECK-i386-NEXT: movl $0, 48(%esp) @@ -1729,9 +1729,9 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl $0, 4(%esp) ; CHECK-i386-NEXT: movl $1, (%esp) ; CHECK-i386-NEXT: calll _params_in_reg2 -; CHECK-i386-NEXT: movl %edi, 56(%esp) -; CHECK-i386-NEXT: movl %ebp, 52(%esp) -; CHECK-i386-NEXT: movl %ebx, 48(%esp) +; CHECK-i386-NEXT: movl %ebp, 56(%esp) +; CHECK-i386-NEXT: movl %ebx, 52(%esp) +; CHECK-i386-NEXT: movl %edi, 48(%esp) ; CHECK-i386-NEXT: movl 188(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 44(%esp) ; CHECK-i386-NEXT: movl 184(%esp), %eax @@ -1752,7 +1752,8 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl %eax, 12(%esp) ; CHECK-i386-NEXT: movl 152(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 8(%esp) -; CHECK-i386-NEXT: movl %esi, 4(%esp) +; CHECK-i386-NEXT: movl 148(%esp), %eax +; CHECK-i386-NEXT: movl %eax, 4(%esp) ; CHECK-i386-NEXT: leal 88(%esp), %eax ; CHECK-i386-NEXT: movl %eax, (%esp) ; CHECK-i386-NEXT: calll _params_and_return_in_reg2 @@ -1765,9 +1766,10 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-i386-NEXT: movl 100(%esp), %eax ; CHECK-i386-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-i386-NEXT: movl 104(%esp), %ebp -; CHECK-i386-NEXT: movl 108(%esp), %edi -; CHECK-i386-NEXT: movl 112(%esp), %esi +; CHECK-i386-NEXT: movl 104(%esp), %eax +; CHECK-i386-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-i386-NEXT: movl 108(%esp), %ebp +; CHECK-i386-NEXT: movl 112(%esp), %edi ; CHECK-i386-NEXT: movl 116(%esp), %ebx ; CHECK-i386-NEXT: leal 64(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 52(%esp) @@ -1785,19 +1787,19 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl $0, 4(%esp) ; CHECK-i386-NEXT: movl $1, (%esp) ; CHECK-i386-NEXT: calll _params_in_reg2 -; CHECK-i386-NEXT: movl 144(%esp), %eax -; CHECK-i386-NEXT: 
movl %ebx, 28(%eax) -; CHECK-i386-NEXT: movl %esi, 24(%eax) -; CHECK-i386-NEXT: movl %edi, 20(%eax) -; CHECK-i386-NEXT: movl %ebp, 16(%eax) -; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-i386-NEXT: movl %ecx, 12(%eax) -; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-i386-NEXT: movl %ecx, 8(%eax) -; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-i386-NEXT: movl %ecx, 4(%eax) -; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-i386-NEXT: movl %ecx, (%eax) +; CHECK-i386-NEXT: movl %ebx, 28(%esi) +; CHECK-i386-NEXT: movl %edi, 24(%esi) +; CHECK-i386-NEXT: movl %ebp, 20(%esi) +; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; CHECK-i386-NEXT: movl %eax, 16(%esi) +; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; CHECK-i386-NEXT: movl %eax, 12(%esi) +; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; CHECK-i386-NEXT: movl %eax, 8(%esi) +; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; CHECK-i386-NEXT: movl %eax, 4(%esi) +; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; CHECK-i386-NEXT: movl %eax, (%esi) ; CHECK-i386-NEXT: addl $124, %esp ; CHECK-i386-NEXT: popl %esi ; CHECK-i386-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/uadd_sat.ll b/llvm/test/CodeGen/X86/uadd_sat.ll index 0a3c2ae344fd32..fa8eed01d6d731 100644 --- a/llvm/test/CodeGen/X86/uadd_sat.ll +++ b/llvm/test/CodeGen/X86/uadd_sat.ll @@ -126,23 +126,23 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: addl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl $-1, %ebx -; X86-NEXT: cmovbl %ebx, %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmovbl %ebx, %edx +; X86-NEXT: cmovbl %ebx, %edi ; X86-NEXT: addl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovbl %ebx, %esi -; X86-NEXT: addl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovbl %ebx, %edi -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: addl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovbl %ebx, %edx +; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovbl %ebx, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll index 8d3319eb595883..5434d9fe27e7d9 100644 --- a/llvm/test/CodeGen/X86/udiv_fix.ll +++ b/llvm/test/CodeGen/X86/udiv_fix.ll @@ -241,22 +241,22 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movdqa %xmm1, %xmm3 ; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; X64-NEXT: movq %xmm3, %rcx -; X64-NEXT: movdqa %xmm0, %xmm4 -; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; X64-NEXT: psllq $31, %xmm4 -; X64-NEXT: movq %xmm4, %rax +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: punpckhdq {{.*#+}} xmm3 = 
xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: psllq $31, %xmm3 +; X64-NEXT: movq %xmm3, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X64-NEXT: movq %xmm4, %rax -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: movq %xmm4, %rcx +; X64-NEXT: movq %rax, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X64-NEXT: movq %xmm3, %rax +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: movq %xmm3, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm4 -; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; X64-NEXT: movq %rax, %xmm3 +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: psllq $31, %xmm0 ; X64-NEXT: movq %xmm0, %rax @@ -272,7 +272,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] ; X64-NEXT: movaps %xmm2, %xmm0 ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll index 67d3e5b16e08b5..dbdde968c904d8 100644 --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -328,24 +328,24 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm3 ; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0] -; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; X64-NEXT: movdqa %xmm8, %xmm3 -; X64-NEXT: pxor %xmm4, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; X64-NEXT: movdqa {{.*#+}} xmm7 = [2147483649,2147483649,2147483649,2147483649] -; X64-NEXT: pcmpeqd %xmm7, %xmm6 +; X64-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; X64-NEXT: movdqa %xmm8, %xmm4 +; X64-NEXT: pxor %xmm3, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; X64-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649,2147483649,2147483649] +; X64-NEXT: pcmpeqd %xmm6, %xmm7 ; X64-NEXT: movdqa {{.*#+}} xmm5 = [9223372043297226751,9223372043297226751] ; X64-NEXT: movdqa %xmm5, %xmm9 -; X64-NEXT: pcmpgtd %xmm3, %xmm9 +; X64-NEXT: pcmpgtd %xmm4, %xmm9 ; X64-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; X64-NEXT: pand %xmm6, %xmm10 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] -; X64-NEXT: por %xmm10, %xmm3 -; X64-NEXT: pcmpeqd %xmm6, %xmm6 -; X64-NEXT: pand %xmm3, %xmm8 -; X64-NEXT: pxor %xmm6, %xmm3 -; X64-NEXT: por %xmm8, %xmm3 -; X64-NEXT: psrlq $1, %xmm3 +; X64-NEXT: pand %xmm7, %xmm10 +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; X64-NEXT: por %xmm10, %xmm4 +; X64-NEXT: pcmpeqd %xmm7, %xmm7 +; X64-NEXT: pand %xmm4, %xmm8 +; X64-NEXT: pxor %xmm7, %xmm4 +; X64-NEXT: por %xmm8, %xmm4 +; X64-NEXT: psrlq $1, %xmm4 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: movd %xmm1, %ecx @@ -360,19 +360,19 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; X64-NEXT: pxor %xmm8, %xmm4 
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-NEXT: pcmpeqd %xmm7, %xmm0 -; X64-NEXT: pcmpgtd %xmm4, %xmm5 +; X64-NEXT: pxor %xmm8, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X64-NEXT: pcmpeqd %xmm6, %xmm0 +; X64-NEXT: pcmpgtd %xmm3, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] ; X64-NEXT: pand %xmm0, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: pxor %xmm0, %xmm6 +; X64-NEXT: pxor %xmm0, %xmm7 ; X64-NEXT: pand %xmm8, %xmm0 -; X64-NEXT: por %xmm6, %xmm0 +; X64-NEXT: por %xmm7, %xmm0 ; X64-NEXT: psrlq $1, %xmm0 -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] ; X64-NEXT: retq ; ; X86-LABEL: vec: diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll index f0479aea1b82c8..425dcfc99bf185 100644 --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -239,17 +239,17 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %ebx, %edx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: sbbl %ebx, %ebp ; X86-NEXT: movl %edi, %ebp ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovbl %ebx, %edx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovbl %ebx, %esi ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmovbl %ebp, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -367,27 +367,27 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %ebx, %ecx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: cmpl %edi, %ecx +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmovbl %edi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovbl %ebx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmovbl %ebp, %esi -; X86-NEXT: cmpl %edx, %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: cmovbl %ebx, %esi +; X86-NEXT: cmpl %eax, %ebp +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: cmovbl %eax, %ebp ; X86-NEXT: cmovbl %edx, %edi -; X86-NEXT: cmovbl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %ebp, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi @@ -572,27 +572,27 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmoval %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: cmoval %eax, %esi +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmoval %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edx, %eax ; X86-NEXT: cmoval %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmoval %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmoval %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -624,22 +624,22 @@ define <4 x i32> @test_v4i32_1(<4 x i32> %a) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl $1, %ecx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: cmpl $1, %edx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: cmpl $1, %esi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmpl $1, %edi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: cmpl $1, %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -689,32 +689,32 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebp, %eax -; X86-NEXT: cmoval %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebx, %eax -; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmoval %eax, %edi +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: cmoval %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: cmoval %eax, %edx -; 
X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmoval %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmoval %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmoval %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edx, %eax @@ -726,14 +726,14 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, 28(%ecx) ; X86-NEXT: movl %edx, 24(%ecx) +; X86-NEXT: movl %ebp, 20(%ecx) +; X86-NEXT: movl %ebx, 16(%ecx) +; X86-NEXT: movl %edi, 12(%ecx) +; X86-NEXT: movl %esi, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%ecx) +; X86-NEXT: movl %eax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -845,32 +845,32 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %bp, %ax -; X86-NEXT: cmoval %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %bx, %ax -; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmoval %eax, %edi +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %si, %ax ; X86-NEXT: cmoval %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %dx, %ax -; X86-NEXT: cmoval %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmoval %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %cx, %ax -; X86-NEXT: cmoval %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmoval %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %dx, %ax @@ -882,14 +882,14 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) ; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movw %bp, 10(%ecx) +; X86-NEXT: movw %bx, 8(%ecx) +; X86-NEXT: movw %di, 6(%ecx) +; X86-NEXT: movw %si, 4(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 10(%ecx) 
+; X86-NEXT: movw %ax, 2(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 8(%ecx) -; X86-NEXT: movw %si, 6(%ecx) -; X86-NEXT: movw %di, 4(%ecx) -; X86-NEXT: movw %bx, 2(%ecx) -; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -921,29 +921,29 @@ define <8 x i16> @test_v8i16_1(<8 x i16> %a) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $1, %ax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmpw $1, %dx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: cmpw $1, %bp -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: cmpw $1, %bx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: cmpw $1, %di -; X86-NEXT: adcl $0, %edi ; X86-NEXT: cmpw $1, %si ; X86-NEXT: adcl $0, %esi +; X86-NEXT: cmpw $1, %di +; X86-NEXT: adcl $0, %edi +; X86-NEXT: cmpw $1, %bx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: cmpw $1, %bp +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: cmpw $1, %cx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpw $1, %ax -; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw $1, %ax ; X86-NEXT: adcl $0, %eax @@ -951,13 +951,13 @@ define <8 x i16> @test_v8i16_1(<8 x i16> %a) nounwind { ; X86-NEXT: movw %ax, 14(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 12(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 10(%ecx) -; X86-NEXT: movw %si, 8(%ecx) +; X86-NEXT: movw %bp, 10(%ecx) +; X86-NEXT: movw %bx, 8(%ecx) ; X86-NEXT: movw %di, 6(%ecx) -; X86-NEXT: movw %bx, 4(%ecx) -; X86-NEXT: movw %bp, 2(%ecx) -; X86-NEXT: movw %dx, (%ecx) +; X86-NEXT: movw %si, 4(%ecx) +; X86-NEXT: movw %dx, 2(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -987,18 +987,19 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $40, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb %bl, %al -; X86-NEXT: cmoval %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb %dl, %al ; X86-NEXT: cmoval %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: cmpb %cl, %al ; X86-NEXT: cmoval %eax, %ecx ; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1122,19 +1123,20 @@ define <16 x i8> @test_v16i8_1(<16 x i8> %a) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $40, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpb $1, %bl -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb $1, %dl -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb $1, %al +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmpb $1, %cl ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb $1, %dl +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax ; X86-NEXT: cmpb $1, %al ; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1316,20 +1318,20 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shrdl $28, %edi, %ecx +; X86-NEXT: shrdl $28, %edi, %esi ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %esi, %ecx +; X86-NEXT: cmpl %ecx, %esi ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: cmovbl %ecx, %esi ; X86-NEXT: cmovbl %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll index e4ce08966a8946..9dc6a8774710c6 100644 --- a/llvm/test/CodeGen/X86/umin.ll +++ b/llvm/test/CodeGen/X86/umin.ll @@ -150,29 +150,29 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %edx, %ebx +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: sbbl %esi, %ebp ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovbl %ebx, %edx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovbl %ebx, %esi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovbl %edi, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ebp, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) 
+; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -301,27 +301,27 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmovbl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edx, %eax ; X86-NEXT: cmovbl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovbl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -371,32 +371,32 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebp, %eax -; X86-NEXT: cmovbl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebx, %eax -; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: cmovbl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: cmovbl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovbl %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovbl %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmovbl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edx, %eax @@ -408,14 +408,14 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, 
28(%ecx) ; X86-NEXT: movl %edx, 24(%ecx) +; X86-NEXT: movl %ebp, 20(%ecx) +; X86-NEXT: movl %ebx, 16(%ecx) +; X86-NEXT: movl %edi, 12(%ecx) +; X86-NEXT: movl %esi, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%ecx) +; X86-NEXT: movl %eax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -447,32 +447,32 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %bp, %ax -; X86-NEXT: cmovbl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %bx, %ax -; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %si, %ax ; X86-NEXT: cmovbl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %dx, %ax -; X86-NEXT: cmovbl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmovbl %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %cx, %ax -; X86-NEXT: cmovbl %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmovbl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %dx, %ax @@ -484,14 +484,14 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) ; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movw %bp, 10(%ecx) +; X86-NEXT: movw %bx, 8(%ecx) +; X86-NEXT: movw %di, 6(%ecx) +; X86-NEXT: movw %si, 4(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movw %ax, 2(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 8(%ecx) -; X86-NEXT: movw %si, 6(%ecx) -; X86-NEXT: movw %di, 4(%ecx) -; X86-NEXT: movw %bx, 2(%ecx) -; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -521,18 +521,19 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $40, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb %bl, %al -; X86-NEXT: cmovbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb %dl, %al ; X86-NEXT: cmovbl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: cmpb %cl, %al ; X86-NEXT: cmovbl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -731,20 +732,20 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shrdl $28, %edi, %ecx +; X86-NEXT: shrdl $28, %edi, %esi ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: cmovbl %ecx, %esi ; X86-NEXT: cmovbl %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll index ccabb360a990c9..18b5cd85b8fe54 100644 --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -96,341 +96,339 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: adcl $0, 
%esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: 
addl %ecx, %ebx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, 
%ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %ebp, %esi ; X86-NEXT: movl %eax, %edx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: adcl %ebx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), 
%ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb (%esp) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi ; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %ebx, (%esp) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %ebp, (%esp) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx ; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -443,10 +441,10 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl $0, %ebx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: imull %ecx, %ebp @@ -457,8 +455,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: addl %edx, %ebp ; X86-NEXT: imull {{[0-9]+}}(%esp), %edi ; X86-NEXT: addl %ebp, %edi -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax @@ -467,22 +465,22 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %edx, %edi ; X86-NEXT: addl %esi, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %esi +; X86-NEXT: imull %edx, %ebp ; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl %edx, 4(%ecx) @@ -501,8 +499,8 @@ define i300 
@test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NEXT: movl %edx, 28(%ecx) ; X86-NEXT: movl %eax, 32(%ecx) -; X86-NEXT: andl $4095, %ebx # imm = 0xFFF -; X86-NEXT: movw %bx, 36(%ecx) +; X86-NEXT: andl $4095, %esi # imm = 0xFFF +; X86-NEXT: movw %si, 36(%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi @@ -519,88 +517,86 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r8, %r11 +; X64-NEXT: movq %r8, %r10 ; X64-NEXT: movq %rcx, %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %r11, %rbp ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %rbx, %r15 ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r15, %rbx -; X64-NEXT: adcq %r14, %rbp +; X64-NEXT: adcq %r14, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r10d +; X64-NEXT: movzbl %al, %r11d ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rbp, %r13 -; X64-NEXT: adcq %r10, %r12 +; X64-NEXT: addq %r12, %r13 +; X64-NEXT: adcq %r11, %r15 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r15, %r10 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r12, %r11 ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r10, %r15 +; X64-NEXT: mulq {{[0-9]+}}(%rsp) +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %r11, %r12 ; X64-NEXT: adcq %rbp, %rdx -; X64-NEXT: imulq %r9, %r11 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: imulq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; X64-NEXT: addq %r13, %r14 -; X64-NEXT: adcq %r12, %r15 -; X64-NEXT: adcq %rdx, %r11 +; X64-NEXT: adcq %r15, %r12 +; X64-NEXT: adcq %rdx, %r10 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %r15 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r10, %rbp +; X64-NEXT: addq %r11, %rbp ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r11 ; X64-NEXT: addq %rbp, %rax ; X64-NEXT: adcq %r13, %rdx -; X64-NEXT: imulq %r10, %rcx +; X64-NEXT: imulq %r11, %rcx ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: addq %r14, 
%r12 -; X64-NEXT: adcq %r15, %rax -; X64-NEXT: adcq %r11, %rcx -; X64-NEXT: imulq %r9, %r8 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx +; X64-NEXT: addq %r14, %r15 +; X64-NEXT: adcq %r12, %rax +; X64-NEXT: adcq %r10, %rcx +; X64-NEXT: imulq {{[0-9]+}}(%rsp), %r8 +; X64-NEXT: imulq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi -; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: addq %r9, %rsi ; X64-NEXT: addq %r8, %rsi ; X64-NEXT: addq %rcx, %rsi ; X64-NEXT: movq %rbx, 8(%rdi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, (%rdi) -; X64-NEXT: movq %r12, 16(%rdi) +; X64-NEXT: movq %r15, 16(%rdi) ; X64-NEXT: movq %rax, 24(%rdi) ; X64-NEXT: movl %esi, 32(%rdi) ; X64-NEXT: shrq $32, %rsi diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index eacc714b49a4d4..5f62d1a7c4505c 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -266,21 +266,21 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edi, %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: popl %esi @@ -306,31 +306,31 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %edx, %ebx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: shldl $1, %ecx, %esi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: shldl $1, %ecx, %edi ; X86-NEXT: shrdl $31, %ecx, %eax -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -356,28 +356,29 @@ define i64 @func9(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull 
%esi +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %edi, %eax ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll index 6b6845147e0439..b188c6326ad5ab 100644 --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -52,31 +52,31 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: shrdl $2, %eax, %ecx ; X86-NEXT: shrdl $2, %edx, %eax ; X86-NEXT: shrl $2, %edx -; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ebp, %edx ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: cmovel %eax, %edx @@ -116,18 +116,18 @@ define i4 @func3(i4 %x, i4 %y) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andb $15, %al ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzbl %al, %edx +; X86-NEXT: movzbl %al, %eax ; X86-NEXT: shlb $4, %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: imull %edx, %eax -; X86-NEXT: movb %ah, %cl -; X86-NEXT: shlb $6, %cl -; X86-NEXT: shrb $2, %al -; X86-NEXT: orb %cl, %al -; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: cmpb $4, %ah +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: movb %ch, %al +; X86-NEXT: shlb $6, %al +; X86-NEXT: shrb $2, %cl +; X86-NEXT: orb %al, %cl +; X86-NEXT: movzbl %cl, %edx +; X86-NEXT: cmpb $4, %ch ; X86-NEXT: movl $255, %eax -; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: cmovbl %edx, %eax ; X86-NEXT: shrb $4, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl @@ -391,27 +391,27 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; 
X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl $-1, %edi ; X86-NEXT: cmovol %edi, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: cmovol %edi, %ebx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: cmovol %edi, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: cmovol %edi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: cmovol %edi, %eax ; X86-NEXT: movl %eax, 12(%ecx) -; X86-NEXT: movl %ebp, 8(%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebx, 8(%ecx) +; X86-NEXT: movl %ebp, 4(%ecx) ; X86-NEXT: movl %esi, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi @@ -442,24 +442,24 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edx, %esi -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: mull %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ebp, %edx ; X86-NEXT: adcl $0, %edi @@ -496,24 +496,24 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: mull %esi +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edi, %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: addl %ebp, %edx ; X86-NEXT: adcl $0, %ecx diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 82603b35ba7128..0216b46b13b97d 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -44,38 +44,39 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset 
%ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: leal (%ecx,%eax), %esi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: leal (%esi,%eax), %esi +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %edi ; X86-NEXT: leal (%esi,%eax), %esi ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %esi, %ebx @@ -84,27 +85,28 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: adcl %ebx, %edx 
@@ -120,7 +122,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: orb %ch, %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: setne %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: testl %edi, %edi diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll index b1194bedc4e1ca..fef389f700ace3 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -658,7 +658,6 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin ; CHECK-BASELINE-NEXT: xorb %bl, %r8b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b ; CHECK-BASELINE-NEXT: xorb %bl, %r8b -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: xorb %r14b, %r9b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-BASELINE-NEXT: xorb %r14b, %r9b @@ -683,50 +682,48 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: xorb %al, %r13b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: xorb %al, %r13b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-BASELINE-NEXT: xorb %al, %r15b +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: xorb %al, %bpl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: xorb %al, %r15b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: xorb %al, %r15b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: xorb %al, %r13b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: xorb %al, %r13b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d -; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d -; CHECK-BASELINE-NEXT: xorb %r8b, %r10b +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r10b ; CHECK-BASELINE-NEXT: 
andb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: xorb %r8b, %r10b +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r10b ; CHECK-BASELINE-NEXT: movb %r10b, 15(%rdi) ; CHECK-BASELINE-NEXT: movb %al, 14(%rdi) -; CHECK-BASELINE-NEXT: movb %bl, 13(%rdi) -; CHECK-BASELINE-NEXT: movb %bpl, 12(%rdi) -; CHECK-BASELINE-NEXT: movb %r15b, 11(%rdi) -; CHECK-BASELINE-NEXT: movb %r13b, 10(%rdi) +; CHECK-BASELINE-NEXT: movb %r13b, 13(%rdi) +; CHECK-BASELINE-NEXT: movb %r15b, 12(%rdi) +; CHECK-BASELINE-NEXT: movb %bpl, 11(%rdi) +; CHECK-BASELINE-NEXT: movb %bl, 10(%rdi) ; CHECK-BASELINE-NEXT: movb %cl, 9(%rdi) ; CHECK-BASELINE-NEXT: movb %dl, 8(%rdi) ; CHECK-BASELINE-NEXT: movb %sil, 7(%rdi) ; CHECK-BASELINE-NEXT: movb %r12b, 6(%rdi) ; CHECK-BASELINE-NEXT: movb %r14b, 5(%rdi) ; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movb %al, 3(%rdi) +; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movb %al, 2(%rdi) ; CHECK-BASELINE-NEXT: movb %r11b, 1(%rdi) @@ -773,7 +770,6 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin ; CHECK-SSE1-NEXT: xorb %bl, %r8b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b ; CHECK-SSE1-NEXT: xorb %bl, %r8b -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: xorb %r14b, %r9b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-SSE1-NEXT: xorb %r14b, %r9b @@ -798,50 +794,48 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: xorb %al, %r13b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: xorb %al, %r13b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-SSE1-NEXT: xorb %al, %r15b +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: xorb %al, %bpl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: xorb %al, %r15b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: xorb %al, %r15b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: xorb %al, %r13b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: xorb %al, %r13b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d -; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: movzbl 
{{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d -; CHECK-SSE1-NEXT: xorb %r8b, %r10b +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r10b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: xorb %r8b, %r10b +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r10b ; CHECK-SSE1-NEXT: movb %r10b, 15(%rdi) ; CHECK-SSE1-NEXT: movb %al, 14(%rdi) -; CHECK-SSE1-NEXT: movb %bl, 13(%rdi) -; CHECK-SSE1-NEXT: movb %bpl, 12(%rdi) -; CHECK-SSE1-NEXT: movb %r15b, 11(%rdi) -; CHECK-SSE1-NEXT: movb %r13b, 10(%rdi) +; CHECK-SSE1-NEXT: movb %r13b, 13(%rdi) +; CHECK-SSE1-NEXT: movb %r15b, 12(%rdi) +; CHECK-SSE1-NEXT: movb %bpl, 11(%rdi) +; CHECK-SSE1-NEXT: movb %bl, 10(%rdi) ; CHECK-SSE1-NEXT: movb %cl, 9(%rdi) ; CHECK-SSE1-NEXT: movb %dl, 8(%rdi) ; CHECK-SSE1-NEXT: movb %sil, 7(%rdi) ; CHECK-SSE1-NEXT: movb %r12b, 6(%rdi) ; CHECK-SSE1-NEXT: movb %r14b, 5(%rdi) ; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movb %al, 3(%rdi) +; CHECK-SSE1-NEXT: movb %r8b, 3(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movb %al, 2(%rdi) ; CHECK-SSE1-NEXT: movb %r11b, 1(%rdi) @@ -1016,27 +1010,27 @@ define <4 x i32> @out_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movl 12(%rdx), %edi ; CHECK-BASELINE-NEXT: movl 8(%rdx), %r8d ; CHECK-BASELINE-NEXT: movl (%rdx), %r9d -; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d -; CHECK-BASELINE-NEXT: movl (%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx -; CHECK-BASELINE-NEXT: andl (%rcx), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx +; CHECK-BASELINE-NEXT: movl (%rsi), %r10d +; CHECK-BASELINE-NEXT: xorl %r9d, %r10d +; CHECK-BASELINE-NEXT: andl (%rcx), %r10d +; CHECK-BASELINE-NEXT: xorl %r9d, %r10d ; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d -; CHECK-BASELINE-NEXT: xorl %r10d, %r9d +; CHECK-BASELINE-NEXT: xorl %edx, %r9d ; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d -; CHECK-BASELINE-NEXT: xorl %r10d, %r9d -; CHECK-BASELINE-NEXT: movl 8(%rsi), %r10d -; CHECK-BASELINE-NEXT: xorl %r8d, %r10d -; CHECK-BASELINE-NEXT: andl 8(%rcx), %r10d -; CHECK-BASELINE-NEXT: xorl %r8d, %r10d +; CHECK-BASELINE-NEXT: xorl %edx, %r9d +; CHECK-BASELINE-NEXT: movl 8(%rsi), %edx +; CHECK-BASELINE-NEXT: xorl %r8d, %edx +; CHECK-BASELINE-NEXT: andl 8(%rcx), %edx +; CHECK-BASELINE-NEXT: xorl %r8d, %edx ; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi ; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi ; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: movl %esi, 12(%rax) -; CHECK-BASELINE-NEXT: movl %r10d, 8(%rax) +; CHECK-BASELINE-NEXT: movl %edx, 8(%rax) ; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax) -; CHECK-BASELINE-NEXT: movl %edx, (%rax) +; CHECK-BASELINE-NEXT: movl %r10d, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i32: @@ -1194,10 +1188,8 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r10 -; CHECK-BASELINE-NEXT: movq %rdx, %r8 -; CHECK-BASELINE-NEXT: movq %rsi, %r9 -; CHECK-BASELINE-NEXT: movq %rdi, %r11 +; CHECK-BASELINE-NEXT: movq %rcx, %r8 +; CHECK-BASELINE-NEXT: movq %rdi, %r9 ; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax @@ -1215,236 +1207,236 @@ define <32 x i8> 
@out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r15d ; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r12d ; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi -; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx -; CHECK-BASELINE-NEXT: movzbl 2(%r8), %edi -; CHECK-BASELINE-NEXT: movzbl (%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 1(%r8), %ecx -; CHECK-BASELINE-NEXT: movzbl (%r9), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb (%r10), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %r11d +; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %ecx +; CHECK-BASELINE-NEXT: movzbl (%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl (%rsi), %ebx +; CHECK-BASELINE-NEXT: xorb %dil, %bl +; CHECK-BASELINE-NEXT: andb (%r8), %bl +; CHECK-BASELINE-NEXT: xorb %dil, %bl ; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 1(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 1(%rsi), %edi +; CHECK-BASELINE-NEXT: xorb %al, %dil +; CHECK-BASELINE-NEXT: andb 1(%r8), %dil +; CHECK-BASELINE-NEXT: xorb %al, %dil +; CHECK-BASELINE-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 2(%rsi), %eax ; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 1(%r10), %al +; CHECK-BASELINE-NEXT: andb 2(%r8), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 2(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %dil, %al -; CHECK-BASELINE-NEXT: andb 2(%r10), %al -; CHECK-BASELINE-NEXT: xorb %dil, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 3(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %dl, %al -; CHECK-BASELINE-NEXT: andb 3(%r10), %al -; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: movzbl 3(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: andb 3(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 4(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %sil, %al -; CHECK-BASELINE-NEXT: andb 4(%r10), %al -; CHECK-BASELINE-NEXT: xorb %sil, %al +; CHECK-BASELINE-NEXT: movzbl 4(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r11b, %al +; CHECK-BASELINE-NEXT: andb 4(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r11b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 5(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 5(%rsi), %eax ; CHECK-BASELINE-NEXT: xorb %r13b, %al -; CHECK-BASELINE-NEXT: andb 5(%r10), %al +; CHECK-BASELINE-NEXT: andb 5(%r8), %al ; CHECK-BASELINE-NEXT: xorb %r13b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 6(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 6(%rsi), %eax ; CHECK-BASELINE-NEXT: xorb %r12b, %al -; CHECK-BASELINE-NEXT: andb 6(%r10), %al +; CHECK-BASELINE-NEXT: andb 6(%r8), %al ; CHECK-BASELINE-NEXT: xorb %r12b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 7(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 7(%rsi), %eax ; CHECK-BASELINE-NEXT: xorb %r15b, %al -; CHECK-BASELINE-NEXT: andb 7(%r10), %al +; CHECK-BASELINE-NEXT: andb 7(%r8), %al ; CHECK-BASELINE-NEXT: xorb %r15b, %al ; CHECK-BASELINE-NEXT: movb 
%al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 8(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 8(%rsi), %eax ; CHECK-BASELINE-NEXT: xorb %r14b, %al -; CHECK-BASELINE-NEXT: andb 8(%r10), %al +; CHECK-BASELINE-NEXT: andb 8(%r8), %al ; CHECK-BASELINE-NEXT: xorb %r14b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 9(%rsi), %eax ; CHECK-BASELINE-NEXT: xorb %bpl, %al -; CHECK-BASELINE-NEXT: andb 9(%r10), %al +; CHECK-BASELINE-NEXT: andb 9(%r8), %al ; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 10(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 10(%rsi), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 10(%r10), %al +; CHECK-BASELINE-NEXT: andb 10(%r8), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 11(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 11(%rsi), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 11(%r10), %al +; CHECK-BASELINE-NEXT: andb 11(%r8), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 12(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 12(%rsi), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 12(%r10), %al +; CHECK-BASELINE-NEXT: andb 12(%r8), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 13(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 13(%rsi), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 13(%r10), %al +; CHECK-BASELINE-NEXT: andb 13(%r8), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 14(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 14(%rsi), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 14(%r10), %al +; CHECK-BASELINE-NEXT: andb 14(%r8), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 15(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 15(%rsi), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 15(%r10), %al +; CHECK-BASELINE-NEXT: andb 15(%r8), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 16(%r9), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 16(%r10), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 17(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 17(%r9), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; 
CHECK-BASELINE-NEXT: andb 17(%r10), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 18(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 18(%r9), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 18(%r10), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 19(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 19(%r9), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 19(%r10), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 20(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 20(%r9), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 20(%r10), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 21(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 21(%r9), %r13d -; CHECK-BASELINE-NEXT: xorb %al, %r13b -; CHECK-BASELINE-NEXT: andb 21(%r10), %r13b -; CHECK-BASELINE-NEXT: xorb %al, %r13b -; CHECK-BASELINE-NEXT: movzbl 22(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 22(%r9), %r12d -; CHECK-BASELINE-NEXT: xorb %al, %r12b -; CHECK-BASELINE-NEXT: andb 22(%r10), %r12b -; CHECK-BASELINE-NEXT: xorb %al, %r12b -; CHECK-BASELINE-NEXT: movzbl 23(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 23(%r9), %r15d -; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: andb 23(%r10), %r15b -; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: movzbl 24(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 24(%r9), %r14d -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: andb 24(%r10), %r14b -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: movzbl 25(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 25(%r9), %ebp -; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: andb 25(%r10), %bpl -; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: movzbl 26(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 26(%r9), %edi +; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 16(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 16(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 17(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 17(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 18(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 18(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 19(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 19(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 19(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 20(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 20(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 20(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb 
%al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 21(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 21(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorb %dil, %r13b +; CHECK-BASELINE-NEXT: andb 21(%r8), %r13b +; CHECK-BASELINE-NEXT: xorb %dil, %r13b +; CHECK-BASELINE-NEXT: movzbl 22(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 22(%rsi), %r12d +; CHECK-BASELINE-NEXT: xorb %dil, %r12b +; CHECK-BASELINE-NEXT: andb 22(%r8), %r12b +; CHECK-BASELINE-NEXT: xorb %dil, %r12b +; CHECK-BASELINE-NEXT: movzbl 23(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 23(%rsi), %r14d +; CHECK-BASELINE-NEXT: xorb %dil, %r14b +; CHECK-BASELINE-NEXT: andb 23(%r8), %r14b +; CHECK-BASELINE-NEXT: xorb %dil, %r14b +; CHECK-BASELINE-NEXT: movzbl 24(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 24(%rsi), %ebp +; CHECK-BASELINE-NEXT: xorb %dil, %bpl +; CHECK-BASELINE-NEXT: andb 24(%r8), %bpl +; CHECK-BASELINE-NEXT: xorb %dil, %bpl +; CHECK-BASELINE-NEXT: movzbl 25(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 25(%rsi), %ebx +; CHECK-BASELINE-NEXT: xorb %dil, %bl +; CHECK-BASELINE-NEXT: andb 25(%r8), %bl +; CHECK-BASELINE-NEXT: xorb %dil, %bl +; CHECK-BASELINE-NEXT: movzbl 26(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 26(%rsi), %r11d +; CHECK-BASELINE-NEXT: xorb %dil, %r11b +; CHECK-BASELINE-NEXT: andb 26(%r8), %r11b +; CHECK-BASELINE-NEXT: xorb %dil, %r11b +; CHECK-BASELINE-NEXT: movzbl 27(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 27(%rsi), %r10d +; CHECK-BASELINE-NEXT: xorb %al, %r10b +; CHECK-BASELINE-NEXT: andb 27(%r8), %r10b +; CHECK-BASELINE-NEXT: xorb %al, %r10b +; CHECK-BASELINE-NEXT: movzbl 28(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 28(%rsi), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: andb 26(%r10), %dil +; CHECK-BASELINE-NEXT: andb 28(%r8), %dil ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: movzbl 27(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 27(%r9), %esi -; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: andb 27(%r10), %sil -; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: movzbl 28(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 28(%r9), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 28(%r10), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movzbl 29(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 29(%r9), %ecx +; CHECK-BASELINE-NEXT: movzbl 29(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 29(%rsi), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 29(%r10), %cl +; CHECK-BASELINE-NEXT: andb 29(%r8), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl 30(%r8), %ebx -; CHECK-BASELINE-NEXT: movzbl 30(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bl, %al -; CHECK-BASELINE-NEXT: andb 30(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bl, %al -; CHECK-BASELINE-NEXT: movzbl 31(%r8), %r8d -; CHECK-BASELINE-NEXT: movzbl 31(%r9), %r9d -; CHECK-BASELINE-NEXT: xorb %r8b, %r9b -; CHECK-BASELINE-NEXT: andb 31(%r10), %r9b -; CHECK-BASELINE-NEXT: xorb %r8b, %r9b -; CHECK-BASELINE-NEXT: movb %r9b, 31(%r11) -; CHECK-BASELINE-NEXT: movb %al, 30(%r11) -; CHECK-BASELINE-NEXT: movb %cl, 29(%r11) -; CHECK-BASELINE-NEXT: movb %dl, 28(%r11) -; CHECK-BASELINE-NEXT: movb %sil, 27(%r11) -; CHECK-BASELINE-NEXT: movb %dil, 26(%r11) -; CHECK-BASELINE-NEXT: movb %bpl, 25(%r11) -; CHECK-BASELINE-NEXT: movb %r14b, 24(%r11) -; CHECK-BASELINE-NEXT: movb %r15b, 23(%r11) -; CHECK-BASELINE-NEXT: movb %r12b, 22(%r11) -; CHECK-BASELINE-NEXT: movb %r13b, 21(%r11) +; CHECK-BASELINE-NEXT: movzbl 30(%rdx), %r15d +; 
CHECK-BASELINE-NEXT: movzbl 30(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: andb 30(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: movzbl 31(%rdx), %edx +; CHECK-BASELINE-NEXT: movzbl 31(%rsi), %esi +; CHECK-BASELINE-NEXT: xorb %dl, %sil +; CHECK-BASELINE-NEXT: andb 31(%r8), %sil +; CHECK-BASELINE-NEXT: xorb %dl, %sil +; CHECK-BASELINE-NEXT: movb %sil, 31(%r9) +; CHECK-BASELINE-NEXT: movb %al, 30(%r9) +; CHECK-BASELINE-NEXT: movb %cl, 29(%r9) +; CHECK-BASELINE-NEXT: movb %dil, 28(%r9) +; CHECK-BASELINE-NEXT: movb %r10b, 27(%r9) +; CHECK-BASELINE-NEXT: movb %r11b, 26(%r9) +; CHECK-BASELINE-NEXT: movb %bl, 25(%r9) +; CHECK-BASELINE-NEXT: movb %bpl, 24(%r9) +; CHECK-BASELINE-NEXT: movb %r14b, 23(%r9) +; CHECK-BASELINE-NEXT: movb %r12b, 22(%r9) +; CHECK-BASELINE-NEXT: movb %r13b, 21(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 20(%r11) +; CHECK-BASELINE-NEXT: movb %al, 20(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 19(%r11) +; CHECK-BASELINE-NEXT: movb %al, 19(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 18(%r11) +; CHECK-BASELINE-NEXT: movb %al, 18(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 17(%r11) +; CHECK-BASELINE-NEXT: movb %al, 17(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 16(%r11) +; CHECK-BASELINE-NEXT: movb %al, 16(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 15(%r11) +; CHECK-BASELINE-NEXT: movb %al, 15(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 14(%r11) +; CHECK-BASELINE-NEXT: movb %al, 14(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 13(%r11) +; CHECK-BASELINE-NEXT: movb %al, 13(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 12(%r11) +; CHECK-BASELINE-NEXT: movb %al, 12(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 11(%r11) +; CHECK-BASELINE-NEXT: movb %al, 11(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 10(%r11) +; CHECK-BASELINE-NEXT: movb %al, 10(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 9(%r11) +; CHECK-BASELINE-NEXT: movb %al, 9(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 8(%r11) +; CHECK-BASELINE-NEXT: movb %al, 8(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 7(%r11) +; CHECK-BASELINE-NEXT: movb %al, 7(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 6(%r11) +; CHECK-BASELINE-NEXT: movb %al, 6(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 5(%r11) +; CHECK-BASELINE-NEXT: movb %al, 5(%r9) ; 
CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 4(%r11) +; CHECK-BASELINE-NEXT: movb %al, 4(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 3(%r11) +; CHECK-BASELINE-NEXT: movb %al, 3(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 2(%r11) +; CHECK-BASELINE-NEXT: movb %al, 2(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 1(%r11) +; CHECK-BASELINE-NEXT: movb %al, 1(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, (%r11) -; CHECK-BASELINE-NEXT: movq %r11, %rax +; CHECK-BASELINE-NEXT: movb %al, (%r9) +; CHECK-BASELINE-NEXT: movq %r9, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -1461,10 +1453,8 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rcx, %r10 -; CHECK-SSE1-NEXT: movq %rdx, %r8 -; CHECK-SSE1-NEXT: movq %rsi, %r9 -; CHECK-SSE1-NEXT: movq %rdi, %r11 +; CHECK-SSE1-NEXT: movq %rcx, %r8 +; CHECK-SSE1-NEXT: movq %rdi, %r9 ; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax @@ -1482,236 +1472,236 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r15d ; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r12d ; CHECK-SSE1-NEXT: movzbl 5(%rdx), %r13d -; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi -; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx -; CHECK-SSE1-NEXT: movzbl 2(%r8), %edi -; CHECK-SSE1-NEXT: movzbl (%r8), %eax -; CHECK-SSE1-NEXT: movzbl 1(%r8), %ecx -; CHECK-SSE1-NEXT: movzbl (%r9), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb (%r10), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: movzbl 4(%rdx), %r11d +; CHECK-SSE1-NEXT: movzbl 3(%rdx), %r10d +; CHECK-SSE1-NEXT: movzbl 2(%rdx), %ecx +; CHECK-SSE1-NEXT: movzbl (%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 1(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl (%rsi), %ebx +; CHECK-SSE1-NEXT: xorb %dil, %bl +; CHECK-SSE1-NEXT: andb (%r8), %bl +; CHECK-SSE1-NEXT: xorb %dil, %bl ; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 1(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 1(%rsi), %edi +; CHECK-SSE1-NEXT: xorb %al, %dil +; CHECK-SSE1-NEXT: andb 1(%r8), %dil +; CHECK-SSE1-NEXT: xorb %al, %dil +; CHECK-SSE1-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 2(%rsi), %eax ; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 1(%r10), %al +; CHECK-SSE1-NEXT: andb 2(%r8), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 2(%r9), %eax -; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: andb 2(%r10), %al -; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 3(%r9), %eax -; CHECK-SSE1-NEXT: xorb %dl, %al -; CHECK-SSE1-NEXT: andb 3(%r10), %al -; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: movzbl 3(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: andb 3(%r8), %al +; CHECK-SSE1-NEXT: xorb %r10b, 
%al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 4(%r9), %eax -; CHECK-SSE1-NEXT: xorb %sil, %al -; CHECK-SSE1-NEXT: andb 4(%r10), %al -; CHECK-SSE1-NEXT: xorb %sil, %al +; CHECK-SSE1-NEXT: movzbl 4(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r11b, %al +; CHECK-SSE1-NEXT: andb 4(%r8), %al +; CHECK-SSE1-NEXT: xorb %r11b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 5(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 5(%rsi), %eax ; CHECK-SSE1-NEXT: xorb %r13b, %al -; CHECK-SSE1-NEXT: andb 5(%r10), %al +; CHECK-SSE1-NEXT: andb 5(%r8), %al ; CHECK-SSE1-NEXT: xorb %r13b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 6(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 6(%rsi), %eax ; CHECK-SSE1-NEXT: xorb %r12b, %al -; CHECK-SSE1-NEXT: andb 6(%r10), %al +; CHECK-SSE1-NEXT: andb 6(%r8), %al ; CHECK-SSE1-NEXT: xorb %r12b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 7(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 7(%rsi), %eax ; CHECK-SSE1-NEXT: xorb %r15b, %al -; CHECK-SSE1-NEXT: andb 7(%r10), %al +; CHECK-SSE1-NEXT: andb 7(%r8), %al ; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 8(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 8(%rsi), %eax ; CHECK-SSE1-NEXT: xorb %r14b, %al -; CHECK-SSE1-NEXT: andb 8(%r10), %al +; CHECK-SSE1-NEXT: andb 8(%r8), %al ; CHECK-SSE1-NEXT: xorb %r14b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 9(%rsi), %eax ; CHECK-SSE1-NEXT: xorb %bpl, %al -; CHECK-SSE1-NEXT: andb 9(%r10), %al +; CHECK-SSE1-NEXT: andb 9(%r8), %al ; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 10(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 10(%rsi), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 10(%r10), %al +; CHECK-SSE1-NEXT: andb 10(%r8), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 11(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 11(%rsi), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 11(%r10), %al +; CHECK-SSE1-NEXT: andb 11(%r8), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 12(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 12(%rsi), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 12(%r10), %al +; CHECK-SSE1-NEXT: andb 12(%r8), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 13(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 13(%rsi), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 13(%r10), %al +; CHECK-SSE1-NEXT: andb 13(%r8), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 14(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 14(%rsi), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 
1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 14(%r10), %al +; CHECK-SSE1-NEXT: andb 14(%r8), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 15(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 15(%rsi), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 15(%r10), %al +; CHECK-SSE1-NEXT: andb 15(%r8), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 16(%r9), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 16(%r10), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 17(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 17(%r9), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 17(%r10), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 18(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 18(%r9), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 18(%r10), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 19(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 19(%r9), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 19(%r10), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 20(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 20(%r9), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 20(%r10), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 21(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 21(%r9), %r13d -; CHECK-SSE1-NEXT: xorb %al, %r13b -; CHECK-SSE1-NEXT: andb 21(%r10), %r13b -; CHECK-SSE1-NEXT: xorb %al, %r13b -; CHECK-SSE1-NEXT: movzbl 22(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 22(%r9), %r12d -; CHECK-SSE1-NEXT: xorb %al, %r12b -; CHECK-SSE1-NEXT: andb 22(%r10), %r12b -; CHECK-SSE1-NEXT: xorb %al, %r12b -; CHECK-SSE1-NEXT: movzbl 23(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 23(%r9), %r15d -; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: andb 23(%r10), %r15b -; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: movzbl 24(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 24(%r9), %r14d -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: andb 24(%r10), %r14b -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: movzbl 25(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 25(%r9), %ebp -; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: andb 25(%r10), %bpl -; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: movzbl 26(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 26(%r9), %edi +; CHECK-SSE1-NEXT: movzbl 16(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 16(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 16(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 17(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 17(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 17(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 18(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 
18(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 18(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 19(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 19(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 19(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 20(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 20(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 20(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 21(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 21(%rsi), %r13d +; CHECK-SSE1-NEXT: xorb %dil, %r13b +; CHECK-SSE1-NEXT: andb 21(%r8), %r13b +; CHECK-SSE1-NEXT: xorb %dil, %r13b +; CHECK-SSE1-NEXT: movzbl 22(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 22(%rsi), %r12d +; CHECK-SSE1-NEXT: xorb %dil, %r12b +; CHECK-SSE1-NEXT: andb 22(%r8), %r12b +; CHECK-SSE1-NEXT: xorb %dil, %r12b +; CHECK-SSE1-NEXT: movzbl 23(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 23(%rsi), %r14d +; CHECK-SSE1-NEXT: xorb %dil, %r14b +; CHECK-SSE1-NEXT: andb 23(%r8), %r14b +; CHECK-SSE1-NEXT: xorb %dil, %r14b +; CHECK-SSE1-NEXT: movzbl 24(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 24(%rsi), %ebp +; CHECK-SSE1-NEXT: xorb %dil, %bpl +; CHECK-SSE1-NEXT: andb 24(%r8), %bpl +; CHECK-SSE1-NEXT: xorb %dil, %bpl +; CHECK-SSE1-NEXT: movzbl 25(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 25(%rsi), %ebx +; CHECK-SSE1-NEXT: xorb %dil, %bl +; CHECK-SSE1-NEXT: andb 25(%r8), %bl +; CHECK-SSE1-NEXT: xorb %dil, %bl +; CHECK-SSE1-NEXT: movzbl 26(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 26(%rsi), %r11d +; CHECK-SSE1-NEXT: xorb %dil, %r11b +; CHECK-SSE1-NEXT: andb 26(%r8), %r11b +; CHECK-SSE1-NEXT: xorb %dil, %r11b +; CHECK-SSE1-NEXT: movzbl 27(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 27(%rsi), %r10d +; CHECK-SSE1-NEXT: xorb %al, %r10b +; CHECK-SSE1-NEXT: andb 27(%r8), %r10b +; CHECK-SSE1-NEXT: xorb %al, %r10b +; CHECK-SSE1-NEXT: movzbl 28(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 28(%rsi), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: andb 26(%r10), %dil +; CHECK-SSE1-NEXT: andb 28(%r8), %dil ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: movzbl 27(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 27(%r9), %esi -; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: andb 27(%r10), %sil -; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: movzbl 28(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 28(%r9), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 28(%r10), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movzbl 29(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 29(%r9), %ecx +; CHECK-SSE1-NEXT: movzbl 29(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 29(%rsi), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 29(%r10), %cl +; CHECK-SSE1-NEXT: andb 29(%r8), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl 30(%r8), %ebx -; CHECK-SSE1-NEXT: movzbl 30(%r9), %eax -; CHECK-SSE1-NEXT: xorb %bl, %al -; CHECK-SSE1-NEXT: andb 30(%r10), %al -; CHECK-SSE1-NEXT: xorb %bl, %al -; CHECK-SSE1-NEXT: movzbl 31(%r8), %r8d -; CHECK-SSE1-NEXT: movzbl 31(%r9), %r9d -; CHECK-SSE1-NEXT: xorb %r8b, %r9b -; CHECK-SSE1-NEXT: andb 31(%r10), %r9b -; CHECK-SSE1-NEXT: xorb %r8b, %r9b -; CHECK-SSE1-NEXT: movb %r9b, 31(%r11) -; CHECK-SSE1-NEXT: movb %al, 30(%r11) -; CHECK-SSE1-NEXT: movb %cl, 29(%r11) -; CHECK-SSE1-NEXT: movb %dl, 
28(%r11) -; CHECK-SSE1-NEXT: movb %sil, 27(%r11) -; CHECK-SSE1-NEXT: movb %dil, 26(%r11) -; CHECK-SSE1-NEXT: movb %bpl, 25(%r11) -; CHECK-SSE1-NEXT: movb %r14b, 24(%r11) -; CHECK-SSE1-NEXT: movb %r15b, 23(%r11) -; CHECK-SSE1-NEXT: movb %r12b, 22(%r11) -; CHECK-SSE1-NEXT: movb %r13b, 21(%r11) +; CHECK-SSE1-NEXT: movzbl 30(%rdx), %r15d +; CHECK-SSE1-NEXT: movzbl 30(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: andb 30(%r8), %al +; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: movzbl 31(%rdx), %edx +; CHECK-SSE1-NEXT: movzbl 31(%rsi), %esi +; CHECK-SSE1-NEXT: xorb %dl, %sil +; CHECK-SSE1-NEXT: andb 31(%r8), %sil +; CHECK-SSE1-NEXT: xorb %dl, %sil +; CHECK-SSE1-NEXT: movb %sil, 31(%r9) +; CHECK-SSE1-NEXT: movb %al, 30(%r9) +; CHECK-SSE1-NEXT: movb %cl, 29(%r9) +; CHECK-SSE1-NEXT: movb %dil, 28(%r9) +; CHECK-SSE1-NEXT: movb %r10b, 27(%r9) +; CHECK-SSE1-NEXT: movb %r11b, 26(%r9) +; CHECK-SSE1-NEXT: movb %bl, 25(%r9) +; CHECK-SSE1-NEXT: movb %bpl, 24(%r9) +; CHECK-SSE1-NEXT: movb %r14b, 23(%r9) +; CHECK-SSE1-NEXT: movb %r12b, 22(%r9) +; CHECK-SSE1-NEXT: movb %r13b, 21(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 20(%r11) +; CHECK-SSE1-NEXT: movb %al, 20(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 19(%r11) +; CHECK-SSE1-NEXT: movb %al, 19(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 18(%r11) +; CHECK-SSE1-NEXT: movb %al, 18(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 17(%r11) +; CHECK-SSE1-NEXT: movb %al, 17(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 16(%r11) +; CHECK-SSE1-NEXT: movb %al, 16(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 15(%r11) +; CHECK-SSE1-NEXT: movb %al, 15(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 14(%r11) +; CHECK-SSE1-NEXT: movb %al, 14(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 13(%r11) +; CHECK-SSE1-NEXT: movb %al, 13(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 12(%r11) +; CHECK-SSE1-NEXT: movb %al, 12(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 11(%r11) +; CHECK-SSE1-NEXT: movb %al, 11(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 10(%r11) +; CHECK-SSE1-NEXT: movb %al, 10(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 9(%r11) +; CHECK-SSE1-NEXT: movb %al, 9(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 8(%r11) +; CHECK-SSE1-NEXT: movb %al, 8(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 7(%r11) +; CHECK-SSE1-NEXT: movb %al, 7(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 6(%r11) +; CHECK-SSE1-NEXT: movb %al, 6(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; 
CHECK-SSE1-NEXT: movb %al, 5(%r11) +; CHECK-SSE1-NEXT: movb %al, 5(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 4(%r11) +; CHECK-SSE1-NEXT: movb %al, 4(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 3(%r11) +; CHECK-SSE1-NEXT: movb %al, 3(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 2(%r11) +; CHECK-SSE1-NEXT: movb %al, 2(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 1(%r11) +; CHECK-SSE1-NEXT: movb %al, 1(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, (%r11) -; CHECK-SSE1-NEXT: movq %r11, %rax +; CHECK-SSE1-NEXT: movb %al, (%r9) +; CHECK-SSE1-NEXT: movq %r9, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -1759,81 +1749,82 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx +; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d ; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d ; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp ; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %ebx -; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r11d -; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzwl (%rdx), %r8d +; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r11d +; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r8d +; CHECK-BASELINE-NEXT: movzwl (%rdx), %edi ; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r8w, %ax -; CHECK-BASELINE-NEXT: andw (%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d +; CHECK-BASELINE-NEXT: movzwl (%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %di, %r13w +; CHECK-BASELINE-NEXT: andw (%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %edi +; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %r12w, %r13w +; CHECK-BASELINE-NEXT: andw 2(%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %r12d +; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %r8w, %r13w +; CHECK-BASELINE-NEXT: andw 4(%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %r8d ; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r12w, %ax -; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r12d -; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r9w, %ax -; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r9d +; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %r9w, %r13w +; CHECK-BASELINE-NEXT: andw 6(%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %r9d ; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r10w, %ax -; CHECK-BASELINE-NEXT: andw 6(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r10d +; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %r13d +; 
CHECK-BASELINE-NEXT: xorw %r10w, %r13w +; CHECK-BASELINE-NEXT: andw 8(%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %r10d ; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r11w, %ax -; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r11d +; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %r11w, %r13w +; CHECK-BASELINE-NEXT: andw 10(%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %r11d ; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax -; CHECK-BASELINE-NEXT: andw 10(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d -; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bx, %ax -; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebx -; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bp, %ax -; CHECK-BASELINE-NEXT: andw 14(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebp -; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r14w, %ax -; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r14d -; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r15w, %ax -; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r15d +; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %bx, %r13w +; CHECK-BASELINE-NEXT: andw 12(%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %ebx +; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %bp, %r13w +; CHECK-BASELINE-NEXT: andw 14(%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %ebp +; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %r14w, %r13w +; CHECK-BASELINE-NEXT: andw 16(%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %r14d +; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %r15w, %r13w +; CHECK-BASELINE-NEXT: andw 18(%rcx), %r13w +; CHECK-BASELINE-NEXT: xorl %r13d, %r15d ; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax -; CHECK-BASELINE-NEXT: andw 20(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d -; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r9w, %ax -; CHECK-BASELINE-NEXT: andw 22(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r9d +; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %edi +; CHECK-BASELINE-NEXT: xorw %r13w, %di +; CHECK-BASELINE-NEXT: andw 20(%rcx), %di +; CHECK-BASELINE-NEXT: xorl %edi, %r13d +; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %edi +; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %r8d +; CHECK-BASELINE-NEXT: xorw %di, %r8w +; CHECK-BASELINE-NEXT: andw 22(%rcx), %r8w +; CHECK-BASELINE-NEXT: xorl %r8d, %edi ; CHECK-BASELINE-NEXT: movzwl 24(%rdx), %r8d -; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r8w, %ax -; CHECK-BASELINE-NEXT: andw 24(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d -; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax +; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %r9d +; CHECK-BASELINE-NEXT: xorw %r8w, %r9w +; CHECK-BASELINE-NEXT: andw 24(%rcx), %r9w +; CHECK-BASELINE-NEXT: xorl %r9d, %r8d +; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %r9d ; CHECK-BASELINE-NEXT: 
movzwl 26(%rsi), %r10d -; CHECK-BASELINE-NEXT: xorw %ax, %r10w +; CHECK-BASELINE-NEXT: xorw %r9w, %r10w ; CHECK-BASELINE-NEXT: andw 26(%rcx), %r10w -; CHECK-BASELINE-NEXT: xorl %r10d, %eax +; CHECK-BASELINE-NEXT: xorl %r10d, %r9d ; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r10d ; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r11d ; CHECK-BASELINE-NEXT: xorw %r10w, %r11w @@ -1844,28 +1835,27 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: xorw %dx, %si ; CHECK-BASELINE-NEXT: andw 30(%rcx), %si ; CHECK-BASELINE-NEXT: xorl %esi, %edx -; CHECK-BASELINE-NEXT: movw %dx, 30(%rdi) -; CHECK-BASELINE-NEXT: movw %r10w, 28(%rdi) -; CHECK-BASELINE-NEXT: movw %ax, 26(%rdi) -; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi) -; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi) -; CHECK-BASELINE-NEXT: movw %r13w, 20(%rdi) -; CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi) -; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi) -; CHECK-BASELINE-NEXT: movw %bp, 14(%rdi) -; CHECK-BASELINE-NEXT: movw %bx, 12(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 10(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 8(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 4(%rdi) -; CHECK-BASELINE-NEXT: movw %r12w, 2(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, (%rdi) -; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: movw %dx, 30(%rax) +; CHECK-BASELINE-NEXT: movw %r10w, 28(%rax) +; CHECK-BASELINE-NEXT: movw %r9w, 26(%rax) +; CHECK-BASELINE-NEXT: movw %r8w, 24(%rax) +; CHECK-BASELINE-NEXT: movw %di, 22(%rax) +; CHECK-BASELINE-NEXT: movw %r13w, 20(%rax) +; CHECK-BASELINE-NEXT: movw %r15w, 18(%rax) +; CHECK-BASELINE-NEXT: movw %r14w, 16(%rax) +; CHECK-BASELINE-NEXT: movw %bp, 14(%rax) +; CHECK-BASELINE-NEXT: movw %bx, 12(%rax) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, 10(%rax) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, 8(%rax) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, 6(%rax) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) +; CHECK-BASELINE-NEXT: movw %r12w, 2(%rax) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -1882,81 +1872,82 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d ; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d ; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp ; CHECK-SSE1-NEXT: movzwl 12(%rdx), %ebx -; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r13d -; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r11d -; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r10d -; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r9d -; CHECK-SSE1-NEXT: movzwl (%rdx), %r8d +; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r11d +; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r10d +; 
CHECK-SSE1-NEXT: movzwl 6(%rdx), %r9d +; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r8d +; CHECK-SSE1-NEXT: movzwl (%rdx), %edi ; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r12d -; CHECK-SSE1-NEXT: movzwl (%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r8w, %ax -; CHECK-SSE1-NEXT: andw (%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d +; CHECK-SSE1-NEXT: movzwl (%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %di, %r13w +; CHECK-SSE1-NEXT: andw (%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %edi +; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 2(%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %r12w, %r13w +; CHECK-SSE1-NEXT: andw 2(%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %r12d +; CHECK-SSE1-NEXT: movzwl 4(%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %r8w, %r13w +; CHECK-SSE1-NEXT: andw 4(%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %r8d ; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r12w, %ax -; CHECK-SSE1-NEXT: andw 2(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r12d -; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r9w, %ax -; CHECK-SSE1-NEXT: andw 4(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r9d +; CHECK-SSE1-NEXT: movzwl 6(%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %r9w, %r13w +; CHECK-SSE1-NEXT: andw 6(%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %r9d ; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r10w, %ax -; CHECK-SSE1-NEXT: andw 6(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r10d +; CHECK-SSE1-NEXT: movzwl 8(%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %r10w, %r13w +; CHECK-SSE1-NEXT: andw 8(%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %r10d ; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r11w, %ax -; CHECK-SSE1-NEXT: andw 8(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r11d +; CHECK-SSE1-NEXT: movzwl 10(%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %r11w, %r13w +; CHECK-SSE1-NEXT: andw 10(%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %r11d ; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax -; CHECK-SSE1-NEXT: andw 10(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d -; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bx, %ax -; CHECK-SSE1-NEXT: andw 12(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebx -; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bp, %ax -; CHECK-SSE1-NEXT: andw 14(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebp -; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r14w, %ax -; CHECK-SSE1-NEXT: andw 16(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r14d -; CHECK-SSE1-NEXT: movzwl 18(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r15w, %ax -; CHECK-SSE1-NEXT: andw 18(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r15d +; CHECK-SSE1-NEXT: movzwl 12(%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %bx, %r13w +; CHECK-SSE1-NEXT: andw 12(%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %ebx +; CHECK-SSE1-NEXT: movzwl 14(%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %bp, %r13w +; CHECK-SSE1-NEXT: andw 14(%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %ebp +; CHECK-SSE1-NEXT: movzwl 16(%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %r14w, %r13w +; CHECK-SSE1-NEXT: andw 16(%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %r14d +; CHECK-SSE1-NEXT: movzwl 18(%rsi), 
%r13d +; CHECK-SSE1-NEXT: xorw %r15w, %r13w +; CHECK-SSE1-NEXT: andw 18(%rcx), %r13w +; CHECK-SSE1-NEXT: xorl %r13d, %r15d ; CHECK-SSE1-NEXT: movzwl 20(%rdx), %r13d -; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax -; CHECK-SSE1-NEXT: andw 20(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d -; CHECK-SSE1-NEXT: movzwl 22(%rdx), %r9d -; CHECK-SSE1-NEXT: movzwl 22(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r9w, %ax -; CHECK-SSE1-NEXT: andw 22(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r9d +; CHECK-SSE1-NEXT: movzwl 20(%rsi), %edi +; CHECK-SSE1-NEXT: xorw %r13w, %di +; CHECK-SSE1-NEXT: andw 20(%rcx), %di +; CHECK-SSE1-NEXT: xorl %edi, %r13d +; CHECK-SSE1-NEXT: movzwl 22(%rdx), %edi +; CHECK-SSE1-NEXT: movzwl 22(%rsi), %r8d +; CHECK-SSE1-NEXT: xorw %di, %r8w +; CHECK-SSE1-NEXT: andw 22(%rcx), %r8w +; CHECK-SSE1-NEXT: xorl %r8d, %edi ; CHECK-SSE1-NEXT: movzwl 24(%rdx), %r8d -; CHECK-SSE1-NEXT: movzwl 24(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r8w, %ax -; CHECK-SSE1-NEXT: andw 24(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d -; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax +; CHECK-SSE1-NEXT: movzwl 24(%rsi), %r9d +; CHECK-SSE1-NEXT: xorw %r8w, %r9w +; CHECK-SSE1-NEXT: andw 24(%rcx), %r9w +; CHECK-SSE1-NEXT: xorl %r9d, %r8d +; CHECK-SSE1-NEXT: movzwl 26(%rdx), %r9d ; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r10d -; CHECK-SSE1-NEXT: xorw %ax, %r10w +; CHECK-SSE1-NEXT: xorw %r9w, %r10w ; CHECK-SSE1-NEXT: andw 26(%rcx), %r10w -; CHECK-SSE1-NEXT: xorl %r10d, %eax +; CHECK-SSE1-NEXT: xorl %r10d, %r9d ; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r10d ; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r11d ; CHECK-SSE1-NEXT: xorw %r10w, %r11w @@ -1967,28 +1958,27 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: xorw %dx, %si ; CHECK-SSE1-NEXT: andw 30(%rcx), %si ; CHECK-SSE1-NEXT: xorl %esi, %edx -; CHECK-SSE1-NEXT: movw %dx, 30(%rdi) -; CHECK-SSE1-NEXT: movw %r10w, 28(%rdi) -; CHECK-SSE1-NEXT: movw %ax, 26(%rdi) -; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi) -; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi) -; CHECK-SSE1-NEXT: movw %r13w, 20(%rdi) -; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi) -; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi) -; CHECK-SSE1-NEXT: movw %bp, 14(%rdi) -; CHECK-SSE1-NEXT: movw %bx, 12(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 10(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 8(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 4(%rdi) -; CHECK-SSE1-NEXT: movw %r12w, 2(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, (%rdi) -; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movw %dx, 30(%rax) +; CHECK-SSE1-NEXT: movw %r10w, 28(%rax) +; CHECK-SSE1-NEXT: movw %r9w, 26(%rax) +; CHECK-SSE1-NEXT: movw %r8w, 24(%rax) +; CHECK-SSE1-NEXT: movw %di, 22(%rax) +; CHECK-SSE1-NEXT: movw %r13w, 20(%rax) +; CHECK-SSE1-NEXT: movw %r15w, 18(%rax) +; CHECK-SSE1-NEXT: movw %r14w, 16(%rax) +; CHECK-SSE1-NEXT: movw %bp, 14(%rax) +; CHECK-SSE1-NEXT: movw %bx, 12(%rax) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: movw %cx, 10(%rax) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: movw %cx, 8(%rax) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte 
Reload +; CHECK-SSE1-NEXT: movw %cx, 6(%rax) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: movw %cx, 4(%rax) +; CHECK-SSE1-NEXT: movw %r12w, 2(%rax) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: movw %cx, (%rax) ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -2035,52 +2025,52 @@ define <8 x i32> @out_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: movl 28(%rdx), %edi -; CHECK-BASELINE-NEXT: movl 24(%rdx), %r8d +; CHECK-BASELINE-NEXT: movl 24(%rdx), %r9d ; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d -; CHECK-BASELINE-NEXT: movl 16(%rdx), %ebx -; CHECK-BASELINE-NEXT: movl 12(%rdx), %r14d +; CHECK-BASELINE-NEXT: movl 16(%rdx), %r11d +; CHECK-BASELINE-NEXT: movl 12(%rdx), %ebx ; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebp -; CHECK-BASELINE-NEXT: movl (%rdx), %r9d -; CHECK-BASELINE-NEXT: movl 4(%rdx), %r11d +; CHECK-BASELINE-NEXT: movl (%rdx), %r8d +; CHECK-BASELINE-NEXT: movl 4(%rdx), %r14d ; CHECK-BASELINE-NEXT: movl (%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: xorl %r8d, %edx ; CHECK-BASELINE-NEXT: andl (%rcx), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx -; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d -; CHECK-BASELINE-NEXT: xorl %r11d, %r9d -; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d -; CHECK-BASELINE-NEXT: xorl %r11d, %r9d -; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d -; CHECK-BASELINE-NEXT: xorl %ebp, %r11d -; CHECK-BASELINE-NEXT: andl 8(%rcx), %r11d -; CHECK-BASELINE-NEXT: xorl %ebp, %r11d +; CHECK-BASELINE-NEXT: xorl %r8d, %edx +; CHECK-BASELINE-NEXT: movl 4(%rsi), %r8d +; CHECK-BASELINE-NEXT: xorl %r14d, %r8d +; CHECK-BASELINE-NEXT: andl 4(%rcx), %r8d +; CHECK-BASELINE-NEXT: xorl %r14d, %r8d +; CHECK-BASELINE-NEXT: movl 8(%rsi), %r14d +; CHECK-BASELINE-NEXT: xorl %ebp, %r14d +; CHECK-BASELINE-NEXT: andl 8(%rcx), %r14d +; CHECK-BASELINE-NEXT: xorl %ebp, %r14d ; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebp -; CHECK-BASELINE-NEXT: xorl %r14d, %ebp +; CHECK-BASELINE-NEXT: xorl %ebx, %ebp ; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebp -; CHECK-BASELINE-NEXT: xorl %r14d, %ebp -; CHECK-BASELINE-NEXT: movl 16(%rsi), %r14d -; CHECK-BASELINE-NEXT: xorl %ebx, %r14d -; CHECK-BASELINE-NEXT: andl 16(%rcx), %r14d -; CHECK-BASELINE-NEXT: xorl %ebx, %r14d -; CHECK-BASELINE-NEXT: movl 20(%rsi), %ebx -; CHECK-BASELINE-NEXT: xorl %r10d, %ebx -; CHECK-BASELINE-NEXT: andl 20(%rcx), %ebx -; CHECK-BASELINE-NEXT: xorl %r10d, %ebx +; CHECK-BASELINE-NEXT: xorl %ebx, %ebp +; CHECK-BASELINE-NEXT: movl 16(%rsi), %ebx +; CHECK-BASELINE-NEXT: xorl %r11d, %ebx +; CHECK-BASELINE-NEXT: andl 16(%rcx), %ebx +; CHECK-BASELINE-NEXT: xorl %r11d, %ebx +; CHECK-BASELINE-NEXT: movl 20(%rsi), %r11d +; CHECK-BASELINE-NEXT: xorl %r10d, %r11d +; CHECK-BASELINE-NEXT: andl 20(%rcx), %r11d +; CHECK-BASELINE-NEXT: xorl %r10d, %r11d ; CHECK-BASELINE-NEXT: movl 24(%rsi), %r10d -; CHECK-BASELINE-NEXT: xorl %r8d, %r10d +; CHECK-BASELINE-NEXT: xorl %r9d, %r10d ; CHECK-BASELINE-NEXT: andl 24(%rcx), %r10d -; CHECK-BASELINE-NEXT: xorl %r8d, %r10d +; CHECK-BASELINE-NEXT: xorl %r9d, %r10d ; CHECK-BASELINE-NEXT: movl 28(%rsi), %esi ; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: andl 28(%rcx), %esi ; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: movl %esi, 28(%rax) ; CHECK-BASELINE-NEXT: movl %r10d, 24(%rax) -; CHECK-BASELINE-NEXT: movl %ebx, 20(%rax) -; CHECK-BASELINE-NEXT: movl 
%r14d, 16(%rax) +; CHECK-BASELINE-NEXT: movl %r11d, 20(%rax) +; CHECK-BASELINE-NEXT: movl %ebx, 16(%rax) ; CHECK-BASELINE-NEXT: movl %ebp, 12(%rax) -; CHECK-BASELINE-NEXT: movl %r11d, 8(%rax) -; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax) +; CHECK-BASELINE-NEXT: movl %r14d, 8(%rax) +; CHECK-BASELINE-NEXT: movl %r8d, 4(%rax) ; CHECK-BASELINE-NEXT: movl %edx, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r14 @@ -2094,52 +2084,52 @@ define <8 x i32> @out_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movl 28(%rdx), %edi -; CHECK-SSE1-NEXT: movl 24(%rdx), %r8d +; CHECK-SSE1-NEXT: movl 24(%rdx), %r9d ; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d -; CHECK-SSE1-NEXT: movl 16(%rdx), %ebx -; CHECK-SSE1-NEXT: movl 12(%rdx), %r14d +; CHECK-SSE1-NEXT: movl 16(%rdx), %r11d +; CHECK-SSE1-NEXT: movl 12(%rdx), %ebx ; CHECK-SSE1-NEXT: movl 8(%rdx), %ebp -; CHECK-SSE1-NEXT: movl (%rdx), %r9d -; CHECK-SSE1-NEXT: movl 4(%rdx), %r11d +; CHECK-SSE1-NEXT: movl (%rdx), %r8d +; CHECK-SSE1-NEXT: movl 4(%rdx), %r14d ; CHECK-SSE1-NEXT: movl (%rsi), %edx -; CHECK-SSE1-NEXT: xorl %r9d, %edx +; CHECK-SSE1-NEXT: xorl %r8d, %edx ; CHECK-SSE1-NEXT: andl (%rcx), %edx -; CHECK-SSE1-NEXT: xorl %r9d, %edx -; CHECK-SSE1-NEXT: movl 4(%rsi), %r9d -; CHECK-SSE1-NEXT: xorl %r11d, %r9d -; CHECK-SSE1-NEXT: andl 4(%rcx), %r9d -; CHECK-SSE1-NEXT: xorl %r11d, %r9d -; CHECK-SSE1-NEXT: movl 8(%rsi), %r11d -; CHECK-SSE1-NEXT: xorl %ebp, %r11d -; CHECK-SSE1-NEXT: andl 8(%rcx), %r11d -; CHECK-SSE1-NEXT: xorl %ebp, %r11d +; CHECK-SSE1-NEXT: xorl %r8d, %edx +; CHECK-SSE1-NEXT: movl 4(%rsi), %r8d +; CHECK-SSE1-NEXT: xorl %r14d, %r8d +; CHECK-SSE1-NEXT: andl 4(%rcx), %r8d +; CHECK-SSE1-NEXT: xorl %r14d, %r8d +; CHECK-SSE1-NEXT: movl 8(%rsi), %r14d +; CHECK-SSE1-NEXT: xorl %ebp, %r14d +; CHECK-SSE1-NEXT: andl 8(%rcx), %r14d +; CHECK-SSE1-NEXT: xorl %ebp, %r14d ; CHECK-SSE1-NEXT: movl 12(%rsi), %ebp -; CHECK-SSE1-NEXT: xorl %r14d, %ebp +; CHECK-SSE1-NEXT: xorl %ebx, %ebp ; CHECK-SSE1-NEXT: andl 12(%rcx), %ebp -; CHECK-SSE1-NEXT: xorl %r14d, %ebp -; CHECK-SSE1-NEXT: movl 16(%rsi), %r14d -; CHECK-SSE1-NEXT: xorl %ebx, %r14d -; CHECK-SSE1-NEXT: andl 16(%rcx), %r14d -; CHECK-SSE1-NEXT: xorl %ebx, %r14d -; CHECK-SSE1-NEXT: movl 20(%rsi), %ebx -; CHECK-SSE1-NEXT: xorl %r10d, %ebx -; CHECK-SSE1-NEXT: andl 20(%rcx), %ebx -; CHECK-SSE1-NEXT: xorl %r10d, %ebx -; CHECK-SSE1-NEXT: movl 24(%rsi), %r10d -; CHECK-SSE1-NEXT: xorl %r8d, %r10d -; CHECK-SSE1-NEXT: andl 24(%rcx), %r10d -; CHECK-SSE1-NEXT: xorl %r8d, %r10d -; CHECK-SSE1-NEXT: movl 28(%rsi), %esi +; CHECK-SSE1-NEXT: xorl %ebx, %ebp +; CHECK-SSE1-NEXT: movl 16(%rsi), %ebx +; CHECK-SSE1-NEXT: xorl %r11d, %ebx +; CHECK-SSE1-NEXT: andl 16(%rcx), %ebx +; CHECK-SSE1-NEXT: xorl %r11d, %ebx +; CHECK-SSE1-NEXT: movl 20(%rsi), %r11d +; CHECK-SSE1-NEXT: xorl %r10d, %r11d +; CHECK-SSE1-NEXT: andl 20(%rcx), %r11d +; CHECK-SSE1-NEXT: xorl %r10d, %r11d +; CHECK-SSE1-NEXT: movl 24(%rsi), %r10d +; CHECK-SSE1-NEXT: xorl %r9d, %r10d +; CHECK-SSE1-NEXT: andl 24(%rcx), %r10d +; CHECK-SSE1-NEXT: xorl %r9d, %r10d +; CHECK-SSE1-NEXT: movl 28(%rsi), %esi ; CHECK-SSE1-NEXT: xorl %edi, %esi ; CHECK-SSE1-NEXT: andl 28(%rcx), %esi ; CHECK-SSE1-NEXT: xorl %edi, %esi ; CHECK-SSE1-NEXT: movl %esi, 28(%rax) ; CHECK-SSE1-NEXT: movl %r10d, 24(%rax) -; CHECK-SSE1-NEXT: movl %ebx, 20(%rax) -; CHECK-SSE1-NEXT: movl %r14d, 16(%rax) +; CHECK-SSE1-NEXT: movl %r11d, 20(%rax) +; CHECK-SSE1-NEXT: movl %ebx, 16(%rax) ; CHECK-SSE1-NEXT: movl %ebp, 
12(%rax) -; CHECK-SSE1-NEXT: movl %r11d, 8(%rax) -; CHECK-SSE1-NEXT: movl %r9d, 4(%rax) +; CHECK-SSE1-NEXT: movl %r14d, 8(%rax) +; CHECK-SSE1-NEXT: movl %r8d, 4(%rax) ; CHECK-SSE1-NEXT: movl %edx, (%rax) ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r14 @@ -2183,27 +2173,27 @@ define <4 x i64> @out_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movq 24(%rdx), %rdi ; CHECK-BASELINE-NEXT: movq 16(%rdx), %r8 ; CHECK-BASELINE-NEXT: movq (%rdx), %r9 -; CHECK-BASELINE-NEXT: movq 8(%rdx), %r10 -; CHECK-BASELINE-NEXT: movq (%rsi), %rdx -; CHECK-BASELINE-NEXT: xorq %r9, %rdx -; CHECK-BASELINE-NEXT: andq (%rcx), %rdx -; CHECK-BASELINE-NEXT: xorq %r9, %rdx +; CHECK-BASELINE-NEXT: movq 8(%rdx), %rdx +; CHECK-BASELINE-NEXT: movq (%rsi), %r10 +; CHECK-BASELINE-NEXT: xorq %r9, %r10 +; CHECK-BASELINE-NEXT: andq (%rcx), %r10 +; CHECK-BASELINE-NEXT: xorq %r9, %r10 ; CHECK-BASELINE-NEXT: movq 8(%rsi), %r9 -; CHECK-BASELINE-NEXT: xorq %r10, %r9 +; CHECK-BASELINE-NEXT: xorq %rdx, %r9 ; CHECK-BASELINE-NEXT: andq 8(%rcx), %r9 -; CHECK-BASELINE-NEXT: xorq %r10, %r9 -; CHECK-BASELINE-NEXT: movq 16(%rsi), %r10 -; CHECK-BASELINE-NEXT: xorq %r8, %r10 -; CHECK-BASELINE-NEXT: andq 16(%rcx), %r10 -; CHECK-BASELINE-NEXT: xorq %r8, %r10 +; CHECK-BASELINE-NEXT: xorq %rdx, %r9 +; CHECK-BASELINE-NEXT: movq 16(%rsi), %rdx +; CHECK-BASELINE-NEXT: xorq %r8, %rdx +; CHECK-BASELINE-NEXT: andq 16(%rcx), %rdx +; CHECK-BASELINE-NEXT: xorq %r8, %rdx ; CHECK-BASELINE-NEXT: movq 24(%rsi), %rsi ; CHECK-BASELINE-NEXT: xorq %rdi, %rsi ; CHECK-BASELINE-NEXT: andq 24(%rcx), %rsi ; CHECK-BASELINE-NEXT: xorq %rdi, %rsi ; CHECK-BASELINE-NEXT: movq %rsi, 24(%rax) -; CHECK-BASELINE-NEXT: movq %r10, 16(%rax) +; CHECK-BASELINE-NEXT: movq %rdx, 16(%rax) ; CHECK-BASELINE-NEXT: movq %r9, 8(%rax) -; CHECK-BASELINE-NEXT: movq %rdx, (%rax) +; CHECK-BASELINE-NEXT: movq %r10, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i64: @@ -2212,27 +2202,27 @@ define <4 x i64> @out_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movq 24(%rdx), %rdi ; CHECK-SSE1-NEXT: movq 16(%rdx), %r8 ; CHECK-SSE1-NEXT: movq (%rdx), %r9 -; CHECK-SSE1-NEXT: movq 8(%rdx), %r10 -; CHECK-SSE1-NEXT: movq (%rsi), %rdx -; CHECK-SSE1-NEXT: xorq %r9, %rdx -; CHECK-SSE1-NEXT: andq (%rcx), %rdx -; CHECK-SSE1-NEXT: xorq %r9, %rdx +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movq (%rsi), %r10 +; CHECK-SSE1-NEXT: xorq %r9, %r10 +; CHECK-SSE1-NEXT: andq (%rcx), %r10 +; CHECK-SSE1-NEXT: xorq %r9, %r10 ; CHECK-SSE1-NEXT: movq 8(%rsi), %r9 -; CHECK-SSE1-NEXT: xorq %r10, %r9 +; CHECK-SSE1-NEXT: xorq %rdx, %r9 ; CHECK-SSE1-NEXT: andq 8(%rcx), %r9 -; CHECK-SSE1-NEXT: xorq %r10, %r9 -; CHECK-SSE1-NEXT: movq 16(%rsi), %r10 -; CHECK-SSE1-NEXT: xorq %r8, %r10 -; CHECK-SSE1-NEXT: andq 16(%rcx), %r10 -; CHECK-SSE1-NEXT: xorq %r8, %r10 +; CHECK-SSE1-NEXT: xorq %rdx, %r9 +; CHECK-SSE1-NEXT: movq 16(%rsi), %rdx +; CHECK-SSE1-NEXT: xorq %r8, %rdx +; CHECK-SSE1-NEXT: andq 16(%rcx), %rdx +; CHECK-SSE1-NEXT: xorq %r8, %rdx ; CHECK-SSE1-NEXT: movq 24(%rsi), %rsi ; CHECK-SSE1-NEXT: xorq %rdi, %rsi ; CHECK-SSE1-NEXT: andq 24(%rcx), %rsi ; CHECK-SSE1-NEXT: xorq %rdi, %rsi ; CHECK-SSE1-NEXT: movq %rsi, 24(%rax) -; CHECK-SSE1-NEXT: movq %r10, 16(%rax) +; CHECK-SSE1-NEXT: movq %rdx, 16(%rax) ; CHECK-SSE1-NEXT: movq %r9, 8(%rax) -; CHECK-SSE1-NEXT: movq %rdx, (%rax) +; CHECK-SSE1-NEXT: movq %r10, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i64: @@ -2492,45 +2482,46 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) 
nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl %ecx, %r10d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorb %r11b, %sil -; CHECK-BASELINE-NEXT: xorb %r12b, %dl -; CHECK-BASELINE-NEXT: xorb %r14b, %cl +; CHECK-BASELINE-NEXT: xorb %r12b, %sil +; CHECK-BASELINE-NEXT: xorb %r15b, %dl +; CHECK-BASELINE-NEXT: xorb %r14b, %r10b ; CHECK-BASELINE-NEXT: xorb %bpl, %r8b ; CHECK-BASELINE-NEXT: xorb %bl, %r9b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: xorb %r11b, %al ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-BASELINE-NEXT: xorb %r11b, %sil -; CHECK-BASELINE-NEXT: xorb %r12b, %dl -; CHECK-BASELINE-NEXT: xorb %r14b, %cl +; CHECK-BASELINE-NEXT: xorb %r12b, %sil +; CHECK-BASELINE-NEXT: xorb %r15b, %dl +; CHECK-BASELINE-NEXT: xorb %r14b, %r10b ; CHECK-BASELINE-NEXT: xorb %bpl, %r8b ; CHECK-BASELINE-NEXT: xorb %bl, %r9b -; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: xorb %r11b, %al ; CHECK-BASELINE-NEXT: movb %al, 7(%rdi) -; CHECK-BASELINE-NEXT: movb %r13b, 6(%rdi) -; CHECK-BASELINE-NEXT: movb %r15b, 5(%rdi) +; CHECK-BASELINE-NEXT: movb %cl, 6(%rdi) +; CHECK-BASELINE-NEXT: movb %r13b, 5(%rdi) ; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi) ; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdi) -; CHECK-BASELINE-NEXT: movb %cl, 2(%rdi) +; CHECK-BASELINE-NEXT: movb %r10b, 2(%rdi) ; CHECK-BASELINE-NEXT: movb %dl, 1(%rdi) ; CHECK-BASELINE-NEXT: movb %sil, (%rdi) ; CHECK-BASELINE-NEXT: movq %rdi, %rax @@ -2550,45 +2541,46 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl %ecx, %r10d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-SSE1-NEXT: movzbl 
{{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorb %r11b, %sil -; CHECK-SSE1-NEXT: xorb %r12b, %dl -; CHECK-SSE1-NEXT: xorb %r14b, %cl +; CHECK-SSE1-NEXT: xorb %r12b, %sil +; CHECK-SSE1-NEXT: xorb %r15b, %dl +; CHECK-SSE1-NEXT: xorb %r14b, %r10b ; CHECK-SSE1-NEXT: xorb %bpl, %r8b ; CHECK-SSE1-NEXT: xorb %bl, %r9b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: xorb %r11b, %al ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-SSE1-NEXT: xorb %r11b, %sil -; CHECK-SSE1-NEXT: xorb %r12b, %dl -; CHECK-SSE1-NEXT: xorb %r14b, %cl +; CHECK-SSE1-NEXT: xorb %r12b, %sil +; CHECK-SSE1-NEXT: xorb %r15b, %dl +; CHECK-SSE1-NEXT: xorb %r14b, %r10b ; CHECK-SSE1-NEXT: xorb %bpl, %r8b ; CHECK-SSE1-NEXT: xorb %bl, %r9b -; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: xorb %r11b, %al ; CHECK-SSE1-NEXT: movb %al, 7(%rdi) -; CHECK-SSE1-NEXT: movb %r13b, 6(%rdi) -; CHECK-SSE1-NEXT: movb %r15b, 5(%rdi) +; CHECK-SSE1-NEXT: movb %cl, 6(%rdi) +; CHECK-SSE1-NEXT: movb %r13b, 5(%rdi) ; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi) ; CHECK-SSE1-NEXT: movb %r8b, 3(%rdi) -; CHECK-SSE1-NEXT: movb %cl, 2(%rdi) +; CHECK-SSE1-NEXT: movb %r10b, 2(%rdi) ; CHECK-SSE1-NEXT: movb %dl, 1(%rdi) ; CHECK-SSE1-NEXT: movb %sil, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax @@ -2754,103 +2746,99 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movq %rdi, %rdx -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; 
CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: xorb %dil, %r9b +; CHECK-BASELINE-NEXT: xorb %r10b, %r9b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b -; CHECK-BASELINE-NEXT: xorb %dil, %r9b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: xorb %r10b, %dil -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dil -; CHECK-BASELINE-NEXT: xorb %r10b, %dil +; CHECK-BASELINE-NEXT: xorb %r10b, %r9b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: xorb %r11b, %r10b +; CHECK-BASELINE-NEXT: xorb %bl, %r10b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: xorb %r11b, %r10b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorb %r13b, %r11b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: xorb %r13b, %r11b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-BASELINE-NEXT: xorb %r12b, %r13b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: xorb %r12b, %r13b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-BASELINE-NEXT: xorb %r15b, %r12b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b -; CHECK-BASELINE-NEXT: xorb %r15b, %r12b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: xorb %r14b, %r15b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-BASELINE-NEXT: xorb %r14b, %r15b +; CHECK-BASELINE-NEXT: xorb %bl, %r10b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: xorb %r14b, %bl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %r14b, %bl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: xorb %bpl, %r14b +; CHECK-BASELINE-NEXT: xorb %r12b, %r14b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-BASELINE-NEXT: xorb %bpl, %r14b +; CHECK-BASELINE-NEXT: xorb %r12b, %r14b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; CHECK-BASELINE-NEXT: xorb %cl, %r12b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-BASELINE-NEXT: xorb %cl, %r12b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-BASELINE-NEXT: xorb %bpl, %cl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: xorb %bpl, %cl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: xorb %bl, %bpl +; CHECK-BASELINE-NEXT: xorb %r13b, %bpl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-BASELINE-NEXT: xorb %bl, %bpl -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: xorb %r13b, %bpl +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; CHECK-BASELINE-NEXT: xorb %r15b, %r13b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: xorb %r15b, %r13b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: xorb %al, %r15b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: xorb %al, %r15b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: xorb %r11b, %al ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), 
%al -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-BASELINE-NEXT: xorb %sil, %cl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb %sil, %cl -; CHECK-BASELINE-NEXT: movb %cl, 15(%rdx) -; CHECK-BASELINE-NEXT: movb %al, 14(%rdx) -; CHECK-BASELINE-NEXT: movb %bl, 13(%rdx) -; CHECK-BASELINE-NEXT: movb %bpl, 12(%rdx) -; CHECK-BASELINE-NEXT: movb %r14b, 11(%rdx) -; CHECK-BASELINE-NEXT: movb %r15b, 10(%rdx) -; CHECK-BASELINE-NEXT: movb %r12b, 9(%rdx) -; CHECK-BASELINE-NEXT: movb %r13b, 8(%rdx) -; CHECK-BASELINE-NEXT: movb %r11b, 7(%rdx) -; CHECK-BASELINE-NEXT: movb %r10b, 6(%rdx) -; CHECK-BASELINE-NEXT: movb %dil, 5(%rdx) -; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdx) +; CHECK-BASELINE-NEXT: xorb %r11b, %al +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: movb %r11b, 15(%rdi) +; CHECK-BASELINE-NEXT: movb %al, 14(%rdi) +; CHECK-BASELINE-NEXT: movb %r15b, 13(%rdi) +; CHECK-BASELINE-NEXT: movb %r13b, 12(%rdi) +; CHECK-BASELINE-NEXT: movb %bpl, 11(%rdi) +; CHECK-BASELINE-NEXT: movb %cl, 10(%rdi) +; CHECK-BASELINE-NEXT: movb %dl, 9(%rdi) +; CHECK-BASELINE-NEXT: movb %r12b, 8(%rdi) +; CHECK-BASELINE-NEXT: movb %r14b, 7(%rdi) +; CHECK-BASELINE-NEXT: movb %bl, 6(%rdi) +; CHECK-BASELINE-NEXT: movb %r10b, 5(%rdi) +; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi) ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: xorb %al, %r8b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdx) +; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdi) ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, 2(%rdx) +; CHECK-BASELINE-NEXT: movb %cl, 2(%rdi) ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, 1(%rdx) +; CHECK-BASELINE-NEXT: movb %cl, 1(%rdi) ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, (%rdx) -; CHECK-BASELINE-NEXT: movq %rdx, %rax +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: movb %sil, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -2869,103 +2857,99 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movq %rdi, %rdx -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; 
CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: xorb %dil, %r9b +; CHECK-SSE1-NEXT: xorb %r10b, %r9b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b -; CHECK-SSE1-NEXT: xorb %dil, %r9b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: xorb %r10b, %dil -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dil -; CHECK-SSE1-NEXT: xorb %r10b, %dil +; CHECK-SSE1-NEXT: xorb %r10b, %r9b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: xorb %r11b, %r10b +; CHECK-SSE1-NEXT: xorb %bl, %r10b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: xorb %r11b, %r10b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorb %r13b, %r11b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: xorb %r13b, %r11b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-SSE1-NEXT: xorb %r12b, %r13b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: xorb %r12b, %r13b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-SSE1-NEXT: xorb %r15b, %r12b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b -; CHECK-SSE1-NEXT: xorb %r15b, %r12b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: xorb %r14b, %r15b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-SSE1-NEXT: xorb %r14b, %r15b +; CHECK-SSE1-NEXT: xorb %bl, %r10b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: xorb %r14b, %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %r14b, %bl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: xorb %bpl, %r14b +; CHECK-SSE1-NEXT: xorb %r12b, %r14b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-SSE1-NEXT: xorb %bpl, %r14b +; CHECK-SSE1-NEXT: xorb %r12b, %r14b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; CHECK-SSE1-NEXT: xorb %cl, %r12b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-SSE1-NEXT: xorb %cl, %r12b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-SSE1-NEXT: xorb %bpl, %cl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: xorb %bpl, %cl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: xorb %bl, %bpl +; CHECK-SSE1-NEXT: xorb %r13b, %bpl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-SSE1-NEXT: xorb %bl, %bpl -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: xorb %r13b, %bpl +; 
CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; CHECK-SSE1-NEXT: xorb %r15b, %r13b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: xorb %r15b, %r13b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: xorb %al, %r15b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: xorb %al, %r15b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: xorb %r11b, %al ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-SSE1-NEXT: xorb %sil, %cl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb %sil, %cl -; CHECK-SSE1-NEXT: movb %cl, 15(%rdx) -; CHECK-SSE1-NEXT: movb %al, 14(%rdx) -; CHECK-SSE1-NEXT: movb %bl, 13(%rdx) -; CHECK-SSE1-NEXT: movb %bpl, 12(%rdx) -; CHECK-SSE1-NEXT: movb %r14b, 11(%rdx) -; CHECK-SSE1-NEXT: movb %r15b, 10(%rdx) -; CHECK-SSE1-NEXT: movb %r12b, 9(%rdx) -; CHECK-SSE1-NEXT: movb %r13b, 8(%rdx) -; CHECK-SSE1-NEXT: movb %r11b, 7(%rdx) -; CHECK-SSE1-NEXT: movb %r10b, 6(%rdx) -; CHECK-SSE1-NEXT: movb %dil, 5(%rdx) -; CHECK-SSE1-NEXT: movb %r9b, 4(%rdx) +; CHECK-SSE1-NEXT: xorb %r11b, %al +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: movb %r11b, 15(%rdi) +; CHECK-SSE1-NEXT: movb %al, 14(%rdi) +; CHECK-SSE1-NEXT: movb %r15b, 13(%rdi) +; CHECK-SSE1-NEXT: movb %r13b, 12(%rdi) +; CHECK-SSE1-NEXT: movb %bpl, 11(%rdi) +; CHECK-SSE1-NEXT: movb %cl, 10(%rdi) +; CHECK-SSE1-NEXT: movb %dl, 9(%rdi) +; CHECK-SSE1-NEXT: movb %r12b, 8(%rdi) +; CHECK-SSE1-NEXT: movb %r14b, 7(%rdi) +; CHECK-SSE1-NEXT: movb %bl, 6(%rdi) +; CHECK-SSE1-NEXT: movb %r10b, 5(%rdi) +; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi) ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: xorb %al, %r8b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: movb %r8b, 3(%rdx) +; CHECK-SSE1-NEXT: movb %r8b, 3(%rdi) ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, 2(%rdx) +; CHECK-SSE1-NEXT: movb %cl, 2(%rdi) ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, 1(%rdx) +; CHECK-SSE1-NEXT: movb %cl, 1(%rdi) ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, (%rdx) -; CHECK-SSE1-NEXT: movq %rdx, %rax +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: movb %sil, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -3117,11 +3101,11 @@ define <4 x i32> @in_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movl 12(%rdx), %edi ; CHECK-BASELINE-NEXT: movl 
8(%rdx), %r8d ; CHECK-BASELINE-NEXT: movl (%rdx), %r9d -; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d -; CHECK-BASELINE-NEXT: movl (%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx +; CHECK-BASELINE-NEXT: movl (%rsi), %r10d +; CHECK-BASELINE-NEXT: xorl %r9d, %r10d ; CHECK-BASELINE-NEXT: movl 4(%rsi), %r11d -; CHECK-BASELINE-NEXT: xorl %r10d, %r11d +; CHECK-BASELINE-NEXT: xorl %edx, %r11d ; CHECK-BASELINE-NEXT: movl 8(%rsi), %ebx ; CHECK-BASELINE-NEXT: xorl %r8d, %ebx ; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi @@ -3129,15 +3113,15 @@ define <4 x i32> @in_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi ; CHECK-BASELINE-NEXT: andl 8(%rcx), %ebx ; CHECK-BASELINE-NEXT: andl 4(%rcx), %r11d -; CHECK-BASELINE-NEXT: andl (%rcx), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx -; CHECK-BASELINE-NEXT: xorl %r10d, %r11d +; CHECK-BASELINE-NEXT: andl (%rcx), %r10d +; CHECK-BASELINE-NEXT: xorl %r9d, %r10d +; CHECK-BASELINE-NEXT: xorl %edx, %r11d ; CHECK-BASELINE-NEXT: xorl %r8d, %ebx ; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: movl %esi, 12(%rax) ; CHECK-BASELINE-NEXT: movl %ebx, 8(%rax) ; CHECK-BASELINE-NEXT: movl %r11d, 4(%rax) -; CHECK-BASELINE-NEXT: movl %edx, (%rax) +; CHECK-BASELINE-NEXT: movl %r10d, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: retq ; @@ -3231,10 +3215,8 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r12 -; CHECK-BASELINE-NEXT: movq %rdx, %r15 -; CHECK-BASELINE-NEXT: movq %rsi, %r14 -; CHECK-BASELINE-NEXT: movq %rdi, %r13 +; CHECK-BASELINE-NEXT: movq %rcx, %r8 +; CHECK-BASELINE-NEXT: movq %rdi, %r9 ; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax @@ -3247,241 +3229,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r8d -; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %edi -; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi -; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r14d +; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r12d +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %r11d +; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %r10d ; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %ecx -; CHECK-BASELINE-NEXT: movzbl (%rdx), %r11d -; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %edx -; CHECK-BASELINE-NEXT: movzbl (%r14), %ebx -; CHECK-BASELINE-NEXT: xorb %r11b, %bl -; CHECK-BASELINE-NEXT: andb (%r12), %bl -; CHECK-BASELINE-NEXT: xorb %r11b, %bl +; CHECK-BASELINE-NEXT: movzbl (%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl (%rsi), %ebx +; CHECK-BASELINE-NEXT: xorb %dil, %bl +; CHECK-BASELINE-NEXT: andb (%r8), %bl +; CHECK-BASELINE-NEXT: xorb %dil, %bl ; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 1(%r14), 
%r11d -; CHECK-BASELINE-NEXT: xorb %dl, %r11b -; CHECK-BASELINE-NEXT: andb 1(%r12), %r11b -; CHECK-BASELINE-NEXT: xorb %dl, %r11b -; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 2(%r14), %edx -; CHECK-BASELINE-NEXT: xorb %cl, %dl -; CHECK-BASELINE-NEXT: andb 2(%r12), %dl -; CHECK-BASELINE-NEXT: xorb %cl, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 3(%r14), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 3(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 4(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %sil, %al -; CHECK-BASELINE-NEXT: andb 4(%r12), %al -; CHECK-BASELINE-NEXT: xorb %sil, %al +; CHECK-BASELINE-NEXT: movzbl 1(%rsi), %edi +; CHECK-BASELINE-NEXT: xorb %al, %dil +; CHECK-BASELINE-NEXT: andb 1(%r8), %dil +; CHECK-BASELINE-NEXT: xorb %al, %dil +; CHECK-BASELINE-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 2(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 2(%r8), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 5(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %dil, %al -; CHECK-BASELINE-NEXT: andb 5(%r12), %al -; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movzbl 3(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: andb 3(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 4(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r11b, %al +; CHECK-BASELINE-NEXT: andb 4(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r11b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 5(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: andb 5(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 6(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: andb 6(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 7(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: andb 7(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 8(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: andb 8(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r14b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax +; CHECK-BASELINE-NEXT: movzbl 9(%rsi), %eax ; CHECK-BASELINE-NEXT: xorb %bpl, %al -; CHECK-BASELINE-NEXT: andb 6(%r12), %al +; CHECK-BASELINE-NEXT: andb 9(%r8), %al ; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 7(%r12), %al -; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: movzbl 10(%rsi), %eax +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; 
CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 10(%r8), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r9b, %al -; CHECK-BASELINE-NEXT: andb 8(%r12), %al -; CHECK-BASELINE-NEXT: xorb %r9b, %al +; CHECK-BASELINE-NEXT: movzbl 11(%rsi), %eax +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 11(%r8), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r8b, %al -; CHECK-BASELINE-NEXT: andb 9(%r12), %al -; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: movzbl 12(%rsi), %eax +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 12(%r8), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 10(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 10(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 11(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 11(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 12(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 12(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 13(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 13(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 14(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 14(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 15(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 15(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 16(%r14), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 16(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 17(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 17(%r14), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 
17(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 18(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 18(%r14), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 18(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 19(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 19(%r14), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 19(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 20(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 20(%r14), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 20(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 21(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 21(%r14), %ebp -; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: andb 21(%r12), %bpl -; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: movzbl 22(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb 22(%r12), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: movzbl 23(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 23(%r14), %r11d -; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: andb 23(%r12), %r11b -; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: movzbl 24(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 24(%r14), %r9d -; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: andb 24(%r12), %r9b -; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: movzbl 25(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 25(%r14), %r8d -; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: andb 25(%r12), %r8b -; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: movzbl 26(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 26(%r14), %edi +; CHECK-BASELINE-NEXT: movzbl 13(%rsi), %eax +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 13(%r8), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 14(%rsi), %eax +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 14(%r8), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 15(%rsi), %eax +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 15(%r8), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 16(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 16(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 17(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: 
andb 17(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 18(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 18(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 19(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 19(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 19(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 20(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 20(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 20(%r8), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 21(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 21(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorb %dil, %r13b +; CHECK-BASELINE-NEXT: andb 21(%r8), %r13b +; CHECK-BASELINE-NEXT: xorb %dil, %r13b +; CHECK-BASELINE-NEXT: movzbl 22(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 22(%rsi), %r12d +; CHECK-BASELINE-NEXT: xorb %dil, %r12b +; CHECK-BASELINE-NEXT: andb 22(%r8), %r12b +; CHECK-BASELINE-NEXT: xorb %dil, %r12b +; CHECK-BASELINE-NEXT: movzbl 23(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 23(%rsi), %r14d +; CHECK-BASELINE-NEXT: xorb %dil, %r14b +; CHECK-BASELINE-NEXT: andb 23(%r8), %r14b +; CHECK-BASELINE-NEXT: xorb %dil, %r14b +; CHECK-BASELINE-NEXT: movzbl 24(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 24(%rsi), %ebp +; CHECK-BASELINE-NEXT: xorb %dil, %bpl +; CHECK-BASELINE-NEXT: andb 24(%r8), %bpl +; CHECK-BASELINE-NEXT: xorb %dil, %bpl +; CHECK-BASELINE-NEXT: movzbl 25(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 25(%rsi), %ebx +; CHECK-BASELINE-NEXT: xorb %dil, %bl +; CHECK-BASELINE-NEXT: andb 25(%r8), %bl +; CHECK-BASELINE-NEXT: xorb %dil, %bl +; CHECK-BASELINE-NEXT: movzbl 26(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 26(%rsi), %r11d +; CHECK-BASELINE-NEXT: xorb %dil, %r11b +; CHECK-BASELINE-NEXT: andb 26(%r8), %r11b +; CHECK-BASELINE-NEXT: xorb %dil, %r11b +; CHECK-BASELINE-NEXT: movzbl 27(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 27(%rsi), %r10d +; CHECK-BASELINE-NEXT: xorb %al, %r10b +; CHECK-BASELINE-NEXT: andb 27(%r8), %r10b +; CHECK-BASELINE-NEXT: xorb %al, %r10b +; CHECK-BASELINE-NEXT: movzbl 28(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 28(%rsi), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: andb 26(%r12), %dil +; CHECK-BASELINE-NEXT: andb 28(%r8), %dil ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: movzbl 27(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 27(%r14), %esi -; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: andb 27(%r12), %sil -; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: movzbl 28(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 28(%r14), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 28(%r12), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movzbl 29(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 29(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl 29(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 29(%rsi), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 29(%r12), %cl +; CHECK-BASELINE-NEXT: andb 29(%r8), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; 
CHECK-BASELINE-NEXT: movzbl 30(%r15), %r10d -; CHECK-BASELINE-NEXT: movzbl 30(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 30(%r12), %al -; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: movzbl 31(%r15), %r10d -; CHECK-BASELINE-NEXT: movzbl 31(%r14), %r14d -; CHECK-BASELINE-NEXT: xorb %r10b, %r14b -; CHECK-BASELINE-NEXT: andb 31(%r12), %r14b -; CHECK-BASELINE-NEXT: xorb %r10b, %r14b -; CHECK-BASELINE-NEXT: movb %r14b, 31(%r13) -; CHECK-BASELINE-NEXT: movb %al, 30(%r13) -; CHECK-BASELINE-NEXT: movb %cl, 29(%r13) -; CHECK-BASELINE-NEXT: movb %dl, 28(%r13) -; CHECK-BASELINE-NEXT: movb %sil, 27(%r13) -; CHECK-BASELINE-NEXT: movb %dil, 26(%r13) -; CHECK-BASELINE-NEXT: movb %r8b, 25(%r13) -; CHECK-BASELINE-NEXT: movb %r9b, 24(%r13) -; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13) -; CHECK-BASELINE-NEXT: movb %bl, 22(%r13) -; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13) +; CHECK-BASELINE-NEXT: movzbl 30(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzbl 30(%rsi), %eax +; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: andb 30(%r8), %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: movzbl 31(%rdx), %edx +; CHECK-BASELINE-NEXT: movzbl 31(%rsi), %esi +; CHECK-BASELINE-NEXT: xorb %dl, %sil +; CHECK-BASELINE-NEXT: andb 31(%r8), %sil +; CHECK-BASELINE-NEXT: xorb %dl, %sil +; CHECK-BASELINE-NEXT: movb %sil, 31(%r9) +; CHECK-BASELINE-NEXT: movb %al, 30(%r9) +; CHECK-BASELINE-NEXT: movb %cl, 29(%r9) +; CHECK-BASELINE-NEXT: movb %dil, 28(%r9) +; CHECK-BASELINE-NEXT: movb %r10b, 27(%r9) +; CHECK-BASELINE-NEXT: movb %r11b, 26(%r9) +; CHECK-BASELINE-NEXT: movb %bl, 25(%r9) +; CHECK-BASELINE-NEXT: movb %bpl, 24(%r9) +; CHECK-BASELINE-NEXT: movb %r14b, 23(%r9) +; CHECK-BASELINE-NEXT: movb %r12b, 22(%r9) +; CHECK-BASELINE-NEXT: movb %r13b, 21(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 20(%r13) +; CHECK-BASELINE-NEXT: movb %al, 20(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 19(%r13) +; CHECK-BASELINE-NEXT: movb %al, 19(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 18(%r13) +; CHECK-BASELINE-NEXT: movb %al, 18(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 17(%r13) +; CHECK-BASELINE-NEXT: movb %al, 17(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 16(%r13) +; CHECK-BASELINE-NEXT: movb %al, 16(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 15(%r13) +; CHECK-BASELINE-NEXT: movb %al, 15(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 14(%r13) +; CHECK-BASELINE-NEXT: movb %al, 14(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 13(%r13) +; CHECK-BASELINE-NEXT: movb %al, 13(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 12(%r13) +; CHECK-BASELINE-NEXT: movb %al, 12(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 11(%r13) +; CHECK-BASELINE-NEXT: movb %al, 11(%r9) ; CHECK-BASELINE-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 10(%r13) +; CHECK-BASELINE-NEXT: movb %al, 10(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 9(%r13) +; CHECK-BASELINE-NEXT: movb %al, 9(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 8(%r13) +; CHECK-BASELINE-NEXT: movb %al, 8(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 7(%r13) +; CHECK-BASELINE-NEXT: movb %al, 7(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 6(%r13) +; CHECK-BASELINE-NEXT: movb %al, 6(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 5(%r13) +; CHECK-BASELINE-NEXT: movb %al, 5(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 4(%r13) +; CHECK-BASELINE-NEXT: movb %al, 4(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 3(%r13) +; CHECK-BASELINE-NEXT: movb %al, 3(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 2(%r13) +; CHECK-BASELINE-NEXT: movb %al, 2(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 1(%r13) +; CHECK-BASELINE-NEXT: movb %al, 1(%r9) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, (%r13) -; CHECK-BASELINE-NEXT: movq %r13, %rax +; CHECK-BASELINE-NEXT: movb %al, (%r9) +; CHECK-BASELINE-NEXT: movq %r9, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -3498,10 +3480,8 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rcx, %r12 -; CHECK-SSE1-NEXT: movq %rdx, %r15 -; CHECK-SSE1-NEXT: movq %rsi, %r14 -; CHECK-SSE1-NEXT: movq %rdi, %r13 +; CHECK-SSE1-NEXT: movq %rcx, %r8 +; CHECK-SSE1-NEXT: movq %rdi, %r9 ; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax @@ -3514,241 +3494,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d -; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d -; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 5(%rdx), %edi -; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi -; CHECK-SSE1-NEXT: movzbl 3(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r14d +; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r15d +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r12d +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %r13d +; CHECK-SSE1-NEXT: movzbl 4(%rdx), %r11d +; CHECK-SSE1-NEXT: movzbl 3(%rdx), %r10d ; CHECK-SSE1-NEXT: movzbl 2(%rdx), %ecx -; CHECK-SSE1-NEXT: movzbl (%rdx), %r11d -; CHECK-SSE1-NEXT: movzbl 1(%rdx), %edx -; CHECK-SSE1-NEXT: movzbl (%r14), %ebx -; 
CHECK-SSE1-NEXT: xorb %r11b, %bl -; CHECK-SSE1-NEXT: andb (%r12), %bl -; CHECK-SSE1-NEXT: xorb %r11b, %bl +; CHECK-SSE1-NEXT: movzbl (%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 1(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl (%rsi), %ebx +; CHECK-SSE1-NEXT: xorb %dil, %bl +; CHECK-SSE1-NEXT: andb (%r8), %bl +; CHECK-SSE1-NEXT: xorb %dil, %bl ; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 1(%r14), %r11d -; CHECK-SSE1-NEXT: xorb %dl, %r11b -; CHECK-SSE1-NEXT: andb 1(%r12), %r11b -; CHECK-SSE1-NEXT: xorb %dl, %r11b -; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 2(%r14), %edx -; CHECK-SSE1-NEXT: xorb %cl, %dl -; CHECK-SSE1-NEXT: andb 2(%r12), %dl -; CHECK-SSE1-NEXT: xorb %cl, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 3(%r14), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 3(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 4(%r14), %eax -; CHECK-SSE1-NEXT: xorb %sil, %al -; CHECK-SSE1-NEXT: andb 4(%r12), %al -; CHECK-SSE1-NEXT: xorb %sil, %al +; CHECK-SSE1-NEXT: movzbl 1(%rsi), %edi +; CHECK-SSE1-NEXT: xorb %al, %dil +; CHECK-SSE1-NEXT: andb 1(%r8), %dil +; CHECK-SSE1-NEXT: xorb %al, %dil +; CHECK-SSE1-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 2(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 2(%r8), %al +; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 5(%r14), %eax -; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: andb 5(%r12), %al -; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movzbl 3(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: andb 3(%r8), %al +; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 4(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r11b, %al +; CHECK-SSE1-NEXT: andb 4(%r8), %al +; CHECK-SSE1-NEXT: xorb %r11b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 5(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r13b, %al +; CHECK-SSE1-NEXT: andb 5(%r8), %al +; CHECK-SSE1-NEXT: xorb %r13b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 6(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: andb 6(%r8), %al +; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 7(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: andb 7(%r8), %al +; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 8(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: andb 8(%r8), %al +; CHECK-SSE1-NEXT: xorb %r14b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax +; CHECK-SSE1-NEXT: movzbl 9(%rsi), %eax ; CHECK-SSE1-NEXT: xorb %bpl, %al -; CHECK-SSE1-NEXT: andb 6(%r12), %al +; CHECK-SSE1-NEXT: andb 9(%r8), %al ; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax -; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: andb 7(%r12), %al -; CHECK-SSE1-NEXT: xorb %r10b, 
%al +; CHECK-SSE1-NEXT: movzbl 10(%rsi), %eax +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 10(%r8), %al +; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax -; CHECK-SSE1-NEXT: xorb %r9b, %al -; CHECK-SSE1-NEXT: andb 8(%r12), %al -; CHECK-SSE1-NEXT: xorb %r9b, %al +; CHECK-SSE1-NEXT: movzbl 11(%rsi), %eax +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 11(%r8), %al +; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%r14), %eax -; CHECK-SSE1-NEXT: xorb %r8b, %al -; CHECK-SSE1-NEXT: andb 9(%r12), %al -; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: movzbl 12(%rsi), %eax +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 12(%r8), %al +; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 10(%r14), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 10(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 11(%r14), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 11(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 12(%r14), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 12(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 13(%r14), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 13(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 14(%r14), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 14(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 15(%r14), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 15(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 16(%r14), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 16(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 17(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 17(%r14), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 17(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 18(%r15), 
%eax -; CHECK-SSE1-NEXT: movzbl 18(%r14), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 18(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 19(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 19(%r14), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 19(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 20(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 20(%r14), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 20(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 21(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 21(%r14), %ebp -; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: andb 21(%r12), %bpl -; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: movzbl 22(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb 22(%r12), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: movzbl 23(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 23(%r14), %r11d -; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: andb 23(%r12), %r11b -; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: movzbl 24(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 24(%r14), %r9d -; CHECK-SSE1-NEXT: xorb %al, %r9b -; CHECK-SSE1-NEXT: andb 24(%r12), %r9b -; CHECK-SSE1-NEXT: xorb %al, %r9b -; CHECK-SSE1-NEXT: movzbl 25(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 25(%r14), %r8d -; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: andb 25(%r12), %r8b -; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: movzbl 26(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 26(%r14), %edi +; CHECK-SSE1-NEXT: movzbl 13(%rsi), %eax +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 13(%r8), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 14(%rsi), %eax +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 14(%r8), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 15(%rsi), %eax +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 15(%r8), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 16(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 16(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 16(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 17(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 17(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 17(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 18(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 18(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 18(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 19(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 
19(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 19(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 20(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 20(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 20(%r8), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 21(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 21(%rsi), %r13d +; CHECK-SSE1-NEXT: xorb %dil, %r13b +; CHECK-SSE1-NEXT: andb 21(%r8), %r13b +; CHECK-SSE1-NEXT: xorb %dil, %r13b +; CHECK-SSE1-NEXT: movzbl 22(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 22(%rsi), %r12d +; CHECK-SSE1-NEXT: xorb %dil, %r12b +; CHECK-SSE1-NEXT: andb 22(%r8), %r12b +; CHECK-SSE1-NEXT: xorb %dil, %r12b +; CHECK-SSE1-NEXT: movzbl 23(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 23(%rsi), %r14d +; CHECK-SSE1-NEXT: xorb %dil, %r14b +; CHECK-SSE1-NEXT: andb 23(%r8), %r14b +; CHECK-SSE1-NEXT: xorb %dil, %r14b +; CHECK-SSE1-NEXT: movzbl 24(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 24(%rsi), %ebp +; CHECK-SSE1-NEXT: xorb %dil, %bpl +; CHECK-SSE1-NEXT: andb 24(%r8), %bpl +; CHECK-SSE1-NEXT: xorb %dil, %bpl +; CHECK-SSE1-NEXT: movzbl 25(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 25(%rsi), %ebx +; CHECK-SSE1-NEXT: xorb %dil, %bl +; CHECK-SSE1-NEXT: andb 25(%r8), %bl +; CHECK-SSE1-NEXT: xorb %dil, %bl +; CHECK-SSE1-NEXT: movzbl 26(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 26(%rsi), %r11d +; CHECK-SSE1-NEXT: xorb %dil, %r11b +; CHECK-SSE1-NEXT: andb 26(%r8), %r11b +; CHECK-SSE1-NEXT: xorb %dil, %r11b +; CHECK-SSE1-NEXT: movzbl 27(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 27(%rsi), %r10d +; CHECK-SSE1-NEXT: xorb %al, %r10b +; CHECK-SSE1-NEXT: andb 27(%r8), %r10b +; CHECK-SSE1-NEXT: xorb %al, %r10b +; CHECK-SSE1-NEXT: movzbl 28(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 28(%rsi), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: andb 26(%r12), %dil +; CHECK-SSE1-NEXT: andb 28(%r8), %dil ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: movzbl 27(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 27(%r14), %esi -; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: andb 27(%r12), %sil -; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: movzbl 28(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 28(%r14), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 28(%r12), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movzbl 29(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 29(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl 29(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 29(%rsi), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 29(%r12), %cl +; CHECK-SSE1-NEXT: andb 29(%r8), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl 30(%r15), %r10d -; CHECK-SSE1-NEXT: movzbl 30(%r14), %eax -; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: andb 30(%r12), %al -; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: movzbl 31(%r15), %r10d -; CHECK-SSE1-NEXT: movzbl 31(%r14), %r14d -; CHECK-SSE1-NEXT: xorb %r10b, %r14b -; CHECK-SSE1-NEXT: andb 31(%r12), %r14b -; CHECK-SSE1-NEXT: xorb %r10b, %r14b -; CHECK-SSE1-NEXT: movb %r14b, 31(%r13) -; CHECK-SSE1-NEXT: movb %al, 30(%r13) -; CHECK-SSE1-NEXT: movb %cl, 29(%r13) -; CHECK-SSE1-NEXT: movb %dl, 28(%r13) -; CHECK-SSE1-NEXT: movb %sil, 27(%r13) -; CHECK-SSE1-NEXT: movb %dil, 26(%r13) -; CHECK-SSE1-NEXT: movb %r8b, 25(%r13) -; CHECK-SSE1-NEXT: movb %r9b, 24(%r13) -; CHECK-SSE1-NEXT: movb %r11b, 23(%r13) -; CHECK-SSE1-NEXT: movb %bl, 
22(%r13) -; CHECK-SSE1-NEXT: movb %bpl, 21(%r13) +; CHECK-SSE1-NEXT: movzbl 30(%rdx), %r15d +; CHECK-SSE1-NEXT: movzbl 30(%rsi), %eax +; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: andb 30(%r8), %al +; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: movzbl 31(%rdx), %edx +; CHECK-SSE1-NEXT: movzbl 31(%rsi), %esi +; CHECK-SSE1-NEXT: xorb %dl, %sil +; CHECK-SSE1-NEXT: andb 31(%r8), %sil +; CHECK-SSE1-NEXT: xorb %dl, %sil +; CHECK-SSE1-NEXT: movb %sil, 31(%r9) +; CHECK-SSE1-NEXT: movb %al, 30(%r9) +; CHECK-SSE1-NEXT: movb %cl, 29(%r9) +; CHECK-SSE1-NEXT: movb %dil, 28(%r9) +; CHECK-SSE1-NEXT: movb %r10b, 27(%r9) +; CHECK-SSE1-NEXT: movb %r11b, 26(%r9) +; CHECK-SSE1-NEXT: movb %bl, 25(%r9) +; CHECK-SSE1-NEXT: movb %bpl, 24(%r9) +; CHECK-SSE1-NEXT: movb %r14b, 23(%r9) +; CHECK-SSE1-NEXT: movb %r12b, 22(%r9) +; CHECK-SSE1-NEXT: movb %r13b, 21(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 20(%r13) +; CHECK-SSE1-NEXT: movb %al, 20(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 19(%r13) +; CHECK-SSE1-NEXT: movb %al, 19(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 18(%r13) +; CHECK-SSE1-NEXT: movb %al, 18(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 17(%r13) +; CHECK-SSE1-NEXT: movb %al, 17(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 16(%r13) +; CHECK-SSE1-NEXT: movb %al, 16(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 15(%r13) +; CHECK-SSE1-NEXT: movb %al, 15(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 14(%r13) +; CHECK-SSE1-NEXT: movb %al, 14(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 13(%r13) +; CHECK-SSE1-NEXT: movb %al, 13(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 12(%r13) +; CHECK-SSE1-NEXT: movb %al, 12(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 11(%r13) +; CHECK-SSE1-NEXT: movb %al, 11(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 10(%r13) +; CHECK-SSE1-NEXT: movb %al, 10(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 9(%r13) +; CHECK-SSE1-NEXT: movb %al, 9(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 8(%r13) +; CHECK-SSE1-NEXT: movb %al, 8(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 7(%r13) +; CHECK-SSE1-NEXT: movb %al, 7(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 6(%r13) +; CHECK-SSE1-NEXT: movb %al, 6(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 5(%r13) +; CHECK-SSE1-NEXT: movb %al, 5(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 4(%r13) +; CHECK-SSE1-NEXT: movb %al, 4(%r9) ; CHECK-SSE1-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 3(%r13) +; CHECK-SSE1-NEXT: movb %al, 3(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 2(%r13) +; CHECK-SSE1-NEXT: movb %al, 2(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 1(%r13) +; CHECK-SSE1-NEXT: movb %al, 1(%r9) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, (%r13) -; CHECK-SSE1-NEXT: movq %r13, %rax +; CHECK-SSE1-NEXT: movb %al, (%r9) +; CHECK-SSE1-NEXT: movq %r9, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -3795,140 +3775,139 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r9 -; CHECK-BASELINE-NEXT: movq %rdi, %r10 +; CHECK-BASELINE-NEXT: movq %rdi, %r8 ; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edi ; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 28(%rdx), %edi ; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %edi -; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 24(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 20(%rdx), %r8d -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r11d +; CHECK-BASELINE-NEXT: movl 20(%rdx), %eax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r10d +; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 16(%rdx), %r11d ; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 16(%rdx), %ebx -; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp ; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 12(%rdx), %r14d ; CHECK-BASELINE-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r15d ; CHECK-BASELINE-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 8(%rdx), %r12d -; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebx +; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r13d ; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl (%rdx), %ecx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 4(%rdx), %edi -; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %eax +; CHECK-BASELINE-NEXT: movl (%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
CHECK-BASELINE-NEXT: movl 4(%rdx), %r9d +; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %edi +; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl (%rsi), %edx -; CHECK-BASELINE-NEXT: xorw %cx, %dx +; CHECK-BASELINE-NEXT: xorw %ax, %dx ; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %ecx -; CHECK-BASELINE-NEXT: xorw %ax, %cx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax +; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %di, %ax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %ecx -; CHECK-BASELINE-NEXT: xorw %r13w, %cx -; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r12w, %ax +; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r9w, %ax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r15w, %ax -; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %edx -; CHECK-BASELINE-NEXT: xorw %r14w, %dx +; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r13w, %ax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %r12d +; CHECK-BASELINE-NEXT: xorw %bx, %r12w +; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %edx +; CHECK-BASELINE-NEXT: xorw %r15w, %dx +; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r14w, %ax ; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %r13d ; CHECK-BASELINE-NEXT: xorw %bp, %r13w -; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %r12d -; CHECK-BASELINE-NEXT: xorw %bx, %r12w -; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %r15d +; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %r15d ; CHECK-BASELINE-NEXT: xorw %r11w, %r15w -; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %r14d -; CHECK-BASELINE-NEXT: xorw %r8w, %r14w -; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %ebp +; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %r14d +; CHECK-BASELINE-NEXT: xorw %r10w, %r14w +; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %ebp ; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload -; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %ebx +; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %ebx ; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload -; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r11d +; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %r11d ; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload -; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %edi +; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r10d +; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload +; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r9d +; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r9w # 2-byte Folded Reload +; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %edi ; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload -; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi -; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload -; CHECK-BASELINE-NEXT: andw 30(%r9), %si -; CHECK-BASELINE-NEXT: andw 28(%r9), %di -; CHECK-BASELINE-NEXT: andw 26(%r9), %r11w -; CHECK-BASELINE-NEXT: andw 24(%r9), %bx -; CHECK-BASELINE-NEXT: andw 22(%r9), %bp -; CHECK-BASELINE-NEXT: andw 20(%r9), %r14w -; CHECK-BASELINE-NEXT: 
andw 18(%r9), %r15w -; CHECK-BASELINE-NEXT: andw 16(%r9), %r12w -; CHECK-BASELINE-NEXT: andw 14(%r9), %r13w -; CHECK-BASELINE-NEXT: andw 12(%r9), %dx -; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: andw 10(%r9), %ax +; CHECK-BASELINE-NEXT: andw 30(%rcx), %di +; CHECK-BASELINE-NEXT: andw 28(%rcx), %r9w +; CHECK-BASELINE-NEXT: andw 26(%rcx), %r10w +; CHECK-BASELINE-NEXT: andw 24(%rcx), %r11w +; CHECK-BASELINE-NEXT: andw 22(%rcx), %bx +; CHECK-BASELINE-NEXT: andw 20(%rcx), %bp +; CHECK-BASELINE-NEXT: andw 18(%rcx), %r14w +; CHECK-BASELINE-NEXT: andw 16(%rcx), %r15w +; CHECK-BASELINE-NEXT: andw 14(%rcx), %r13w +; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: andw 10(%rcx), %dx +; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: andw 8(%rcx), %r12w +; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload -; CHECK-BASELINE-NEXT: andw 8(%r9), %dx -; CHECK-BASELINE-NEXT: andw 6(%r9), %cx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; CHECK-BASELINE-NEXT: andw 4(%r9), %r8w +; CHECK-BASELINE-NEXT: andw 6(%rcx), %dx ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: andw 2(%r9), %ax -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: andw (%r9), %cx -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; CHECK-BASELINE-NEXT: andw 2(%rcx), %si +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Reload +; CHECK-BASELINE-NEXT: andw (%rcx), %r12w +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl %esi, %r12d +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: movl %edx, %ecx -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl 
{{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: movw %si, 30(%r10) -; CHECK-BASELINE-NEXT: movw %di, 28(%r10) -; CHECK-BASELINE-NEXT: movw %r11w, 26(%r10) -; CHECK-BASELINE-NEXT: movw %bx, 24(%r10) -; CHECK-BASELINE-NEXT: movw %bp, 22(%r10) -; CHECK-BASELINE-NEXT: movw %r14w, 20(%r10) -; CHECK-BASELINE-NEXT: movw %r15w, 18(%r10) -; CHECK-BASELINE-NEXT: movw %r12w, 16(%r10) -; CHECK-BASELINE-NEXT: movw %r13w, 14(%r10) -; CHECK-BASELINE-NEXT: movw %ax, 12(%r10) -; CHECK-BASELINE-NEXT: movw %dx, 10(%r10) -; CHECK-BASELINE-NEXT: movw %cx, 8(%r10) -; CHECK-BASELINE-NEXT: movw %r9w, 6(%r10) -; CHECK-BASELINE-NEXT: movw %r8w, 4(%r10) +; CHECK-BASELINE-NEXT: movw %di, 30(%r8) +; CHECK-BASELINE-NEXT: movw %r9w, 28(%r8) +; CHECK-BASELINE-NEXT: movw %r10w, 26(%r8) +; CHECK-BASELINE-NEXT: movw %r11w, 24(%r8) +; CHECK-BASELINE-NEXT: movw %bx, 22(%r8) +; CHECK-BASELINE-NEXT: movw %bp, 20(%r8) +; CHECK-BASELINE-NEXT: movw %r14w, 18(%r8) +; CHECK-BASELINE-NEXT: movw %r15w, 16(%r8) +; CHECK-BASELINE-NEXT: movw %r13w, 14(%r8) +; CHECK-BASELINE-NEXT: movw %si, 12(%r8) +; CHECK-BASELINE-NEXT: movw %ax, 10(%r8) +; CHECK-BASELINE-NEXT: movw %cx, 8(%r8) +; CHECK-BASELINE-NEXT: movw %dx, 6(%r8) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 2(%r10) +; CHECK-BASELINE-NEXT: movw %ax, 4(%r8) +; CHECK-BASELINE-NEXT: movw %r12w, 2(%r8) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, (%r10) -; CHECK-BASELINE-NEXT: movq %r10, %rax +; CHECK-BASELINE-NEXT: movw %ax, (%r8) +; CHECK-BASELINE-NEXT: movq %r8, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -3945,140 +3924,139 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rcx, %r9 -; CHECK-SSE1-NEXT: movq %rdi, %r10 +; CHECK-SSE1-NEXT: movq %rdi, %r8 ; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edi ; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 28(%rdx), %edi ; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 26(%rdx), %edi -; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 24(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 22(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; 
CHECK-SSE1-NEXT: movl 20(%rdx), %r8d -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r11d +; CHECK-SSE1-NEXT: movl 20(%rdx), %eax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r10d +; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 16(%rdx), %r11d ; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 16(%rdx), %ebx -; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp ; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 12(%rdx), %r14d ; CHECK-SSE1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r15d ; CHECK-SSE1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 8(%rdx), %r12d -; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 8(%rdx), %ebx +; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r13d ; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl (%rdx), %ecx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 4(%rdx), %edi -; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 2(%rdx), %eax +; CHECK-SSE1-NEXT: movl (%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 4(%rdx), %r9d +; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 2(%rdx), %edi +; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl (%rsi), %edx -; CHECK-SSE1-NEXT: xorw %cx, %dx +; CHECK-SSE1-NEXT: xorw %ax, %dx ; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 2(%rsi), %ecx -; CHECK-SSE1-NEXT: xorw %ax, %cx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax +; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %di, %ax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 6(%rsi), %ecx -; CHECK-SSE1-NEXT: xorw %r13w, %cx -; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r12w, %ax +; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r9w, %ax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r13w, %ax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r15w, %ax -; CHECK-SSE1-NEXT: movzwl 12(%rsi), %edx -; CHECK-SSE1-NEXT: xorw %r14w, %dx +; CHECK-SSE1-NEXT: movzwl 8(%rsi), %r12d +; CHECK-SSE1-NEXT: xorw %bx, %r12w +; CHECK-SSE1-NEXT: movzwl 10(%rsi), %edx +; CHECK-SSE1-NEXT: xorw %r15w, %dx +; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r14w, %ax ; CHECK-SSE1-NEXT: movzwl 14(%rsi), %r13d ; CHECK-SSE1-NEXT: xorw %bp, %r13w -; CHECK-SSE1-NEXT: movzwl 16(%rsi), %r12d -; CHECK-SSE1-NEXT: xorw %bx, %r12w -; CHECK-SSE1-NEXT: movzwl 18(%rsi), %r15d +; CHECK-SSE1-NEXT: movzwl 16(%rsi), %r15d ; CHECK-SSE1-NEXT: xorw %r11w, %r15w -; CHECK-SSE1-NEXT: movzwl 20(%rsi), %r14d -; CHECK-SSE1-NEXT: xorw 
%r8w, %r14w -; CHECK-SSE1-NEXT: movzwl 22(%rsi), %ebp +; CHECK-SSE1-NEXT: movzwl 18(%rsi), %r14d +; CHECK-SSE1-NEXT: xorw %r10w, %r14w +; CHECK-SSE1-NEXT: movzwl 20(%rsi), %ebp ; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload -; CHECK-SSE1-NEXT: movzwl 24(%rsi), %ebx +; CHECK-SSE1-NEXT: movzwl 22(%rsi), %ebx ; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload -; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r11d +; CHECK-SSE1-NEXT: movzwl 24(%rsi), %r11d ; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload -; CHECK-SSE1-NEXT: movzwl 28(%rsi), %edi +; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r10d +; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload +; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r9d +; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r9w # 2-byte Folded Reload +; CHECK-SSE1-NEXT: movzwl 30(%rsi), %edi ; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload -; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi -; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload -; CHECK-SSE1-NEXT: andw 30(%r9), %si -; CHECK-SSE1-NEXT: andw 28(%r9), %di -; CHECK-SSE1-NEXT: andw 26(%r9), %r11w -; CHECK-SSE1-NEXT: andw 24(%r9), %bx -; CHECK-SSE1-NEXT: andw 22(%r9), %bp -; CHECK-SSE1-NEXT: andw 20(%r9), %r14w -; CHECK-SSE1-NEXT: andw 18(%r9), %r15w -; CHECK-SSE1-NEXT: andw 16(%r9), %r12w -; CHECK-SSE1-NEXT: andw 14(%r9), %r13w -; CHECK-SSE1-NEXT: andw 12(%r9), %dx -; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: andw 10(%r9), %ax +; CHECK-SSE1-NEXT: andw 30(%rcx), %di +; CHECK-SSE1-NEXT: andw 28(%rcx), %r9w +; CHECK-SSE1-NEXT: andw 26(%rcx), %r10w +; CHECK-SSE1-NEXT: andw 24(%rcx), %r11w +; CHECK-SSE1-NEXT: andw 22(%rcx), %bx +; CHECK-SSE1-NEXT: andw 20(%rcx), %bp +; CHECK-SSE1-NEXT: andw 18(%rcx), %r14w +; CHECK-SSE1-NEXT: andw 16(%rcx), %r15w +; CHECK-SSE1-NEXT: andw 14(%rcx), %r13w +; CHECK-SSE1-NEXT: andw 12(%rcx), %ax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: andw 10(%rcx), %dx +; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: andw 8(%rcx), %r12w +; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload -; CHECK-SSE1-NEXT: andw 8(%r9), %dx -; CHECK-SSE1-NEXT: andw 6(%r9), %cx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; CHECK-SSE1-NEXT: andw 4(%r9), %r8w +; CHECK-SSE1-NEXT: andw 6(%rcx), %dx ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: andw 2(%r9), %ax -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: andw (%r9), %cx -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: andw 4(%rcx), %ax +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; CHECK-SSE1-NEXT: andw 2(%rcx), %si +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Reload +; CHECK-SSE1-NEXT: andw (%rcx), %r12w +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload +; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl %esi, %r12d +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded 
Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload -; CHECK-SSE1-NEXT: movl %edx, %ecx -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload -; CHECK-SSE1-NEXT: movw %si, 30(%r10) -; CHECK-SSE1-NEXT: movw %di, 28(%r10) -; CHECK-SSE1-NEXT: movw %r11w, 26(%r10) -; CHECK-SSE1-NEXT: movw %bx, 24(%r10) -; CHECK-SSE1-NEXT: movw %bp, 22(%r10) -; CHECK-SSE1-NEXT: movw %r14w, 20(%r10) -; CHECK-SSE1-NEXT: movw %r15w, 18(%r10) -; CHECK-SSE1-NEXT: movw %r12w, 16(%r10) -; CHECK-SSE1-NEXT: movw %r13w, 14(%r10) -; CHECK-SSE1-NEXT: movw %ax, 12(%r10) -; CHECK-SSE1-NEXT: movw %dx, 10(%r10) -; CHECK-SSE1-NEXT: movw %cx, 8(%r10) -; CHECK-SSE1-NEXT: movw %r9w, 6(%r10) -; CHECK-SSE1-NEXT: movw %r8w, 4(%r10) +; CHECK-SSE1-NEXT: movw %di, 30(%r8) +; CHECK-SSE1-NEXT: movw %r9w, 28(%r8) +; CHECK-SSE1-NEXT: movw %r10w, 26(%r8) +; CHECK-SSE1-NEXT: movw %r11w, 24(%r8) +; CHECK-SSE1-NEXT: movw %bx, 22(%r8) +; CHECK-SSE1-NEXT: movw %bp, 20(%r8) +; CHECK-SSE1-NEXT: movw %r14w, 18(%r8) +; CHECK-SSE1-NEXT: movw %r15w, 16(%r8) +; CHECK-SSE1-NEXT: movw %r13w, 14(%r8) +; CHECK-SSE1-NEXT: movw %si, 12(%r8) +; CHECK-SSE1-NEXT: movw %ax, 10(%r8) +; CHECK-SSE1-NEXT: movw %cx, 8(%r8) +; CHECK-SSE1-NEXT: movw %dx, 6(%r8) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 2(%r10) +; CHECK-SSE1-NEXT: movw %ax, 4(%r8) +; CHECK-SSE1-NEXT: movw %r12w, 2(%r8) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, (%r10) -; CHECK-SSE1-NEXT: movq %r10, %rax +; CHECK-SSE1-NEXT: movw %ax, (%r8) +; CHECK-SSE1-NEXT: movq %r8, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -4125,57 +4103,59 @@ define <8 x i32> 
@in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movl 28(%rdx), %ebp -; CHECK-BASELINE-NEXT: movl 24(%rdx), %ebx -; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d -; CHECK-BASELINE-NEXT: movl 16(%rdx), %eax +; CHECK-BASELINE-NEXT: movq %rcx, %r8 +; CHECK-BASELINE-NEXT: movl 28(%rdx), %r14d +; CHECK-BASELINE-NEXT: movl 24(%rdx), %ebp +; CHECK-BASELINE-NEXT: movl 20(%rdx), %r11d +; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 16(%rdx), %ecx +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 12(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 12(%rdx), %r12d -; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 8(%rdx), %r14d -; CHECK-BASELINE-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 8(%rdx), %r13d +; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl (%rdx), %r15d -; CHECK-BASELINE-NEXT: movl 4(%rdx), %r13d -; CHECK-BASELINE-NEXT: movl (%rsi), %r8d -; CHECK-BASELINE-NEXT: xorl %r15d, %r8d +; CHECK-BASELINE-NEXT: movl 4(%rdx), %r12d +; CHECK-BASELINE-NEXT: movl (%rsi), %r10d +; CHECK-BASELINE-NEXT: xorl %r15d, %r10d ; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d -; CHECK-BASELINE-NEXT: xorl %r13d, %r9d -; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d -; CHECK-BASELINE-NEXT: xorl %r14d, %r11d -; CHECK-BASELINE-NEXT: movl 12(%rsi), %r14d -; CHECK-BASELINE-NEXT: xorl %r12d, %r14d -; CHECK-BASELINE-NEXT: movl 16(%rsi), %r12d -; CHECK-BASELINE-NEXT: xorl %eax, %r12d -; CHECK-BASELINE-NEXT: movl 20(%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r10d, %edx +; CHECK-BASELINE-NEXT: xorl %r12d, %r9d +; CHECK-BASELINE-NEXT: movl 8(%rsi), %ebx +; CHECK-BASELINE-NEXT: xorl %r13d, %ebx +; CHECK-BASELINE-NEXT: movl 12(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorl %eax, %r13d +; CHECK-BASELINE-NEXT: movl 16(%rsi), %edx +; CHECK-BASELINE-NEXT: xorl %ecx, %edx +; CHECK-BASELINE-NEXT: movl 20(%rsi), %ecx +; CHECK-BASELINE-NEXT: xorl %r11d, %ecx ; CHECK-BASELINE-NEXT: movl 24(%rsi), %eax -; CHECK-BASELINE-NEXT: xorl %ebx, %eax +; CHECK-BASELINE-NEXT: xorl %ebp, %eax ; CHECK-BASELINE-NEXT: movl 28(%rsi), %esi -; CHECK-BASELINE-NEXT: xorl %ebp, %esi -; CHECK-BASELINE-NEXT: andl 28(%rcx), %esi -; CHECK-BASELINE-NEXT: andl 24(%rcx), %eax -; CHECK-BASELINE-NEXT: andl 20(%rcx), %edx -; CHECK-BASELINE-NEXT: andl 16(%rcx), %r12d -; CHECK-BASELINE-NEXT: andl 12(%rcx), %r14d -; CHECK-BASELINE-NEXT: andl 8(%rcx), %r11d -; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d -; CHECK-BASELINE-NEXT: andl (%rcx), %r8d -; CHECK-BASELINE-NEXT: xorl %r15d, %r8d -; CHECK-BASELINE-NEXT: xorl %r13d, %r9d -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl %r10d, %edx -; CHECK-BASELINE-NEXT: xorl %ebx, %eax -; CHECK-BASELINE-NEXT: xorl %ebp, %esi +; CHECK-BASELINE-NEXT: xorl %r14d, %esi +; CHECK-BASELINE-NEXT: andl 28(%r8), %esi +; CHECK-BASELINE-NEXT: andl 24(%r8), %eax +; CHECK-BASELINE-NEXT: andl 20(%r8), %ecx +; CHECK-BASELINE-NEXT: andl 16(%r8), %edx +; CHECK-BASELINE-NEXT: andl 12(%r8), %r13d +; 
CHECK-BASELINE-NEXT: andl 8(%r8), %ebx +; CHECK-BASELINE-NEXT: andl 4(%r8), %r9d +; CHECK-BASELINE-NEXT: andl (%r8), %r10d +; CHECK-BASELINE-NEXT: xorl %r15d, %r10d +; CHECK-BASELINE-NEXT: xorl %r12d, %r9d +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl %ebp, %eax +; CHECK-BASELINE-NEXT: xorl %r14d, %esi ; CHECK-BASELINE-NEXT: movl %esi, 28(%rdi) ; CHECK-BASELINE-NEXT: movl %eax, 24(%rdi) -; CHECK-BASELINE-NEXT: movl %edx, 20(%rdi) -; CHECK-BASELINE-NEXT: movl %r12d, 16(%rdi) -; CHECK-BASELINE-NEXT: movl %r14d, 12(%rdi) -; CHECK-BASELINE-NEXT: movl %r11d, 8(%rdi) +; CHECK-BASELINE-NEXT: movl %ecx, 20(%rdi) +; CHECK-BASELINE-NEXT: movl %edx, 16(%rdi) +; CHECK-BASELINE-NEXT: movl %r13d, 12(%rdi) +; CHECK-BASELINE-NEXT: movl %ebx, 8(%rdi) ; CHECK-BASELINE-NEXT: movl %r9d, 4(%rdi) -; CHECK-BASELINE-NEXT: movl %r8d, (%rdi) +; CHECK-BASELINE-NEXT: movl %r10d, (%rdi) ; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 @@ -4193,57 +4173,59 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movl 28(%rdx), %ebp -; CHECK-SSE1-NEXT: movl 24(%rdx), %ebx -; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d -; CHECK-SSE1-NEXT: movl 16(%rdx), %eax +; CHECK-SSE1-NEXT: movq %rcx, %r8 +; CHECK-SSE1-NEXT: movl 28(%rdx), %r14d +; CHECK-SSE1-NEXT: movl 24(%rdx), %ebp +; CHECK-SSE1-NEXT: movl 20(%rdx), %r11d +; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 16(%rdx), %ecx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 12(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 12(%rdx), %r12d -; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 8(%rdx), %r14d -; CHECK-SSE1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 8(%rdx), %r13d +; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl (%rdx), %r15d -; CHECK-SSE1-NEXT: movl 4(%rdx), %r13d -; CHECK-SSE1-NEXT: movl (%rsi), %r8d -; CHECK-SSE1-NEXT: xorl %r15d, %r8d +; CHECK-SSE1-NEXT: movl 4(%rdx), %r12d +; CHECK-SSE1-NEXT: movl (%rsi), %r10d +; CHECK-SSE1-NEXT: xorl %r15d, %r10d ; CHECK-SSE1-NEXT: movl 4(%rsi), %r9d -; CHECK-SSE1-NEXT: xorl %r13d, %r9d -; CHECK-SSE1-NEXT: movl 8(%rsi), %r11d -; CHECK-SSE1-NEXT: xorl %r14d, %r11d -; CHECK-SSE1-NEXT: movl 12(%rsi), %r14d -; CHECK-SSE1-NEXT: xorl %r12d, %r14d -; CHECK-SSE1-NEXT: movl 16(%rsi), %r12d -; CHECK-SSE1-NEXT: xorl %eax, %r12d -; CHECK-SSE1-NEXT: movl 20(%rsi), %edx -; CHECK-SSE1-NEXT: xorl %r10d, %edx +; CHECK-SSE1-NEXT: xorl %r12d, %r9d +; CHECK-SSE1-NEXT: movl 8(%rsi), %ebx +; CHECK-SSE1-NEXT: xorl %r13d, %ebx +; CHECK-SSE1-NEXT: movl 12(%rsi), %r13d +; CHECK-SSE1-NEXT: xorl %eax, %r13d +; CHECK-SSE1-NEXT: movl 16(%rsi), %edx +; CHECK-SSE1-NEXT: xorl %ecx, %edx +; CHECK-SSE1-NEXT: movl 20(%rsi), %ecx +; CHECK-SSE1-NEXT: xorl %r11d, %ecx ; CHECK-SSE1-NEXT: movl 24(%rsi), %eax -; CHECK-SSE1-NEXT: xorl %ebx, %eax +; CHECK-SSE1-NEXT: xorl %ebp, %eax ; CHECK-SSE1-NEXT: movl 
28(%rsi), %esi -; CHECK-SSE1-NEXT: xorl %ebp, %esi -; CHECK-SSE1-NEXT: andl 28(%rcx), %esi -; CHECK-SSE1-NEXT: andl 24(%rcx), %eax -; CHECK-SSE1-NEXT: andl 20(%rcx), %edx -; CHECK-SSE1-NEXT: andl 16(%rcx), %r12d -; CHECK-SSE1-NEXT: andl 12(%rcx), %r14d -; CHECK-SSE1-NEXT: andl 8(%rcx), %r11d -; CHECK-SSE1-NEXT: andl 4(%rcx), %r9d -; CHECK-SSE1-NEXT: andl (%rcx), %r8d -; CHECK-SSE1-NEXT: xorl %r15d, %r8d -; CHECK-SSE1-NEXT: xorl %r13d, %r9d -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl %r10d, %edx -; CHECK-SSE1-NEXT: xorl %ebx, %eax -; CHECK-SSE1-NEXT: xorl %ebp, %esi +; CHECK-SSE1-NEXT: xorl %r14d, %esi +; CHECK-SSE1-NEXT: andl 28(%r8), %esi +; CHECK-SSE1-NEXT: andl 24(%r8), %eax +; CHECK-SSE1-NEXT: andl 20(%r8), %ecx +; CHECK-SSE1-NEXT: andl 16(%r8), %edx +; CHECK-SSE1-NEXT: andl 12(%r8), %r13d +; CHECK-SSE1-NEXT: andl 8(%r8), %ebx +; CHECK-SSE1-NEXT: andl 4(%r8), %r9d +; CHECK-SSE1-NEXT: andl (%r8), %r10d +; CHECK-SSE1-NEXT: xorl %r15d, %r10d +; CHECK-SSE1-NEXT: xorl %r12d, %r9d +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl %ebp, %eax +; CHECK-SSE1-NEXT: xorl %r14d, %esi ; CHECK-SSE1-NEXT: movl %esi, 28(%rdi) ; CHECK-SSE1-NEXT: movl %eax, 24(%rdi) -; CHECK-SSE1-NEXT: movl %edx, 20(%rdi) -; CHECK-SSE1-NEXT: movl %r12d, 16(%rdi) -; CHECK-SSE1-NEXT: movl %r14d, 12(%rdi) -; CHECK-SSE1-NEXT: movl %r11d, 8(%rdi) +; CHECK-SSE1-NEXT: movl %ecx, 20(%rdi) +; CHECK-SSE1-NEXT: movl %edx, 16(%rdi) +; CHECK-SSE1-NEXT: movl %r13d, 12(%rdi) +; CHECK-SSE1-NEXT: movl %ebx, 8(%rdi) ; CHECK-SSE1-NEXT: movl %r9d, 4(%rdi) -; CHECK-SSE1-NEXT: movl %r8d, (%rdi) +; CHECK-SSE1-NEXT: movl %r10d, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 @@ -4290,11 +4272,11 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movq 24(%rdx), %rdi ; CHECK-BASELINE-NEXT: movq 16(%rdx), %r8 ; CHECK-BASELINE-NEXT: movq (%rdx), %r9 -; CHECK-BASELINE-NEXT: movq 8(%rdx), %r10 -; CHECK-BASELINE-NEXT: movq (%rsi), %rdx -; CHECK-BASELINE-NEXT: xorq %r9, %rdx +; CHECK-BASELINE-NEXT: movq 8(%rdx), %rdx +; CHECK-BASELINE-NEXT: movq (%rsi), %r10 +; CHECK-BASELINE-NEXT: xorq %r9, %r10 ; CHECK-BASELINE-NEXT: movq 8(%rsi), %r11 -; CHECK-BASELINE-NEXT: xorq %r10, %r11 +; CHECK-BASELINE-NEXT: xorq %rdx, %r11 ; CHECK-BASELINE-NEXT: movq 16(%rsi), %rbx ; CHECK-BASELINE-NEXT: xorq %r8, %rbx ; CHECK-BASELINE-NEXT: movq 24(%rsi), %rsi @@ -4302,15 +4284,15 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: andq 24(%rcx), %rsi ; CHECK-BASELINE-NEXT: andq 16(%rcx), %rbx ; CHECK-BASELINE-NEXT: andq 8(%rcx), %r11 -; CHECK-BASELINE-NEXT: andq (%rcx), %rdx -; CHECK-BASELINE-NEXT: xorq %r9, %rdx -; CHECK-BASELINE-NEXT: xorq %r10, %r11 +; CHECK-BASELINE-NEXT: andq (%rcx), %r10 +; CHECK-BASELINE-NEXT: xorq %r9, %r10 +; CHECK-BASELINE-NEXT: xorq %rdx, %r11 ; CHECK-BASELINE-NEXT: xorq %r8, %rbx ; CHECK-BASELINE-NEXT: xorq %rdi, %rsi ; CHECK-BASELINE-NEXT: movq %rsi, 24(%rax) ; CHECK-BASELINE-NEXT: movq %rbx, 16(%rax) ; 
CHECK-BASELINE-NEXT: movq %r11, 8(%rax) -; CHECK-BASELINE-NEXT: movq %rdx, (%rax) +; CHECK-BASELINE-NEXT: movq %r10, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: retq ; @@ -4321,11 +4303,11 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movq 24(%rdx), %rdi ; CHECK-SSE1-NEXT: movq 16(%rdx), %r8 ; CHECK-SSE1-NEXT: movq (%rdx), %r9 -; CHECK-SSE1-NEXT: movq 8(%rdx), %r10 -; CHECK-SSE1-NEXT: movq (%rsi), %rdx -; CHECK-SSE1-NEXT: xorq %r9, %rdx +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movq (%rsi), %r10 +; CHECK-SSE1-NEXT: xorq %r9, %r10 ; CHECK-SSE1-NEXT: movq 8(%rsi), %r11 -; CHECK-SSE1-NEXT: xorq %r10, %r11 +; CHECK-SSE1-NEXT: xorq %rdx, %r11 ; CHECK-SSE1-NEXT: movq 16(%rsi), %rbx ; CHECK-SSE1-NEXT: xorq %r8, %rbx ; CHECK-SSE1-NEXT: movq 24(%rsi), %rsi @@ -4333,15 +4315,15 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: andq 24(%rcx), %rsi ; CHECK-SSE1-NEXT: andq 16(%rcx), %rbx ; CHECK-SSE1-NEXT: andq 8(%rcx), %r11 -; CHECK-SSE1-NEXT: andq (%rcx), %rdx -; CHECK-SSE1-NEXT: xorq %r9, %rdx -; CHECK-SSE1-NEXT: xorq %r10, %r11 +; CHECK-SSE1-NEXT: andq (%rcx), %r10 +; CHECK-SSE1-NEXT: xorq %r9, %r10 +; CHECK-SSE1-NEXT: xorq %rdx, %r11 ; CHECK-SSE1-NEXT: xorq %r8, %rbx ; CHECK-SSE1-NEXT: xorq %rdi, %rsi ; CHECK-SSE1-NEXT: movq %rsi, 24(%rax) ; CHECK-SSE1-NEXT: movq %rbx, 16(%rax) ; CHECK-SSE1-NEXT: movq %r11, 8(%rax) -; CHECK-SSE1-NEXT: movq %rdx, (%rax) +; CHECK-SSE1-NEXT: movq %r10, (%rax) ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll index e0e1ef7108d0d1..cf067ce4e2a52f 100644 --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -196,30 +196,30 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: shll %cl, %esi +; X86-NEXT: shldl %cl, %edi, %edx ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %edi, %edx -; X86-NEXT: cmovnel %ebx, %edi +; X86-NEXT: cmovnel %esi, %edx +; X86-NEXT: cmovnel %ebx, %esi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: shrl %cl, %ebp ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovel %ebp, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: shrdl %cl, %edx, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel %ebp, %eax -; X86-NEXT: xorl %esi, %eax +; X86-NEXT: xorl %edi, %eax ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: orl %eax, %ebx ; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovnel %eax, %edi +; X86-NEXT: cmovnel %eax, %esi ; X86-NEXT: cmovnel %eax, %edx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index ebb5e135eacd02..4da833bdbc5f81 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -45,69 +45,69 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $16, %esp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %esi, %eax +; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %eax ; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testb $32, %cl +; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovnel %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: cmovnel %ebx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shrl %cl, %eax +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testb $32, %cl +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testb $32, %ch +; X86-NEXT: movl %edx, %eax ; X86-NEXT: cmovnel %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: testb $32, %ch +; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: cmovnel %ebx, %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: shrl %cl, %edi -; X86-NEXT: testb $32, %ch +; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovel %edi, %ebx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movb %ch, %cl +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-NEXT: shrdl %cl, %ebp, %eax -; X86-NEXT: testb $32, %cl +; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movb %ch, %cl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrdl %cl, %edx, %ebp -; X86-NEXT: testb $32, %ch +; X86-NEXT: testb $32, {{[0-9]+}}(%esp) ; X86-NEXT: cmovnel %edi, %ebp ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $-1, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: cmovnel %eax, %ecx ; X86-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NEXT: cmovnel %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: cmovnel %eax, %edi ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: orl %ebp, %ebx -; X86-NEXT: cmovnel %ecx, %esi -; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmovnel %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -178,35 +178,32 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shrl %cl, %ebp ; X86-NEXT: cmpl %ebp, %ebx -; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovnel %edx, %esi -; X86-NEXT: movl $-1, %ebx +; X86-NEXT: movl $-1, %ebp +; 
X86-NEXT: cmovnel %ebp, %esi ; X86-NEXT: movl %edi, %edx ; X86-NEXT: movb %ah, %cl ; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: cmpl %ebp, %edi -; X86-NEXT: cmovnel %ebx, %edx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: cmpl %ebx, %edi +; X86-NEXT: cmovnel %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: cmpl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmovnel %eax, %edi -; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: cmovnel %ebp, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpl %eax, %ebx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovnel %eax, %ebp +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnel %ebp, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %esi, (%eax) @@ -301,83 +298,82 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $12, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movzwl %bp, %ebx +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: cmpw %bx, %di +; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X86-NEXT: cmovnel %ecx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %si +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movzwl %di, %ebx +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: cmpw %bx, %ax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $65535, %esi # imm = 0xFFFF -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %bp +; X86-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %edi +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movzwl %di, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpw %ax, %si ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: 
movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movzwl %bp, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %si -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bp, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpw %ax, %si +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %eax, %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $65535, %esi # imm = 0xFFFF -; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movzwl %bx, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %ebx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %esi, %edi ; X86-NEXT: shll %cl, %edi ; X86-NEXT: movzwl %di, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: cmovnel %esi, %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl %si, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmpw %ax, %si +; X86-NEXT: cmovnel %edx, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: shll %cl, %eax ; X86-NEXT: movzwl %ax, %edx ; X86-NEXT: shrl %cl, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: cmpw %dx, %si ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF ; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzwl %dx, %esi +; X86-NEXT: shrl %cl, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movw %ax, 14(%ecx) -; X86-NEXT: movw %si, 12(%ecx) +; X86-NEXT: cmpw %si, %cx +; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %dx, 14(%ecx) +; X86-NEXT: movw %ax, 12(%ecx) ; X86-NEXT: movw %di, 10(%ecx) ; X86-NEXT: movw %bx, 8(%ecx) ; X86-NEXT: movw %bp, 6(%ecx) @@ -616,11 +612,11 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: cmovnel %esi, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: shlb %cl, %dl -; X86-NEXT: movzbl %dl, %ebx -; X86-NEXT: shrb %cl, %dl -; X86-NEXT: cmpb %dl, %al +; X86-NEXT: movb %al, %ah +; X86-NEXT: shlb %cl, %ah +; X86-NEXT: movzbl %ah, %ebx +; X86-NEXT: shrb %cl, %ah +; X86-NEXT: cmpb %ah, %al ; X86-NEXT: cmovnel %esi, %ebx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/usub_sat.ll b/llvm/test/CodeGen/X86/usub_sat.ll index 6749a1f9147aff..9856587c0c6232 100644 --- a/llvm/test/CodeGen/X86/usub_sat.ll +++ b/llvm/test/CodeGen/X86/usub_sat.ll @@ -124,23 +124,23 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovbl %ebx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmovbl %ebx, %edx -; X86-NEXT: subl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl %ebx, %esi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmovbl %ebx, %edi -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovbl %ebx, %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovbl %ebx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovbl %ebx, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 99a3821bb9ba91..419beac060fe95 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -247,7 +247,7 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm6 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -259,7 +259,7 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm6 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -271,11 +271,11 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -295,18 +295,18 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE3-NEXT: 
punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8: @@ -511,7 +511,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm6 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -523,7 +523,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm6 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -535,11 +535,11 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl 
-24(%rsp,%rax), %eax @@ -559,18 +559,18 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: @@ -731,10 +731,10 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: movd %eax, %xmm10 ; SSE3-NEXT: andl $31, %r8d ; SSE3-NEXT: movzbl 224(%rsp,%r8), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: andl $31, %edi ; SSE3-NEXT: movzbl 256(%rsp,%rdi), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm12 
; SSE3-NEXT: andl $31, %esi ; SSE3-NEXT: movzbl 288(%rsp,%rsi), %eax ; SSE3-NEXT: movd %eax, %xmm13 @@ -756,12 +756,12 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE3-NEXT: popq %rbx @@ -863,10 +863,10 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: andl $31, %r8d ; SSSE3-NEXT: movzbl 224(%rsp,%r8), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm11 ; SSSE3-NEXT: andl $31, %edi ; SSSE3-NEXT: movzbl 256(%rsp,%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movd %eax, %xmm12 ; SSSE3-NEXT: andl $31, %esi ; SSSE3-NEXT: movzbl 288(%rsp,%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm13 @@ -888,12 +888,12 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = 
xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSSE3-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll index 032ffb0d0bf7d9..5f6462dacef8e1 100644 --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -1067,33 +1067,33 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: subq $128, %rsp ; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi ; AVX512F-NEXT: vpbroadcastd %esi, %zmm2 -; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-NEXT: vmovd %xmm4, %eax ; AVX512F-NEXT: vmovaps %zmm0, (%rsp) ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm1, %eax +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm1, %eax +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm1, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm1 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax +; AVX512F-NEXT: vpextrd $1, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax +; AVX512F-NEXT: vpextrd $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512F-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm5, %eax @@ -1102,82 +1102,82 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512F-NEXT: andl $63, %esi ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: 
vextracti32x4 $3, %zmm1, %xmm1 +; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm4 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: vmovd %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm1, %eax +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm1, %eax +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vmovd %eax, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrd $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrd $3, %xmm3, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrd $2, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm5 +; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrd $2, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm1 +; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 -; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: 
vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm5 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax +; AVX512F-NEXT: vpextrd $1, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax +; AVX512F-NEXT: vpextrd $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm6 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vmovd %xmm6, %eax @@ -1190,7 +1190,7 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7 ; AVX512F-NEXT: vpextrd $3, %xmm6, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5 +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6 ; AVX512F-NEXT: vmovd %xmm5, %eax @@ -1261,49 +1261,49 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3 +; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 -; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax +; AVX512F-NEXT: vpextrd $1, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax +; AVX512F-NEXT: vpextrd $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax +; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 -; AVX512F-NEXT: vpextrd $1, %xmm1, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm1 +; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm1, %eax +; AVX512F-NEXT: vpextrd $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-NEXT: vpextrd $3, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512F-NEXT: vcvtdq2ps %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 ; 
AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0 ; AVX512F-NEXT: vmovaps %zmm0, 192(%rdi) -; AVX512F-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512F-NEXT: vmovaps %zmm3, 64(%rdi) +; AVX512F-NEXT: vmovaps %zmm3, 128(%rdi) +; AVX512F-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512F-NEXT: vmovaps %zmm2, (%rdi) ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp @@ -1318,33 +1318,33 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: subq $128, %rsp ; AVX512BW-NEXT: # kill: def $esi killed $esi def $rsi ; AVX512BW-NEXT: vpbroadcastd %esi, %zmm2 -; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512BW-NEXT: vmovd %xmm1, %eax +; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512BW-NEXT: vmovd %xmm4, %eax ; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm0 -; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm1 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm3, %eax +; AVX512BW-NEXT: vmovd %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax +; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm5, %eax @@ -1353,82 +1353,82 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: andl $63, %esi ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm4 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm1, %eax +; AVX512BW-NEXT: vmovd %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 -; 
AVX512BW-NEXT: vpextrd $2, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vmovd %eax, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax +; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vmovd %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm5 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm5 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vmovd %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm3 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 -; AVX512BW-NEXT: vmovd %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 +; AVX512BW-NEXT: vmovd %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vmovd %xmm3, %eax +; AVX512BW-NEXT: vmovd %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm5 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $2, 
%xmm3, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm6 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vmovd %xmm6, %eax @@ -1441,7 +1441,7 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7 ; AVX512BW-NEXT: vpextrd $3, %xmm6, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm5 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6 ; AVX512BW-NEXT: vmovd %xmm5, %eax @@ -1512,49 +1512,49 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm3 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 -; AVX512BW-NEXT: vmovd %xmm3, %eax +; AVX512BW-NEXT: vmovd %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 -; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm1 +; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 ; AVX512BW-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512BW-NEXT: vcvtdq2ps %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512BW-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512BW-NEXT: vcvtdq2ps %zmm0, %zmm0 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdi) -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512BW-NEXT: vmovaps %zmm3, 64(%rdi) +; AVX512BW-NEXT: vmovaps %zmm3, 128(%rdi) +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512BW-NEXT: vmovaps %zmm2, (%rdi) ; 
AVX512BW-NEXT: movq %rbp, %rsp ; AVX512BW-NEXT: popq %rbp @@ -1568,21 +1568,21 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512VBMI-NEXT: andq $-64, %rsp ; AVX512VBMI-NEXT: subq $128, %rsp ; AVX512VBMI-NEXT: # kill: def $esi killed $esi def $rsi -; AVX512VBMI-NEXT: vpbroadcastd %esi, %zmm1 -; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512VBMI-NEXT: vmovd %xmm2, %eax +; AVX512VBMI-NEXT: vpbroadcastd %esi, %zmm2 +; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512VBMI-NEXT: vmovd %xmm1, %eax ; AVX512VBMI-NEXT: vmovdqa64 %zmm0, (%rsp) ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VBMI-NEXT: vmovd %eax, %xmm3 -; AVX512VBMI-NEXT: vpextrd $1, %xmm2, %eax +; AVX512VBMI-NEXT: vpextrd $1, %xmm1, %eax ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $1, (%rsp,%rax), %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpextrd $2, %xmm2, %eax +; AVX512VBMI-NEXT: vpextrd $2, %xmm1, %eax ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $2, (%rsp,%rax), %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpextrd $3, %xmm2, %eax -; AVX512VBMI-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512VBMI-NEXT: vpextrd $3, %xmm1, %eax +; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $3, (%rsp,%rax), %xmm3, %xmm3 ; AVX512VBMI-NEXT: vmovd %xmm4, %eax @@ -1595,7 +1595,7 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $6, (%rsp,%rax), %xmm3, %xmm3 ; AVX512VBMI-NEXT: vpextrd $3, %xmm4, %eax -; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm2, %xmm4 +; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $7, (%rsp,%rax), %xmm3, %xmm3 ; AVX512VBMI-NEXT: vmovd %xmm4, %eax @@ -1607,32 +1607,32 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512VBMI-NEXT: andl $63, %esi ; AVX512VBMI-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm3, %xmm3 ; AVX512VBMI-NEXT: vpextrd $3, %xmm4, %eax -; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm2, %xmm2 +; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $11, (%rsp,%rax), %xmm3, %xmm3 -; AVX512VBMI-NEXT: vmovd %xmm2, %eax +; AVX512VBMI-NEXT: vmovd %xmm1, %eax ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $12, (%rsp,%rax), %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpextrd $1, %xmm2, %eax +; AVX512VBMI-NEXT: vpextrd $1, %xmm1, %eax ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $13, (%rsp,%rax), %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpextrd $2, %xmm2, %eax +; AVX512VBMI-NEXT: vpextrd $2, %xmm1, %eax ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $14, (%rsp,%rax), %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpextrd $3, %xmm2, %eax +; AVX512VBMI-NEXT: vpextrd $3, %xmm1, %eax ; AVX512VBMI-NEXT: andl $63, %eax -; AVX512VBMI-NEXT: vpinsrb $15, (%rsp,%rax), %xmm3, %xmm2 -; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3 +; AVX512VBMI-NEXT: vpinsrb $15, (%rsp,%rax), %xmm3, %xmm1 +; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512VBMI-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512VBMI-NEXT: vpmovdb %zmm4, %xmm4 -; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; 
AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VBMI-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512VBMI-NEXT: vcvtdq2ps %zmm2, %zmm2 -; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VBMI-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512VBMI-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512VBMI-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm2 +; AVX512VBMI-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VBMI-NEXT: vcvtdq2ps %zmm2, %zmm2 ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm4, %zmm4 ; AVX512VBMI-NEXT: vpmovsxbd %xmm4, %zmm4 ; AVX512VBMI-NEXT: vcvtdq2ps %zmm4, %zmm4 @@ -1641,8 +1641,8 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512VBMI-NEXT: vcvtdq2ps %zmm0, %zmm0 ; AVX512VBMI-NEXT: vmovaps %zmm0, 128(%rdi) ; AVX512VBMI-NEXT: vmovaps %zmm4, 64(%rdi) -; AVX512VBMI-NEXT: vmovaps %zmm1, (%rdi) -; AVX512VBMI-NEXT: vmovaps %zmm2, 192(%rdi) +; AVX512VBMI-NEXT: vmovaps %zmm2, (%rdi) +; AVX512VBMI-NEXT: vmovaps %zmm1, 192(%rdi) ; AVX512VBMI-NEXT: movq %rbp, %rsp ; AVX512VBMI-NEXT: popq %rbp ; AVX512VBMI-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll index 008435b47b53a2..6f86be6e5086fe 100644 --- a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll @@ -118,30 +118,30 @@ define <4 x i32> @test_v4f32_ogt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: movaps %xmm3, %xmm4 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movaps %xmm3, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] ; SSE-32-NEXT: movaps %xmm2, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 +; SSE-32-NEXT: ucomiss %xmm5, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -272,30 +272,30 @@ define <4 x i32> @test_v4f32_oge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: movaps %xmm3, %xmm4 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: 
movaps %xmm3, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] ; SSE-32-NEXT: movaps %xmm2, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 +; SSE-32-NEXT: ucomiss %xmm5, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmovael %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -426,30 +426,30 @@ define <4 x i32> @test_v4f32_olt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: movaps %xmm2, %xmm4 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movaps %xmm2, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] ; SSE-32-NEXT: movaps %xmm3, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 +; SSE-32-NEXT: ucomiss %xmm5, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -578,30 +578,30 @@ define <4 x i32> @test_v4f32_ole_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: movaps %xmm2, %xmm4 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = 
xmm4[1],xmm2[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movaps %xmm2, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] ; SSE-32-NEXT: movaps %xmm3, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 +; SSE-32-NEXT: ucomiss %xmm5, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmovael %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1023,30 +1023,30 @@ define <4 x i32> @test_v4f32_ugt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: movaps %xmm2, %xmm4 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movaps %xmm2, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] ; SSE-32-NEXT: movaps %xmm3, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 +; SSE-32-NEXT: ucomiss %xmm5, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmovbl %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1175,30 +1175,30 @@ define <4 x i32> @test_v4f32_uge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: 
movaps %xmm2, %xmm4 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movaps %xmm2, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] ; SSE-32-NEXT: movaps %xmm3, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 +; SSE-32-NEXT: ucomiss %xmm5, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1327,30 +1327,30 @@ define <4 x i32> @test_v4f32_ult_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: movaps %xmm3, %xmm4 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movaps %xmm3, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] ; SSE-32-NEXT: movaps %xmm2, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 +; SSE-32-NEXT: ucomiss %xmm5, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmovbl %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1481,30 +1481,30 @@ define <4 x i32> @test_v4f32_ule_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel 
%ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: movaps %xmm3, %xmm4 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movaps %xmm3, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] ; SSE-32-NEXT: movaps %xmm2, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 +; SSE-32-NEXT: ucomiss %xmm5, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1904,24 +1904,24 @@ define <2 x i64> @test_v2f64_ogt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2024,24 +2024,24 @@ define <2 x i64> @test_v2f64_oge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: cmovael %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2144,24 +2144,24 @@ define <2 x i64> @test_v2f64_olt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2262,24 +2262,24 @@ define <2 x i64> @test_v2f64_ole_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: cmovael %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl 
%ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2673,24 +2673,24 @@ define <2 x i64> @test_v2f64_ugt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: cmovbl %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2791,24 +2791,24 @@ define <2 x i64> @test_v2f64_uge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2909,24 +2909,24 @@ define <2 x i64> @test_v2f64_ult_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} 
xmm3 = xmm3[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: cmovbl %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -3029,24 +3029,24 @@ define <2 x i64> @test_v2f64_ule_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll index 1e56ddc0c8ec8a..a6c0aa03a74d2f 100644 --- a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll @@ -15,22 +15,22 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movaps 8(%ebp), %xmm4 +; SSE-32-NEXT: movaps 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: comiss %xmm4, %xmm2 +; SSE-32-NEXT: comiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-32-NEXT: comiss %xmm4, %xmm2 +; SSE-32-NEXT: comiss %xmm3, %xmm2 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ 
-190,24 +190,24 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movaps 8(%ebp), %xmm4 +; SSE-32-NEXT: movaps 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm4, %xmm2 +; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $-1, %edx ; SSE-32-NEXT: cmovnel %eax, %edx ; SSE-32-NEXT: cmovpl %eax, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm2 +; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmovnel %eax, %ecx ; SSE-32-NEXT: cmovpl %eax, %ecx ; SSE-32-NEXT: movd %ecx, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll index 349d94d930651b..d0c9dd66a7f052 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -280,13 +280,13 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-64-NEXT: comisd %xmm3, %xmm0 -; SSE-64-NEXT: xorpd %xmm2, %xmm2 +; SSE-64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-64-NEXT: comisd %xmm2, %xmm0 +; SSE-64-NEXT: xorpd %xmm3, %xmm3 ; SSE-64-NEXT: xorpd %xmm1, %xmm1 ; SSE-64-NEXT: jb .LBB1_2 ; SSE-64-NEXT: # %bb.1: -; SSE-64-NEXT: movapd %xmm3, %xmm1 +; SSE-64-NEXT: movapd %xmm2, %xmm1 ; SSE-64-NEXT: .LBB1_2: ; SSE-64-NEXT: movapd %xmm0, %xmm4 ; SSE-64-NEXT: subsd %xmm1, %xmm4 @@ -297,12 +297,12 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm1 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-64-NEXT: comisd %xmm3, %xmm0 +; SSE-64-NEXT: comisd %xmm2, %xmm0 ; SSE-64-NEXT: jb .LBB1_4 ; SSE-64-NEXT: # %bb.3: -; SSE-64-NEXT: movapd %xmm3, %xmm2 +; SSE-64-NEXT: movapd %xmm2, %xmm3 ; SSE-64-NEXT: .LBB1_4: -; SSE-64-NEXT: subsd %xmm2, %xmm0 +; SSE-64-NEXT: subsd %xmm3, %xmm0 ; SSE-64-NEXT: cvttsd2si %xmm0, %rax ; SSE-64-NEXT: setae %cl ; SSE-64-NEXT: movzbl %cl, %ecx @@ -951,13 +951,13 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 { ; ; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE-64-NEXT: comiss %xmm3, %xmm0 -; SSE-64-NEXT: xorps %xmm2, %xmm2 +; SSE-64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-64-NEXT: comiss %xmm2, %xmm0 +; SSE-64-NEXT: xorps %xmm3, %xmm3 ; SSE-64-NEXT: xorps %xmm1, %xmm1 ; SSE-64-NEXT: jb .LBB4_2 ; SSE-64-NEXT: # %bb.1: -; SSE-64-NEXT: movaps %xmm3, %xmm1 +; SSE-64-NEXT: movaps %xmm2, %xmm1 ; SSE-64-NEXT: .LBB4_2: ; SSE-64-NEXT: movaps %xmm0, %xmm4 ; SSE-64-NEXT: subss %xmm1, %xmm4 @@ -968,12 +968,12 @@ define 
<2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 { ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm1 ; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-64-NEXT: comiss %xmm3, %xmm0 +; SSE-64-NEXT: comiss %xmm2, %xmm0 ; SSE-64-NEXT: jb .LBB4_4 ; SSE-64-NEXT: # %bb.3: -; SSE-64-NEXT: movaps %xmm3, %xmm2 +; SSE-64-NEXT: movaps %xmm2, %xmm3 ; SSE-64-NEXT: .LBB4_4: -; SSE-64-NEXT: subss %xmm2, %xmm0 +; SSE-64-NEXT: subss %xmm3, %xmm0 ; SSE-64-NEXT: cvttss2si %xmm0, %rax ; SSE-64-NEXT: setae %cl ; SSE-64-NEXT: movzbl %cl, %ecx @@ -1260,13 +1260,13 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp { ; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128: ; SSE-64: # %bb.0: ; SSE-64-NEXT: movaps (%rdi), %xmm1 -; SSE-64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE-64-NEXT: comiss %xmm3, %xmm1 -; SSE-64-NEXT: xorps %xmm2, %xmm2 +; SSE-64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-64-NEXT: comiss %xmm2, %xmm1 +; SSE-64-NEXT: xorps %xmm3, %xmm3 ; SSE-64-NEXT: xorps %xmm0, %xmm0 ; SSE-64-NEXT: jb .LBB5_2 ; SSE-64-NEXT: # %bb.1: -; SSE-64-NEXT: movaps %xmm3, %xmm0 +; SSE-64-NEXT: movaps %xmm2, %xmm0 ; SSE-64-NEXT: .LBB5_2: ; SSE-64-NEXT: movaps %xmm1, %xmm4 ; SSE-64-NEXT: subss %xmm0, %xmm4 @@ -1277,12 +1277,12 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp { ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm0 ; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-64-NEXT: comiss %xmm3, %xmm1 +; SSE-64-NEXT: comiss %xmm2, %xmm1 ; SSE-64-NEXT: jb .LBB5_4 ; SSE-64-NEXT: # %bb.3: -; SSE-64-NEXT: movaps %xmm3, %xmm2 +; SSE-64-NEXT: movaps %xmm2, %xmm3 ; SSE-64-NEXT: .LBB5_4: -; SSE-64-NEXT: subss %xmm2, %xmm1 +; SSE-64-NEXT: subss %xmm3, %xmm1 ; SSE-64-NEXT: cvttss2si %xmm1, %rax ; SSE-64-NEXT: setae %cl ; SSE-64-NEXT: movzbl %cl, %ecx @@ -2442,13 +2442,13 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 { ; ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i1: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-64-NEXT: comisd %xmm3, %xmm0 -; SSE-64-NEXT: xorpd %xmm2, %xmm2 +; SSE-64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-64-NEXT: comisd %xmm2, %xmm0 +; SSE-64-NEXT: xorpd %xmm3, %xmm3 ; SSE-64-NEXT: xorpd %xmm1, %xmm1 ; SSE-64-NEXT: jb .LBB19_2 ; SSE-64-NEXT: # %bb.1: -; SSE-64-NEXT: movapd %xmm3, %xmm1 +; SSE-64-NEXT: movapd %xmm2, %xmm1 ; SSE-64-NEXT: .LBB19_2: ; SSE-64-NEXT: movapd %xmm0, %xmm4 ; SSE-64-NEXT: subsd %xmm1, %xmm4 @@ -2459,12 +2459,12 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 { ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm1 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-64-NEXT: comisd %xmm3, %xmm0 +; SSE-64-NEXT: comisd %xmm2, %xmm0 ; SSE-64-NEXT: jb .LBB19_4 ; SSE-64-NEXT: # %bb.3: -; SSE-64-NEXT: movapd %xmm3, %xmm2 +; SSE-64-NEXT: movapd %xmm2, %xmm3 ; SSE-64-NEXT: .LBB19_4: -; SSE-64-NEXT: subsd %xmm2, %xmm0 +; SSE-64-NEXT: subsd %xmm3, %xmm0 ; SSE-64-NEXT: cvttsd2si %xmm0, %rax ; SSE-64-NEXT: setae %cl ; SSE-64-NEXT: movzbl %cl, %ecx @@ -2816,13 +2816,13 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 { ; ; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i1: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE-64-NEXT: comiss %xmm3, %xmm0 -; SSE-64-NEXT: xorps %xmm2, %xmm2 +; SSE-64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-64-NEXT: comiss %xmm2, %xmm0 
+; SSE-64-NEXT: xorps %xmm3, %xmm3 ; SSE-64-NEXT: xorps %xmm1, %xmm1 ; SSE-64-NEXT: jb .LBB21_2 ; SSE-64-NEXT: # %bb.1: -; SSE-64-NEXT: movaps %xmm3, %xmm1 +; SSE-64-NEXT: movaps %xmm2, %xmm1 ; SSE-64-NEXT: .LBB21_2: ; SSE-64-NEXT: movaps %xmm0, %xmm4 ; SSE-64-NEXT: subss %xmm1, %xmm4 @@ -2833,12 +2833,12 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 { ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm1 ; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-64-NEXT: comiss %xmm3, %xmm0 +; SSE-64-NEXT: comiss %xmm2, %xmm0 ; SSE-64-NEXT: jb .LBB21_4 ; SSE-64-NEXT: # %bb.3: -; SSE-64-NEXT: movaps %xmm3, %xmm2 +; SSE-64-NEXT: movaps %xmm2, %xmm3 ; SSE-64-NEXT: .LBB21_4: -; SSE-64-NEXT: subss %xmm2, %xmm0 +; SSE-64-NEXT: subss %xmm3, %xmm0 ; SSE-64-NEXT: cvttss2si %xmm0, %rax ; SSE-64-NEXT: setae %cl ; SSE-64-NEXT: movzbl %cl, %ecx diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll index 17c5ff7955106a..a2b6ce00ead17a 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -160,10 +160,11 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovsd %xmm3, (%esp) -; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: movl %eax, %edi ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) @@ -177,10 +178,10 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: xorl %edx, %edx +; AVX512VL-32-NEXT: xorl %ebx, %ebx ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) @@ -202,11 +203,10 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: xorl %edx, %edx ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: setae %dl +; AVX512VL-32-NEXT: kmovw %edx, %k1 ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) @@ -234,23 +234,23 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: shll $31, %ebx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: shll $31, %edi +; AVX512VL-32-NEXT: xorl 
{{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %esi ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 -; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; AVX512VL-32-NEXT: shll $31, %esi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -264,10 +264,10 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX512VL-32-NEXT: shll $31, %edi -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 -; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -448,15 +448,16 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: movl %eax, %edi ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, (%esp) ; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] -; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX512VL-32-NEXT: setae %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: movl %eax, %esi ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) @@ -469,10 +470,10 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: xorl %edx, %edx +; AVX512VL-32-NEXT: xorl %ebx, %ebx ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; 
AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) @@ -494,11 +495,10 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: xorl %edx, %edx ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: setae %dl +; AVX512VL-32-NEXT: kmovw %edx, %k1 ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) @@ -526,23 +526,23 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: shll $31, %ebx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx -; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %esi ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: shll $31, %edi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 -; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; AVX512VL-32-NEXT: shll $31, %esi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -556,10 +556,10 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX512VL-32-NEXT: shll $31, %edi -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 -; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index 
a49f7e99097602..0a9ffc1c41c2ba 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -510,71 +510,72 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_4i64: ; SSE: # %bb.0: +; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: subsd %xmm3, %xmm0 -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: cvttsd2si %xmm2, %rcx +; SSE-NEXT: subsd %xmm4, %xmm2 +; SSE-NEXT: cvttsd2si %xmm2, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx ; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: cvttsd2si %xmm2, %rax -; SSE-NEXT: subsd %xmm3, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: subsd %xmm4, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: subsd %xmm3, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rax +; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: subsd %xmm4, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rax ; SSE-NEXT: cvttsd2si %xmm1, %rcx ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx ; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: subsd %xmm3, %xmm1 +; SSE-NEXT: subsd %xmm4, %xmm1 ; SSE-NEXT: cvttsd2si %xmm1, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_4f64_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vsubsd %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vcvttsd2si %xmm3, %rax -; AVX1-NEXT: vcvttsd2si %xmm2, %rcx +; AVX1-NEXT: vcvttsd2si %xmm1, %rcx ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vsubsd %xmm2, %xmm1, %xmm4 ; AVX1-NEXT: vcvttsd2si %xmm4, %rax -; AVX1-NEXT: vcvttsd2si %xmm2, %rcx +; AVX1-NEXT: vcvttsd2si %xmm1, %rcx ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} 
xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vsubsd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vcvttsd2si %xmm3, %rax ; AVX1-NEXT: vcvttsd2si %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rdx @@ -583,8 +584,8 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 ; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcvttsd2si %xmm1, %rax +; AVX1-NEXT: vsubsd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vcvttsd2si %xmm2, %rax ; AVX1-NEXT: vcvttsd2si %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx @@ -592,32 +593,32 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_4f64_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vsubsd %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vcvttsd2si %xmm3, %rax -; AVX2-NEXT: vcvttsd2si %xmm2, %rcx +; AVX2-NEXT: vcvttsd2si %xmm1, %rcx ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4 +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-NEXT: vsubsd %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vcvttsd2si %xmm4, %rax -; AVX2-NEXT: vcvttsd2si %xmm2, %rcx +; AVX2-NEXT: vcvttsd2si %xmm1, %rcx ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vmovq %rdx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-NEXT: vsubsd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vcvttsd2si %xmm3, %rax ; AVX2-NEXT: vcvttsd2si %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rdx @@ -626,8 +627,8 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vcvttsd2si %xmm1, %rax +; AVX2-NEXT: vsubsd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vcvttsd2si %xmm2, %rax ; AVX2-NEXT: vcvttsd2si %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx @@ -635,7 +636,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_4f64_to_4i64: @@ -1566,18 +1567,18 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; ; AVX1-LABEL: fptoui_4f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax -; AVX1-NEXT: vcvttss2si %xmm2, %rcx +; 
AVX1-NEXT: vcvttss2si %xmm1, %rcx ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %rdx, %xmm1 ; AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vsubss %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vcvttss2si %xmm4, %rax ; AVX1-NEXT: vcvttss2si %xmm3, %rcx ; AVX1-NEXT: movq %rcx, %rdx @@ -1585,8 +1586,8 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vsubss %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax ; AVX1-NEXT: vcvttss2si %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rdx @@ -1595,8 +1596,8 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcvttss2si %xmm1, %rax +; AVX1-NEXT: vsubss %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vcvttss2si %xmm2, %rax ; AVX1-NEXT: vcvttss2si %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx @@ -1604,23 +1605,23 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_4f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vsubss %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: vcvttss2si %xmm2, %rcx +; AVX2-NEXT: vcvttss2si %xmm1, %rcx ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %rdx, %xmm1 ; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX2-NEXT: vsubss %xmm2, %xmm3, %xmm4 ; AVX2-NEXT: vcvttss2si %xmm4, %rax ; AVX2-NEXT: vcvttss2si %xmm3, %rcx ; AVX2-NEXT: movq %rcx, %rdx @@ -1628,8 +1629,8 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-NEXT: vsubss %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax ; AVX2-NEXT: vcvttss2si %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rdx @@ -1638,8 +1639,8 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vcvttss2si %xmm1, %rax +; AVX2-NEXT: vsubss %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vcvttss2si %xmm2, %rax ; AVX2-NEXT: vcvttss2si %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx @@ -1647,7 +1648,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: orq %rcx, %rdx ; 
AVX2-NEXT: vmovq %rdx, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_4f32_to_4i64: @@ -1750,18 +1751,18 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; ; AVX1-LABEL: fptoui_8f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax -; AVX1-NEXT: vcvttss2si %xmm2, %rcx +; AVX1-NEXT: vcvttss2si %xmm1, %rcx ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %rdx, %xmm1 ; AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vsubss %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vcvttss2si %xmm4, %rax ; AVX1-NEXT: vcvttss2si %xmm3, %rcx ; AVX1-NEXT: movq %rcx, %rdx @@ -1769,8 +1770,8 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vsubss %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax ; AVX1-NEXT: vcvttss2si %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rdx @@ -1779,8 +1780,8 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcvttss2si %xmm1, %rax +; AVX1-NEXT: vsubss %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vcvttss2si %xmm2, %rax ; AVX1-NEXT: vcvttss2si %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx @@ -1788,23 +1789,23 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_8f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vsubss %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: vcvttss2si %xmm2, %rcx +; AVX2-NEXT: vcvttss2si %xmm1, %rcx ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %rdx, %xmm1 ; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX2-NEXT: vsubss %xmm2, %xmm3, %xmm4 ; AVX2-NEXT: vcvttss2si %xmm4, %rax ; AVX2-NEXT: vcvttss2si %xmm3, %rcx ; AVX2-NEXT: movq %rcx, %rdx @@ -1812,8 +1813,8 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vsubss %xmm1, %xmm0, 
%xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-NEXT: vsubss %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax ; AVX2-NEXT: vcvttss2si %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rdx @@ -1822,8 +1823,8 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vcvttss2si %xmm1, %rax +; AVX2-NEXT: vsubss %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vcvttss2si %xmm2, %rax ; AVX2-NEXT: vcvttss2si %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx @@ -1831,7 +1832,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_8f32_to_4i64: diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 18cc77c239b781..d01694025d88a4 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2529,15 +2529,14 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; ; SSE41-LABEL: uitofp_4i64_to_4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrlq $1, %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 ; SSE41-NEXT: pextrq $1, %xmm5, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 @@ -2546,25 +2545,25 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE41-NEXT: xorps %xmm3, %xmm3 ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 ; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3] -; SSE41-NEXT: pand %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: pand %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: psrlq $1, %xmm5 ; SSE41-NEXT: por %xmm4, %xmm5 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 ; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3] -; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 ; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[0] -; SSE41-NEXT: movaps %xmm3, %xmm2 -; SSE41-NEXT: addps %xmm3, %xmm2 -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: addps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index 7631367ba5d667..408367012df3b9 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ 
b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -275,12 +275,12 @@ define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-LABEL: saddo_v6i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: movd %esi, %xmm1 -; SSE41-NEXT: pinsrd $1, %edx, %xmm1 -; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm1 -; SSE41-NEXT: movd %r9d, %xmm0 -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: movd %r9d, %xmm1 +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero @@ -288,20 +288,20 @@ define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: paddd %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pxor %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm0, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm5, %xmm1 ; SSE41-NEXT: movq %xmm2, 16(%rcx) ; SSE41-NEXT: movdqa %xmm4, (%rcx) -; SSE41-NEXT: movq %xmm0, 16(%rdi) +; SSE41-NEXT: movq %xmm1, 16(%rdi) ; SSE41-NEXT: movdqa %xmm6, (%rdi) ; SSE41-NEXT: retq ; @@ -807,28 +807,27 @@ define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-LABEL: saddo_v4i24: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pslld $8, %xmm1 ; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: pslld $8, %xmm0 +; SSE2-NEXT: psrad $8, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pslld $8, %xmm2 ; SSE2-NEXT: psrad $8, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pslld $8, %xmm1 -; SSE2-NEXT: psrad $8, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: movw %cx, 9(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movd %xmm2, %edx ; SSE2-NEXT: movw %dx, 6(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movd %xmm0, %esi ; SSE2-NEXT: movw %si, 3(%rdi) ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: movb %al, 2(%rdi) @@ -838,32 +837,32 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movb %dl, 
8(%rdi) ; SSE2-NEXT: shrl $16, %esi ; SSE2-NEXT: movb %sil, 5(%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: saddo_v4i24: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pslld $8, %xmm1 ; SSSE3-NEXT: psrad $8, %xmm1 +; SSSE3-NEXT: pslld $8, %xmm0 +; SSSE3-NEXT: psrad $8, %xmm0 +; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pslld $8, %xmm2 ; SSSE3-NEXT: psrad $8, %xmm2 -; SSSE3-NEXT: paddd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pslld $8, %xmm1 -; SSSE3-NEXT: psrad $8, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSSE3-NEXT: movd %xmm2, %ecx ; SSSE3-NEXT: movw %cx, 9(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSSE3-NEXT: movd %xmm2, %edx ; SSSE3-NEXT: movw %dx, 6(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSSE3-NEXT: movd %xmm0, %esi ; SSSE3-NEXT: movw %si, 3(%rdi) ; SSSE3-NEXT: shrl $16, %eax ; SSSE3-NEXT: movb %al, 2(%rdi) @@ -873,6 +872,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movb %dl, 8(%rdi) ; SSSE3-NEXT: shrl $16, %esi ; SSSE3-NEXT: movb %sil, 5(%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: saddo_v4i24: diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index b275814cc8033f..0338e60ba5280a 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -433,63 +433,63 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE2-NEXT: movd %r9d, %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: pmuludq %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; 
SSE2-NEXT: pcmpgtd %xmm6, %xmm8 ; SSE2-NEXT: pand %xmm4, %xmm8 ; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: pand %xmm6, %xmm9 ; SSE2-NEXT: paddd %xmm8, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm4 +; SSE2-NEXT: pmuludq %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSE2-NEXT: psubd %xmm9, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: paddd %xmm8, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pmuludq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: psubd %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE2-NEXT: psubd %xmm3, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %xmm0, 16(%rcx) ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm0 ; SSE2-NEXT: movq %xmm0, 16(%rdi) ; SSE2-NEXT: movdqa %xmm4, (%rdi) ; SSE2-NEXT: retq @@ -508,63 +508,63 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSSE3-NEXT: movd %r9d, %xmm0 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] 
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: pmuludq %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm7 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 ; SSSE3-NEXT: pand %xmm4, %xmm8 ; SSSE3-NEXT: pxor %xmm9, %xmm9 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9 -; SSSE3-NEXT: pand %xmm5, %xmm9 +; SSSE3-NEXT: pand %xmm6, %xmm9 ; SSSE3-NEXT: paddd %xmm8, %xmm9 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm5, %xmm4 +; SSSE3-NEXT: pmuludq %xmm6, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSSE3-NEXT: psubd %xmm9, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; SSSE3-NEXT: movdqa %xmm4, (%rcx) ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 +; SSSE3-NEXT: pxor %xmm6, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm8 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pand %xmm7, %xmm6 -; SSSE3-NEXT: paddd %xmm8, %xmm6 +; SSSE3-NEXT: pand %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: paddd %xmm8, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSSE3-NEXT: pmuludq %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: psubd %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSSE3-NEXT: psubd %xmm3, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %xmm0, 16(%rcx) ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm6, %xmm0 ; SSSE3-NEXT: movq %xmm0, 16(%rdi) ; SSSE3-NEXT: movdqa %xmm4, (%rdi) ; SSSE3-NEXT: retq @@ -575,47 +575,47 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movd %esi, %xmm2 ; SSE41-NEXT: pinsrd $1, %edx, %xmm2 ; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: pinsrd $2, 
{{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmuldq %xmm2, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmuldq %xmm2, %xmm1 ; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: movd %r9d, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pmuldq %xmm3, %xmm4 -; SSE41-NEXT: pinsrd $1, %edx, %xmm3 -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %esi -; SSE41-NEXT: pinsrd $1, %esi, %xmm5 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm3 +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx +; SSE41-NEXT: pinsrd $1, %edx, %xmm5 ; SSE41-NEXT: pmulld %xmm3, %xmm5 -; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movd %edx, %xmm3 +; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE41-NEXT: movd %ecx, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE41-NEXT: movd %esi, %xmm6 +; SSE41-NEXT: movd %edx, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] ; SSE41-NEXT: pmuldq %xmm3, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] -; SSE41-NEXT: movq %xmm5, 16(%rcx) +; SSE41-NEXT: movq %xmm5, 16(%rsi) ; SSE41-NEXT: psrad $31, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] ; SSE41-NEXT: pmuldq %xmm4, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] -; SSE41-NEXT: pmulld %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, (%rcx) -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] +; SSE41-NEXT: pmulld %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, (%rsi) +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: movq %xmm5, 16(%rdi) -; SSE41-NEXT: movdqa %xmm1, (%rdi) +; SSE41-NEXT: movdqa %xmm0, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: smulo_v6i32: @@ -702,20 +702,20 @@ define <8 x i32> @smulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm2, %xmm6 ; SSE2-NEXT: paddd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE2-NEXT: psubd %xmm6, %xmm5 +; SSE2-NEXT: pmuludq %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: psubd %xmm6, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} 
xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm5, %xmm5 @@ -751,20 +751,20 @@ define <8 x i32> @smulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 ; SSSE3-NEXT: pand %xmm2, %xmm6 ; SSSE3-NEXT: paddd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSSE3-NEXT: psubd %xmm6, %xmm5 +; SSSE3-NEXT: pmuludq %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSSE3-NEXT: psubd %xmm6, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm5, %xmm5 @@ -899,20 +899,20 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 ; SSE2-NEXT: pand %xmm4, %xmm10 ; SSE2-NEXT: paddd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; SSE2-NEXT: psubd %xmm10, %xmm9 +; SSE2-NEXT: pmuludq %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSE2-NEXT: psubd %xmm10, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm9, %xmm9 @@ -992,20 +992,20 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm10 ; SSSE3-NEXT: pand %xmm4, %xmm10 ; SSSE3-NEXT: paddd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = 
xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; SSSE3-NEXT: psubd %xmm10, %xmm9 +; SSSE3-NEXT: pmuludq %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSSE3-NEXT: psubd %xmm10, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 ; SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSSE3-NEXT: pxor %xmm9, %xmm9 @@ -1251,24 +1251,24 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: pmulhw %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: pmulhw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: pmulhw %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: pmulhw %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm3, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm6 +; SSE2-NEXT: packuswb %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm2 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -1288,7 +1288,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa %xmm6, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: smulo_v16i8: @@ -1296,24 +1296,24 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSSE3-NEXT: pmulhw %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSSE3-NEXT: pmulhw %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSSE3-NEXT: pmulhw %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSSE3-NEXT: pmulhw %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 ; SSSE3-NEXT: packuswb %xmm3, %xmm0 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: pand %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: packuswb %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtb %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm6 +; SSSE3-NEXT: packuswb %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtb %xmm6, %xmm2 ; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 @@ -1333,7 +1333,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: movdqa %xmm6, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: smulo_v16i8: @@ -1341,24 +1341,24 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE41-NEXT: pmulhw %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE41-NEXT: pmulhw %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE41-NEXT: pmulhw %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE41-NEXT: pmulhw %xmm5, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm1, %xmm5 ; SSE41-NEXT: pand %xmm1, %xmm4 -; SSE41-NEXT: packuswb %xmm5, %xmm4 -; SSE41-NEXT: pcmpgtb %xmm4, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm6 +; SSE41-NEXT: packuswb %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtb %xmm6, %xmm2 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -1375,7 +1375,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: movdqa %xmm6, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: smulo_v16i8: @@ -1744,59 +1744,59 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind { ; ; AVX1-LABEL: smulo_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX1-NEXT: vpmulhw %xmm4, %xmm6, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm6 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm5 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = 
xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-NEXT: vpmulhw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm5 ; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-NEXT: vpmulhw %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm8 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX1-NEXT: vpmulhw %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm8 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm7, %xmm0, %xmm6 -; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm6, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 +; AVX1-NEXT: vpmovsxbd %xmm7, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; 
AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm6, (%rdi) ; AVX1-NEXT: retq @@ -1889,173 +1889,173 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] -; SSE2-NEXT: pmulhw %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm8 -; SSE2-NEXT: psrlw $8, %xmm8 ; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] +; SSE2-NEXT: pmulhw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: psrlw $8, %xmm8 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSE2-NEXT: pmulhw %xmm9, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: psrlw $8, %xmm11 -; SSE2-NEXT: packuswb %xmm8, %xmm11 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: packuswb %xmm10, %xmm7 +; SSE2-NEXT: pmulhw %xmm10, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: psrlw $8, %xmm10 +; SSE2-NEXT: packuswb %xmm8, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: packuswb %xmm9, %xmm7 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm7, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm11, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; SSE2-NEXT: pcmpeqb %xmm10, %xmm3 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = 
xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSE2-NEXT: pmulhw %xmm8, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE2-NEXT: pmulhw %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm10, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pmulhw %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: psrlw $8, %xmm8 -; SSE2-NEXT: packuswb %xmm2, %xmm8 -; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pmulhw %xmm9, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: psrlw $8, %xmm9 +; SSE2-NEXT: packuswb %xmm2, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm10 +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: packuswb %xmm10, %xmm6 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm8, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE2-NEXT: pcmpeqb %xmm9, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; SSE2-NEXT: pmulhw %xmm8, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: pmulhw %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmulhw %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: psrlw $8, %xmm8 -; SSE2-NEXT: packuswb %xmm1, %xmm8 -; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: packuswb %xmm10, %xmm5 +; SSE2-NEXT: pmulhw %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: psrlw $8, %xmm11 +; SSE2-NEXT: packuswb %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm8, %xmm10 +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: packuswb %xmm10, %xmm9 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; 
SSE2-NEXT: pcmpgtb %xmm9, %xmm5 +; SSE2-NEXT: pcmpeqb %xmm11, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm5, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSE2-NEXT: pmulhw %xmm8, %xmm10 +; SSE2-NEXT: pmulhw %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm10, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pmulhw %xmm11, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm11 -; SSE2-NEXT: psrlw $8, %xmm11 -; SSE2-NEXT: packuswb %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: pand %xmm9, %xmm8 -; SSE2-NEXT: packuswb %xmm10, %xmm8 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm8, %xmm4 -; SSE2-NEXT: pcmpeqb %xmm11, %xmm4 +; SSE2-NEXT: pmulhw %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm10 +; SSE2-NEXT: pand %xmm8, %xmm11 +; SSE2-NEXT: packuswb %xmm10, %xmm11 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpgtb %xmm11, %xmm8 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm8 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm7, 48(%rsi) -; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm6, 32(%rsi) -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: movdqa %xmm5, 16(%rsi) -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm8, (%rsi) -; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm9, 16(%rsi) +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm11, (%rsi) +; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm3, 192(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; 
SSE2-NEXT: movdqa %xmm2, 128(%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm1, 64(%rdi) -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm4, (%rdi) -; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, 224(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 240(%rdi) -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm5, 64(%rdi) +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm7, 224(%rdi) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa %xmm5, 240(%rdi) +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: movdqa %xmm6, 208(%rdi) +; SSE2-NEXT: movdqa %xmm9, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: movdqa %xmm9, 160(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm5, 208(%rdi) +; SSE2-NEXT: movdqa %xmm5, 176(%rdi) +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 144(%rdi) ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 160(%rdi) +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 176(%rdi) +; SSE2-NEXT: movdqa %xmm4, 112(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = 
xmm6[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: movdqa %xmm6, 144(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movdqa %xmm1, 80(%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 96(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: movdqa %xmm7, 80(%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 48(%rdi) +; SSE2-NEXT: movdqa %xmm1, 48(%rdi) ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 @@ -2068,173 +2068,173 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: pxor %xmm8, %xmm8 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm10 -; SSSE3-NEXT: movdqa %xmm10, %xmm8 -; SSSE3-NEXT: psrlw $8, %xmm8 ; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] +; SSSE3-NEXT: pmulhw %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm9, %xmm8 +; SSSE3-NEXT: psrlw $8, %xmm8 +; SSSE3-NEXT: pxor %xmm10, %xmm10 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] ; SSSE3-NEXT: pxor %xmm7, %xmm7 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSSE3-NEXT: pmulhw %xmm9, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm11 -; SSSE3-NEXT: psrlw $8, %xmm11 -; SSSE3-NEXT: packuswb %xmm8, %xmm11 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: pand %xmm9, 
%xmm10 -; SSSE3-NEXT: pand %xmm9, %xmm7 -; SSSE3-NEXT: packuswb %xmm10, %xmm7 +; SSSE3-NEXT: pmulhw %xmm10, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: psrlw $8, %xmm10 +; SSSE3-NEXT: packuswb %xmm8, %xmm10 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: packuswb %xmm9, %xmm7 ; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pcmpgtb %xmm7, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm11, %xmm3 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm3 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSSE3-NEXT: pmulhw %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; SSSE3-NEXT: movdqa %xmm10, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pmulhw %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm8 -; SSSE3-NEXT: psrlw $8, %xmm8 -; SSSE3-NEXT: packuswb %xmm2, %xmm8 -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pmulhw %xmm9, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: psrlw $8, %xmm9 +; SSSE3-NEXT: packuswb %xmm2, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm10 +; SSSE3-NEXT: pand %xmm8, %xmm6 ; SSSE3-NEXT: packuswb %xmm10, %xmm6 ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pcmpgtb %xmm6, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm8, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSSE3-NEXT: pmulhw %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm11, %xmm11 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] ; SSSE3-NEXT: movdqa %xmm10, %xmm1 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: pmulhw %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm8 -; SSSE3-NEXT: psrlw $8, %xmm8 -; SSSE3-NEXT: packuswb %xmm1, %xmm8 -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pand %xmm9, %xmm5 -; SSSE3-NEXT: packuswb %xmm10, %xmm5 +; SSSE3-NEXT: pmulhw %xmm11, %xmm9 +; SSSE3-NEXT: movdqa %xmm9, %xmm11 +; SSSE3-NEXT: psrlw $8, %xmm11 +; SSSE3-NEXT: packuswb %xmm1, %xmm11 +; SSSE3-NEXT: pand %xmm8, %xmm10 +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: packuswb %xmm10, %xmm9 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtb %xmm9, %xmm5 +; SSSE3-NEXT: pcmpeqb %xmm11, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtb %xmm5, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm8, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm10 +; SSSE3-NEXT: pmulhw %xmm1, %xmm10 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] ; SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pmulhw %xmm11, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, %xmm11 -; SSSE3-NEXT: psrlw $8, %xmm11 -; SSSE3-NEXT: packuswb %xmm0, %xmm11 -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pand %xmm9, %xmm8 -; SSSE3-NEXT: packuswb %xmm10, %xmm8 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtb %xmm8, %xmm4 -; SSSE3-NEXT: pcmpeqb %xmm11, %xmm4 +; SSSE3-NEXT: pmulhw %xmm1, %xmm11 +; SSSE3-NEXT: movdqa %xmm11, %xmm1 +; SSSE3-NEXT: psrlw $8, %xmm1 +; SSSE3-NEXT: packuswb %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm8, %xmm10 +; SSSE3-NEXT: pand %xmm8, %xmm11 +; SSSE3-NEXT: packuswb %xmm10, %xmm11 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: pcmpgtb 
%xmm11, %xmm8 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm8 ; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm0 ; SSSE3-NEXT: movdqa %xmm7, 48(%rsi) -; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm6, 32(%rsi) -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm8, (%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm9, 16(%rsi) +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, (%rsi) +; SSSE3-NEXT: movdqa %xmm3, %xmm7 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm3, 192(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm2, %xmm9 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm2, 128(%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm4, (%rdi) -; SSSE3-NEXT: movdqa %xmm8, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm8 -; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, 224(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm5, 64(%rdi) +; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm8, (%rdi) +; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm7 +; SSSE3-NEXT: psrad $31, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, 224(%rdi) +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, 240(%rdi) +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, 208(%rdi) +; SSSE3-NEXT: movdqa %xmm9, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = 
xmm9[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm9 +; SSSE3-NEXT: psrad $31, %xmm9 +; SSSE3-NEXT: movdqa %xmm9, 160(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 208(%rdi) +; SSSE3-NEXT: movdqa %xmm5, 176(%rdi) +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 144(%rdi) ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 160(%rdi) +; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 176(%rdi) +; SSSE3-NEXT: movdqa %xmm4, 112(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, 144(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 96(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 112(%rdi) -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm7 -; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, 32(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 @@ -2412,78 +2412,78 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-LABEL: smulo_v64i8: ; AVX1: # %bb.0: ; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX1-NEXT: vextractf128 
$1, %ymm3, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm8 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm7 -; AVX1-NEXT: vpackuswb %xmm8, %xmm7, %xmm8 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpeqb %xmm6, %xmm8, %xmm8 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX1-NEXT: vpmulhw %xmm6, %xmm9, %xmm6 -; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm9 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm6 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-NEXT: vpmulhw %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm7 +; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpackuswb %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtb %xmm5, %xmm4, %xmm8 +; AVX1-NEXT: vpcmpeqb %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX1-NEXT: vpmulhw %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm9 +; AVX1-NEXT: vpunpcklbw 
{{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm3 ; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm8 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] ; AVX1-NEXT: vpmulhw %xmm9, %xmm11, %xmm9 ; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm11 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX1-NEXT: vpmulhw %xmm6, %xmm10, %xmm6 -; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm10 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX1-NEXT: vpmulhw %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm10 ; AVX1-NEXT: vpackuswb %xmm11, %xmm10, %xmm10 -; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm9 -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm9, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpgtb %xmm6, %xmm5, %xmm9 +; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9 +; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpackuswb %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpcmpgtb %xmm8, %xmm4, %xmm9 ; AVX1-NEXT: vpcmpeqb %xmm10, %xmm9, %xmm9 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = 
xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; AVX1-NEXT: vpmulhw %xmm10, %xmm11, %xmm10 ; AVX1-NEXT: vpsrlw $8, %xmm10, %xmm11 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] ; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 ; AVX1-NEXT: vpackuswb %xmm11, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm10 -; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm10, %xmm0, %xmm7 -; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm6, %xmm10, %xmm10 +; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm10, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm6, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm10, %xmm10, %xmm10 -; AVX1-NEXT: vpxor %xmm10, %xmm8, %xmm5 -; AVX1-NEXT: vpxor %xmm3, %xmm10, %xmm3 -; AVX1-NEXT: vpxor %xmm10, %xmm9, %xmm2 -; AVX1-NEXT: vpxor %xmm0, %xmm10, %xmm0 -; AVX1-NEXT: vmovdqa %xmm4, 48(%rsi) +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm9, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm5, 48(%rsi) ; AVX1-NEXT: vmovdqa %xmm1, 32(%rsi) -; AVX1-NEXT: vmovdqa %xmm6, 16(%rsi) -; AVX1-NEXT: vmovdqa %xmm7, (%rsi) -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm1 +; AVX1-NEXT: vmovdqa %xmm8, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm6, (%rsi) +; AVX1-NEXT: vpmovsxbd %xmm7, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 192(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 128(%rdi) @@ -2491,13 +2491,13 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vmovdqa %xmm1, 64(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 240(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 208(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] @@ -2955,44 +2955,44 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-NEXT: psrad $8, %xmm0 ; SSE2-NEXT: pslld $8, %xmm1 ; SSE2-NEXT: psrad $8, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand 
%xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE2-NEXT: psubd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pslld $8, %xmm1 ; SSE2-NEXT: psrad $8, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: movw %cx, 6(%rdi) -; SSE2-NEXT: movd %xmm2, %edx +; SSE2-NEXT: movd %xmm5, %edx ; SSE2-NEXT: movw %dx, 3(%rdi) ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: movb %al, 2(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, 9(%rdi) ; SSE2-NEXT: shrl $16, %ecx @@ -3010,44 +3010,44 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: psrad $8, %xmm0 ; SSSE3-NEXT: pslld $8, %xmm1 ; SSSE3-NEXT: psrad $8, %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: paddd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: paddd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSSE3-NEXT: psubd %xmm3, 
%xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-NEXT: psubd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pslld $8, %xmm1 ; SSSE3-NEXT: psrad $8, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSSE3-NEXT: movd %xmm0, %ecx ; SSSE3-NEXT: movw %cx, 6(%rdi) -; SSSE3-NEXT: movd %xmm2, %edx +; SSSE3-NEXT: movd %xmm5, %edx ; SSSE3-NEXT: movw %dx, 3(%rdi) ; SSSE3-NEXT: shrl $16, %eax ; SSSE3-NEXT: movb %al, 2(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, 9(%rdi) ; SSSE3-NEXT: shrl $16, %ecx @@ -3300,101 +3300,101 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %r8, %r15 +; SSE2-NEXT: movq %r8, %rbx ; SSE2-NEXT: movq %rdx, %r8 -; SSE2-NEXT: movq %rsi, %r11 -; SSE2-NEXT: movq %rdi, %r10 +; SSE2-NEXT: movq %rsi, %r10 +; SSE2-NEXT: movq %rdi, %r11 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SSE2-NEXT: movq %rsi, %rdx ; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movq %r9, %rbx -; SSE2-NEXT: imulq %rdx, %rbx -; SSE2-NEXT: movq %r15, %rax +; SSE2-NEXT: movq %r9, %r15 +; SSE2-NEXT: imulq %rdx, %r15 +; SSE2-NEXT: movq %rbx, %rax ; SSE2-NEXT: mulq %rdx ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: movq %rax, %r12 +; SSE2-NEXT: movq %rax, %r14 ; SSE2-NEXT: addq %rax, %rsi -; SSE2-NEXT: addq %rbx, %rsi +; SSE2-NEXT: addq %r15, %rsi ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: sarq $63, %rax ; SSE2-NEXT: movq %rax, %r13 -; SSE2-NEXT: imulq %r11, %r13 -; SSE2-NEXT: mulq %r10 -; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: movq %rdx, %rbx -; SSE2-NEXT: addq %r13, %rbx -; SSE2-NEXT: addq %rax, %rbx -; SSE2-NEXT: addq %r12, %r14 -; SSE2-NEXT: adcq %rsi, %rbx -; SSE2-NEXT: movq %r10, %rax -; SSE2-NEXT: mulq %r15 +; SSE2-NEXT: imulq %r10, %r13 +; SSE2-NEXT: mulq %r11 +; SSE2-NEXT: movq %rax, %r15 ; SSE2-NEXT: movq %rdx, %r12 -; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: addq %r13, %r12 +; SSE2-NEXT: addq %rax, %r12 +; SSE2-NEXT: addq %r14, %r15 +; SSE2-NEXT: adcq %rsi, %r12 ; SSE2-NEXT: movq %r11, %rax -; SSE2-NEXT: mulq %r15 -; SSE2-NEXT: movq %rdx, %r15 -; SSE2-NEXT: movq %rax, %r13 -; SSE2-NEXT: addq %r12, %r13 -; SSE2-NEXT: adcq $0, %r15 +; SSE2-NEXT: mulq %rbx +; SSE2-NEXT: movq 
%rdx, %r14 +; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: movq %r10, %rax +; SSE2-NEXT: mulq %rbx +; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: movq %rax, %r13 +; SSE2-NEXT: addq %r14, %r13 +; SSE2-NEXT: adcq $0, %rbx +; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r9 -; SSE2-NEXT: movq %rdx, %r12 -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: addq %r13, %r10 -; SSE2-NEXT: adcq %r15, %r12 +; SSE2-NEXT: movq %rdx, %r11 +; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: addq %r13, %r14 +; SSE2-NEXT: adcq %rbx, %r11 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %r15d -; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: movzbl %al, %ebx +; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r9 -; SSE2-NEXT: addq %r12, %rax -; SSE2-NEXT: adcq %r15, %rdx -; SSE2-NEXT: addq %r14, %rax +; SSE2-NEXT: addq %r11, %rax ; SSE2-NEXT: adcq %rbx, %rdx +; SSE2-NEXT: addq %r15, %rax +; SSE2-NEXT: adcq %r12, %rdx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE2-NEXT: movq %r10, 8(%r12) -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: xorq %r10, %rdx -; SSE2-NEXT: xorq %rax, %r10 +; SSE2-NEXT: movq %r14, 8(%r12) +; SSE2-NEXT: sarq $63, %r14 +; SSE2-NEXT: xorq %r14, %rdx +; SSE2-NEXT: xorq %rax, %r14 ; SSE2-NEXT: xorl %r15d, %r15d -; SSE2-NEXT: orq %rdx, %r10 +; SSE2-NEXT: orq %rdx, %r14 ; SSE2-NEXT: setne %r15b ; SSE2-NEXT: movq %rcx, %rdx ; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movq %rbp, %r10 -; SSE2-NEXT: imulq %rdx, %r10 +; SSE2-NEXT: movq %rbp, %r11 +; SSE2-NEXT: imulq %rdx, %r11 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %rdx ; SSE2-NEXT: movq %rdx, %r9 -; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: movq %rax, %r10 ; SSE2-NEXT: addq %rax, %r9 -; SSE2-NEXT: addq %r10, %r9 +; SSE2-NEXT: addq %r11, %r9 ; SSE2-NEXT: movq %rbp, %rax ; SSE2-NEXT: sarq $63, %rax ; SSE2-NEXT: movq %rax, %r14 ; SSE2-NEXT: imulq %rcx, %r14 ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: movq %rdx, %r10 -; SSE2-NEXT: addq %r14, %r10 -; SSE2-NEXT: addq %rax, %r10 -; SSE2-NEXT: addq %rbx, %r11 -; SSE2-NEXT: adcq %r9, %r10 +; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: addq %r14, %rbx +; SSE2-NEXT: addq %rax, %rbx +; SSE2-NEXT: addq %r10, %r11 +; SSE2-NEXT: adcq %r9, %rbx ; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: mulq %rdi -; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: movq %rdx, %r10 ; SSE2-NEXT: movq %rax, %r9 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rdi ; SSE2-NEXT: movq %rdx, %rdi ; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: addq %rbx, %r14 +; SSE2-NEXT: addq %r10, %r14 ; SSE2-NEXT: adcq $0, %rdi ; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: mulq %rbp ; SSE2-NEXT: movq %rdx, %r8 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: addq %r14, %rbx +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: addq %r14, %r10 ; SSE2-NEXT: adcq %rdi, %r8 ; SSE2-NEXT: setb %al ; SSE2-NEXT: movzbl %al, %edi @@ -3403,13 +3403,13 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: addq %r8, %rax ; SSE2-NEXT: adcq %rdi, %rdx ; SSE2-NEXT: addq %r11, %rax -; SSE2-NEXT: adcq %r10, %rdx -; SSE2-NEXT: movq %rbx, 24(%r12) -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: xorq %rbx, %rdx -; SSE2-NEXT: xorq %rax, %rbx +; SSE2-NEXT: adcq %rbx, %rdx +; SSE2-NEXT: movq %r10, 24(%r12) +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: xorq %r10, %rdx +; SSE2-NEXT: xorq %rax, %r10 ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdx, %rbx +; SSE2-NEXT: orq %rdx, %r10 ; SSE2-NEXT: setne %al ; SSE2-NEXT: negl %eax ; SSE2-NEXT: movd %eax, %xmm1 @@ -3434,101 +3434,101 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: pushq 
%r13 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq %r8, %r15 +; SSSE3-NEXT: movq %r8, %rbx ; SSSE3-NEXT: movq %rdx, %r8 -; SSSE3-NEXT: movq %rsi, %r11 -; SSSE3-NEXT: movq %rdi, %r10 +; SSSE3-NEXT: movq %rsi, %r10 +; SSSE3-NEXT: movq %rdi, %r11 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SSSE3-NEXT: movq %rsi, %rdx ; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movq %r9, %rbx -; SSSE3-NEXT: imulq %rdx, %rbx -; SSSE3-NEXT: movq %r15, %rax +; SSSE3-NEXT: movq %r9, %r15 +; SSSE3-NEXT: imulq %rdx, %r15 +; SSSE3-NEXT: movq %rbx, %rax ; SSSE3-NEXT: mulq %rdx ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: movq %rax, %r12 +; SSSE3-NEXT: movq %rax, %r14 ; SSSE3-NEXT: addq %rax, %rsi -; SSSE3-NEXT: addq %rbx, %rsi +; SSSE3-NEXT: addq %r15, %rsi ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: sarq $63, %rax ; SSSE3-NEXT: movq %rax, %r13 -; SSSE3-NEXT: imulq %r11, %r13 -; SSSE3-NEXT: mulq %r10 -; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: movq %rdx, %rbx -; SSSE3-NEXT: addq %r13, %rbx -; SSSE3-NEXT: addq %rax, %rbx -; SSSE3-NEXT: addq %r12, %r14 -; SSSE3-NEXT: adcq %rsi, %rbx -; SSSE3-NEXT: movq %r10, %rax -; SSSE3-NEXT: mulq %r15 +; SSSE3-NEXT: imulq %r10, %r13 +; SSSE3-NEXT: mulq %r11 +; SSSE3-NEXT: movq %rax, %r15 ; SSSE3-NEXT: movq %rdx, %r12 -; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: addq %r13, %r12 +; SSSE3-NEXT: addq %rax, %r12 +; SSSE3-NEXT: addq %r14, %r15 +; SSSE3-NEXT: adcq %rsi, %r12 ; SSSE3-NEXT: movq %r11, %rax -; SSSE3-NEXT: mulq %r15 -; SSSE3-NEXT: movq %rdx, %r15 -; SSSE3-NEXT: movq %rax, %r13 -; SSSE3-NEXT: addq %r12, %r13 -; SSSE3-NEXT: adcq $0, %r15 +; SSSE3-NEXT: mulq %rbx +; SSSE3-NEXT: movq %rdx, %r14 +; SSSE3-NEXT: movq %rax, %rsi ; SSSE3-NEXT: movq %r10, %rax +; SSSE3-NEXT: mulq %rbx +; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: movq %rax, %r13 +; SSSE3-NEXT: addq %r14, %r13 +; SSSE3-NEXT: adcq $0, %rbx +; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r9 -; SSSE3-NEXT: movq %rdx, %r12 -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: addq %r13, %r10 -; SSSE3-NEXT: adcq %r15, %r12 +; SSSE3-NEXT: movq %rdx, %r11 +; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: addq %r13, %r14 +; SSSE3-NEXT: adcq %rbx, %r11 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %r15d -; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: movzbl %al, %ebx +; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r9 -; SSSE3-NEXT: addq %r12, %rax -; SSSE3-NEXT: adcq %r15, %rdx -; SSSE3-NEXT: addq %r14, %rax +; SSSE3-NEXT: addq %r11, %rax ; SSSE3-NEXT: adcq %rbx, %rdx +; SSSE3-NEXT: addq %r15, %rax +; SSSE3-NEXT: adcq %r12, %rdx ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSSE3-NEXT: movq %r10, 8(%r12) -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: xorq %r10, %rdx -; SSSE3-NEXT: xorq %rax, %r10 +; SSSE3-NEXT: movq %r14, 8(%r12) +; SSSE3-NEXT: sarq $63, %r14 +; SSSE3-NEXT: xorq %r14, %rdx +; SSSE3-NEXT: xorq %rax, %r14 ; SSSE3-NEXT: xorl %r15d, %r15d -; SSSE3-NEXT: orq %rdx, %r10 +; SSSE3-NEXT: orq %rdx, %r14 ; SSSE3-NEXT: setne %r15b ; SSSE3-NEXT: movq %rcx, %rdx ; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movq %rbp, %r10 -; SSSE3-NEXT: imulq %rdx, %r10 +; SSSE3-NEXT: movq %rbp, %r11 +; SSSE3-NEXT: imulq %rdx, %r11 ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %rdx ; SSSE3-NEXT: movq %rdx, %r9 -; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: movq %rax, %r10 ; SSSE3-NEXT: addq %rax, %r9 -; SSSE3-NEXT: addq %r10, %r9 +; SSSE3-NEXT: addq %r11, %r9 ; SSSE3-NEXT: movq %rbp, %rax ; SSSE3-NEXT: sarq $63, %rax ; SSSE3-NEXT: movq %rax, %r14 ; SSSE3-NEXT: imulq %rcx, %r14 ; 
SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: movq %rdx, %r10 -; SSSE3-NEXT: addq %r14, %r10 -; SSSE3-NEXT: addq %rax, %r10 -; SSSE3-NEXT: addq %rbx, %r11 -; SSSE3-NEXT: adcq %r9, %r10 +; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: addq %r14, %rbx +; SSSE3-NEXT: addq %rax, %rbx +; SSSE3-NEXT: addq %r10, %r11 +; SSSE3-NEXT: adcq %r9, %rbx ; SSSE3-NEXT: movq %r8, %rax ; SSSE3-NEXT: mulq %rdi -; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: movq %rdx, %r10 ; SSSE3-NEXT: movq %rax, %r9 ; SSSE3-NEXT: movq %rcx, %rax ; SSSE3-NEXT: mulq %rdi ; SSSE3-NEXT: movq %rdx, %rdi ; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: addq %rbx, %r14 +; SSSE3-NEXT: addq %r10, %r14 ; SSSE3-NEXT: adcq $0, %rdi ; SSSE3-NEXT: movq %r8, %rax ; SSSE3-NEXT: mulq %rbp ; SSSE3-NEXT: movq %rdx, %r8 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: addq %r14, %rbx +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: addq %r14, %r10 ; SSSE3-NEXT: adcq %rdi, %r8 ; SSSE3-NEXT: setb %al ; SSSE3-NEXT: movzbl %al, %edi @@ -3537,13 +3537,13 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: addq %r8, %rax ; SSSE3-NEXT: adcq %rdi, %rdx ; SSSE3-NEXT: addq %r11, %rax -; SSSE3-NEXT: adcq %r10, %rdx -; SSSE3-NEXT: movq %rbx, 24(%r12) -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: xorq %rbx, %rdx -; SSSE3-NEXT: xorq %rax, %rbx +; SSSE3-NEXT: adcq %rbx, %rdx +; SSSE3-NEXT: movq %r10, 24(%r12) +; SSSE3-NEXT: sarq $63, %r10 +; SSSE3-NEXT: xorq %r10, %rdx +; SSSE3-NEXT: xorq %rax, %r10 ; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: orq %rdx, %rbx +; SSSE3-NEXT: orq %rdx, %r10 ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: negl %eax ; SSSE3-NEXT: movd %eax, %xmm1 @@ -3568,101 +3568,101 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %r8, %r15 +; SSE41-NEXT: movq %r8, %rbx ; SSE41-NEXT: movq %rdx, %r8 -; SSE41-NEXT: movq %rsi, %r11 -; SSE41-NEXT: movq %rdi, %r10 +; SSE41-NEXT: movq %rsi, %r10 +; SSE41-NEXT: movq %rdi, %r11 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SSE41-NEXT: movq %rsi, %rdx ; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movq %r9, %rbx -; SSE41-NEXT: imulq %rdx, %rbx -; SSE41-NEXT: movq %r15, %rax +; SSE41-NEXT: movq %r9, %r15 +; SSE41-NEXT: imulq %rdx, %r15 +; SSE41-NEXT: movq %rbx, %rax ; SSE41-NEXT: mulq %rdx ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: movq %rax, %r12 +; SSE41-NEXT: movq %rax, %r14 ; SSE41-NEXT: addq %rax, %rsi -; SSE41-NEXT: addq %rbx, %rsi +; SSE41-NEXT: addq %r15, %rsi ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: sarq $63, %rax ; SSE41-NEXT: movq %rax, %r13 -; SSE41-NEXT: imulq %r11, %r13 -; SSE41-NEXT: mulq %r10 -; SSE41-NEXT: movq %rax, %r14 -; SSE41-NEXT: movq %rdx, %rbx -; SSE41-NEXT: addq %r13, %rbx -; SSE41-NEXT: addq %rax, %rbx -; SSE41-NEXT: addq %r12, %r14 -; SSE41-NEXT: adcq %rsi, %rbx -; SSE41-NEXT: movq %r10, %rax -; SSE41-NEXT: mulq %r15 +; SSE41-NEXT: imulq %r10, %r13 +; SSE41-NEXT: mulq %r11 +; SSE41-NEXT: movq %rax, %r15 ; SSE41-NEXT: movq %rdx, %r12 -; SSE41-NEXT: movq %rax, %rsi +; SSE41-NEXT: addq %r13, %r12 +; SSE41-NEXT: addq %rax, %r12 +; SSE41-NEXT: addq %r14, %r15 +; SSE41-NEXT: adcq %rsi, %r12 ; SSE41-NEXT: movq %r11, %rax -; SSE41-NEXT: mulq %r15 -; SSE41-NEXT: movq %rdx, %r15 -; SSE41-NEXT: movq %rax, %r13 -; SSE41-NEXT: addq %r12, %r13 -; SSE41-NEXT: adcq $0, %r15 +; SSE41-NEXT: mulq %rbx +; SSE41-NEXT: movq %rdx, %r14 +; SSE41-NEXT: movq %rax, %rsi ; SSE41-NEXT: movq 
%r10, %rax +; SSE41-NEXT: mulq %rbx +; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: movq %rax, %r13 +; SSE41-NEXT: addq %r14, %r13 +; SSE41-NEXT: adcq $0, %rbx +; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r9 -; SSE41-NEXT: movq %rdx, %r12 -; SSE41-NEXT: movq %rax, %r10 -; SSE41-NEXT: addq %r13, %r10 -; SSE41-NEXT: adcq %r15, %r12 +; SSE41-NEXT: movq %rdx, %r11 +; SSE41-NEXT: movq %rax, %r14 +; SSE41-NEXT: addq %r13, %r14 +; SSE41-NEXT: adcq %rbx, %r11 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %r15d -; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: movzbl %al, %ebx +; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r9 -; SSE41-NEXT: addq %r12, %rax -; SSE41-NEXT: adcq %r15, %rdx -; SSE41-NEXT: addq %r14, %rax +; SSE41-NEXT: addq %r11, %rax ; SSE41-NEXT: adcq %rbx, %rdx +; SSE41-NEXT: addq %r15, %rax +; SSE41-NEXT: adcq %r12, %rdx ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE41-NEXT: movq %r10, 8(%r12) -; SSE41-NEXT: sarq $63, %r10 -; SSE41-NEXT: xorq %r10, %rdx -; SSE41-NEXT: xorq %rax, %r10 +; SSE41-NEXT: movq %r14, 8(%r12) +; SSE41-NEXT: sarq $63, %r14 +; SSE41-NEXT: xorq %r14, %rdx +; SSE41-NEXT: xorq %rax, %r14 ; SSE41-NEXT: xorl %r15d, %r15d -; SSE41-NEXT: orq %rdx, %r10 +; SSE41-NEXT: orq %rdx, %r14 ; SSE41-NEXT: setne %r15b ; SSE41-NEXT: movq %rcx, %rdx ; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movq %rbp, %r10 -; SSE41-NEXT: imulq %rdx, %r10 +; SSE41-NEXT: movq %rbp, %r11 +; SSE41-NEXT: imulq %rdx, %r11 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %rdx ; SSE41-NEXT: movq %rdx, %r9 -; SSE41-NEXT: movq %rax, %rbx +; SSE41-NEXT: movq %rax, %r10 ; SSE41-NEXT: addq %rax, %r9 -; SSE41-NEXT: addq %r10, %r9 +; SSE41-NEXT: addq %r11, %r9 ; SSE41-NEXT: movq %rbp, %rax ; SSE41-NEXT: sarq $63, %rax ; SSE41-NEXT: movq %rax, %r14 ; SSE41-NEXT: imulq %rcx, %r14 ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %r11 -; SSE41-NEXT: movq %rdx, %r10 -; SSE41-NEXT: addq %r14, %r10 -; SSE41-NEXT: addq %rax, %r10 -; SSE41-NEXT: addq %rbx, %r11 -; SSE41-NEXT: adcq %r9, %r10 +; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: addq %r14, %rbx +; SSE41-NEXT: addq %rax, %rbx +; SSE41-NEXT: addq %r10, %r11 +; SSE41-NEXT: adcq %r9, %rbx ; SSE41-NEXT: movq %r8, %rax ; SSE41-NEXT: mulq %rdi -; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: movq %rdx, %r10 ; SSE41-NEXT: movq %rax, %r9 ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %rdi ; SSE41-NEXT: movq %rdx, %rdi ; SSE41-NEXT: movq %rax, %r14 -; SSE41-NEXT: addq %rbx, %r14 +; SSE41-NEXT: addq %r10, %r14 ; SSE41-NEXT: adcq $0, %rdi ; SSE41-NEXT: movq %r8, %rax ; SSE41-NEXT: mulq %rbp ; SSE41-NEXT: movq %rdx, %r8 -; SSE41-NEXT: movq %rax, %rbx -; SSE41-NEXT: addq %r14, %rbx +; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: addq %r14, %r10 ; SSE41-NEXT: adcq %rdi, %r8 ; SSE41-NEXT: setb %al ; SSE41-NEXT: movzbl %al, %edi @@ -3671,13 +3671,13 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: addq %r8, %rax ; SSE41-NEXT: adcq %rdi, %rdx ; SSE41-NEXT: addq %r11, %rax -; SSE41-NEXT: adcq %r10, %rdx -; SSE41-NEXT: movq %rbx, 24(%r12) -; SSE41-NEXT: sarq $63, %rbx -; SSE41-NEXT: xorq %rbx, %rdx -; SSE41-NEXT: xorq %rax, %rbx +; SSE41-NEXT: adcq %rbx, %rdx +; SSE41-NEXT: movq %r10, 24(%r12) +; SSE41-NEXT: sarq $63, %r10 +; SSE41-NEXT: xorq %r10, %rdx +; SSE41-NEXT: xorq %rax, %r10 ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rdx, %rbx +; SSE41-NEXT: orq %rdx, %r10 ; SSE41-NEXT: setne %al ; SSE41-NEXT: negl %eax ; SSE41-NEXT: negl %r15d @@ -3701,101 +3701,101 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> 
%a1, ptr %p2) nounwind ; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %r8, %r15 +; AVX-NEXT: movq %r8, %rbx ; AVX-NEXT: movq %rdx, %r8 -; AVX-NEXT: movq %rsi, %r11 -; AVX-NEXT: movq %rdi, %r10 +; AVX-NEXT: movq %rsi, %r10 +; AVX-NEXT: movq %rdi, %r11 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; AVX-NEXT: movq %rsi, %rdx ; AVX-NEXT: sarq $63, %rdx -; AVX-NEXT: movq %r9, %rbx -; AVX-NEXT: imulq %rdx, %rbx -; AVX-NEXT: movq %r15, %rax +; AVX-NEXT: movq %r9, %r15 +; AVX-NEXT: imulq %rdx, %r15 +; AVX-NEXT: movq %rbx, %rax ; AVX-NEXT: mulq %rdx ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: movq %rax, %r12 +; AVX-NEXT: movq %rax, %r14 ; AVX-NEXT: addq %rax, %rsi -; AVX-NEXT: addq %rbx, %rsi +; AVX-NEXT: addq %r15, %rsi ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: sarq $63, %rax ; AVX-NEXT: movq %rax, %r13 -; AVX-NEXT: imulq %r11, %r13 -; AVX-NEXT: mulq %r10 -; AVX-NEXT: movq %rax, %r14 -; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: addq %r13, %rbx -; AVX-NEXT: addq %rax, %rbx -; AVX-NEXT: addq %r12, %r14 -; AVX-NEXT: adcq %rsi, %rbx -; AVX-NEXT: movq %r10, %rax -; AVX-NEXT: mulq %r15 +; AVX-NEXT: imulq %r10, %r13 +; AVX-NEXT: mulq %r11 +; AVX-NEXT: movq %rax, %r15 ; AVX-NEXT: movq %rdx, %r12 -; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: addq %r13, %r12 +; AVX-NEXT: addq %rax, %r12 +; AVX-NEXT: addq %r14, %r15 +; AVX-NEXT: adcq %rsi, %r12 ; AVX-NEXT: movq %r11, %rax -; AVX-NEXT: mulq %r15 -; AVX-NEXT: movq %rdx, %r15 -; AVX-NEXT: movq %rax, %r13 -; AVX-NEXT: addq %r12, %r13 -; AVX-NEXT: adcq $0, %r15 +; AVX-NEXT: mulq %rbx +; AVX-NEXT: movq %rdx, %r14 +; AVX-NEXT: movq %rax, %rsi ; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: mulq %rbx +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: movq %rax, %r13 +; AVX-NEXT: addq %r14, %r13 +; AVX-NEXT: adcq $0, %rbx +; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r9 -; AVX-NEXT: movq %rdx, %r12 -; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: addq %r13, %r10 -; AVX-NEXT: adcq %r15, %r12 +; AVX-NEXT: movq %rdx, %r11 +; AVX-NEXT: movq %rax, %r14 +; AVX-NEXT: addq %r13, %r14 +; AVX-NEXT: adcq %rbx, %r11 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %r15d -; AVX-NEXT: movq %r11, %rax +; AVX-NEXT: movzbl %al, %ebx +; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r9 -; AVX-NEXT: addq %r12, %rax -; AVX-NEXT: adcq %r15, %rdx -; AVX-NEXT: addq %r14, %rax +; AVX-NEXT: addq %r11, %rax ; AVX-NEXT: adcq %rbx, %rdx +; AVX-NEXT: addq %r15, %rax +; AVX-NEXT: adcq %r12, %rdx ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX-NEXT: movq %r10, 8(%r12) -; AVX-NEXT: sarq $63, %r10 -; AVX-NEXT: xorq %r10, %rdx -; AVX-NEXT: xorq %rax, %r10 +; AVX-NEXT: movq %r14, 8(%r12) +; AVX-NEXT: sarq $63, %r14 +; AVX-NEXT: xorq %r14, %rdx +; AVX-NEXT: xorq %rax, %r14 ; AVX-NEXT: xorl %r15d, %r15d -; AVX-NEXT: orq %rdx, %r10 +; AVX-NEXT: orq %rdx, %r14 ; AVX-NEXT: setne %r15b ; AVX-NEXT: movq %rcx, %rdx ; AVX-NEXT: sarq $63, %rdx -; AVX-NEXT: movq %rbp, %r10 -; AVX-NEXT: imulq %rdx, %r10 +; AVX-NEXT: movq %rbp, %r11 +; AVX-NEXT: imulq %rdx, %r11 ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %rdx ; AVX-NEXT: movq %rdx, %r9 -; AVX-NEXT: movq %rax, %rbx +; AVX-NEXT: movq %rax, %r10 ; AVX-NEXT: addq %rax, %r9 -; AVX-NEXT: addq %r10, %r9 +; AVX-NEXT: addq %r11, %r9 ; AVX-NEXT: movq %rbp, %rax ; AVX-NEXT: sarq $63, %rax ; AVX-NEXT: movq %rax, %r14 ; AVX-NEXT: imulq %rcx, %r14 ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %r11 -; AVX-NEXT: movq %rdx, %r10 -; AVX-NEXT: addq %r14, %r10 -; AVX-NEXT: addq %rax, %r10 -; AVX-NEXT: addq %rbx, %r11 -; AVX-NEXT: adcq %r9, %r10 +; 
AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: addq %r14, %rbx +; AVX-NEXT: addq %rax, %rbx +; AVX-NEXT: addq %r10, %r11 +; AVX-NEXT: adcq %r9, %rbx ; AVX-NEXT: movq %r8, %rax ; AVX-NEXT: mulq %rdi -; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: movq %rdx, %r10 ; AVX-NEXT: movq %rax, %r9 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rdi ; AVX-NEXT: movq %rdx, %rdi ; AVX-NEXT: movq %rax, %r14 -; AVX-NEXT: addq %rbx, %r14 +; AVX-NEXT: addq %r10, %r14 ; AVX-NEXT: adcq $0, %rdi ; AVX-NEXT: movq %r8, %rax ; AVX-NEXT: mulq %rbp ; AVX-NEXT: movq %rdx, %r8 -; AVX-NEXT: movq %rax, %rbx -; AVX-NEXT: addq %r14, %rbx +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: addq %r14, %r10 ; AVX-NEXT: adcq %rdi, %r8 ; AVX-NEXT: setb %al ; AVX-NEXT: movzbl %al, %edi @@ -3804,13 +3804,13 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: addq %r8, %rax ; AVX-NEXT: adcq %rdi, %rdx ; AVX-NEXT: addq %r11, %rax -; AVX-NEXT: adcq %r10, %rdx -; AVX-NEXT: movq %rbx, 24(%r12) -; AVX-NEXT: sarq $63, %rbx -; AVX-NEXT: xorq %rbx, %rdx -; AVX-NEXT: xorq %rax, %rbx +; AVX-NEXT: adcq %rbx, %rdx +; AVX-NEXT: movq %r10, 24(%r12) +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: xorq %r10, %rdx +; AVX-NEXT: xorq %rax, %r10 ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: orq %rdx, %rbx +; AVX-NEXT: orq %rdx, %r10 ; AVX-NEXT: setne %al ; AVX-NEXT: negl %eax ; AVX-NEXT: negl %r15d @@ -3834,114 +3834,113 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: movq %r9, %rbp -; AVX512F-NEXT: movq %rcx, %r11 -; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rsi, %r9 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512F-NEXT: movq %rcx, %r10 +; AVX512F-NEXT: movq %rdx, %r11 +; AVX512F-NEXT: movq %rdi, %rbp +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512F-NEXT: sarq $63, %rcx -; AVX512F-NEXT: movq %rsi, %rbx -; AVX512F-NEXT: imulq %rcx, %rbx -; AVX512F-NEXT: movq %r15, %rax +; AVX512F-NEXT: movq %rdi, %r15 +; AVX512F-NEXT: imulq %rcx, %r15 +; AVX512F-NEXT: movq %rbx, %rax ; AVX512F-NEXT: mulq %rcx ; AVX512F-NEXT: movq %rdx, %rcx -; AVX512F-NEXT: movq %rax, %r12 +; AVX512F-NEXT: movq %rax, %r14 ; AVX512F-NEXT: addq %rax, %rcx -; AVX512F-NEXT: addq %rbx, %rcx -; AVX512F-NEXT: movq %rsi, %rax +; AVX512F-NEXT: addq %r15, %rcx +; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: sarq $63, %rax ; AVX512F-NEXT: movq %rax, %r13 -; AVX512F-NEXT: imulq %r11, %r13 -; AVX512F-NEXT: mulq %r10 -; AVX512F-NEXT: movq %rax, %r14 -; AVX512F-NEXT: movq %rdx, %rbx -; AVX512F-NEXT: addq %r13, %rbx -; AVX512F-NEXT: addq %rax, %rbx -; AVX512F-NEXT: addq %r12, %r14 -; AVX512F-NEXT: adcq %rcx, %rbx -; AVX512F-NEXT: movq %r10, %rax -; AVX512F-NEXT: mulq %r15 +; AVX512F-NEXT: imulq %r10, %r13 +; AVX512F-NEXT: mulq %r11 +; AVX512F-NEXT: movq %rax, %r15 ; AVX512F-NEXT: movq %rdx, %r12 -; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: addq %r13, %r12 +; AVX512F-NEXT: addq %rax, %r12 +; AVX512F-NEXT: addq %r14, %r15 +; AVX512F-NEXT: adcq %rcx, %r12 ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %r15 -; AVX512F-NEXT: movq %rdx, %r15 -; AVX512F-NEXT: movq %rax, %r13 -; AVX512F-NEXT: addq %r12, %r13 -; AVX512F-NEXT: adcq $0, %r15 +; AVX512F-NEXT: mulq %rbx +; AVX512F-NEXT: movq %rdx, %r14 +; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: movq %r10, %rax -; AVX512F-NEXT: mulq %rsi -; AVX512F-NEXT: movq %rdx, %r12 -; AVX512F-NEXT: movq 
%rax, %r10 -; AVX512F-NEXT: addq %r13, %r10 -; AVX512F-NEXT: adcq %r15, %r12 -; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %r15d +; AVX512F-NEXT: mulq %rbx +; AVX512F-NEXT: movq %rdx, %rbx +; AVX512F-NEXT: movq %rax, %r13 +; AVX512F-NEXT: addq %r14, %r13 +; AVX512F-NEXT: adcq $0, %rbx ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %rsi -; AVX512F-NEXT: addq %r12, %rax -; AVX512F-NEXT: adcq %r15, %rdx -; AVX512F-NEXT: addq %r14, %rax +; AVX512F-NEXT: mulq %rdi +; AVX512F-NEXT: movq %rdx, %r11 +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: addq %r13, %r14 +; AVX512F-NEXT: adcq %rbx, %r11 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: movzbl %al, %ebx +; AVX512F-NEXT: movq %r10, %rax +; AVX512F-NEXT: mulq %rdi +; AVX512F-NEXT: addq %r11, %rax ; AVX512F-NEXT: adcq %rbx, %rdx +; AVX512F-NEXT: addq %r15, %rax +; AVX512F-NEXT: adcq %r12, %rdx ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512F-NEXT: movq %r10, 24(%r12) -; AVX512F-NEXT: sarq $63, %r10 -; AVX512F-NEXT: xorq %r10, %rdx -; AVX512F-NEXT: xorq %rax, %r10 -; AVX512F-NEXT: orq %rdx, %r10 +; AVX512F-NEXT: movq %r14, 24(%r12) +; AVX512F-NEXT: sarq $63, %r14 +; AVX512F-NEXT: xorq %r14, %rdx +; AVX512F-NEXT: xorq %rax, %r14 +; AVX512F-NEXT: orq %rdx, %r14 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: movq %r9, %rdx +; AVX512F-NEXT: movq %rsi, %rdx ; AVX512F-NEXT: sarq $63, %rdx -; AVX512F-NEXT: movq %rbp, %rsi -; AVX512F-NEXT: imulq %rdx, %rsi +; AVX512F-NEXT: movq %r9, %rdi +; AVX512F-NEXT: imulq %rdx, %rdi ; AVX512F-NEXT: movq %r8, %rax ; AVX512F-NEXT: mulq %rdx ; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %rax, %r11 ; AVX512F-NEXT: addq %rax, %r10 -; AVX512F-NEXT: addq %rsi, %r10 -; AVX512F-NEXT: movq %rbp, %rax +; AVX512F-NEXT: addq %rdi, %r10 +; AVX512F-NEXT: movq %r9, %rax ; AVX512F-NEXT: sarq $63, %rax -; AVX512F-NEXT: movq %rax, %rsi -; AVX512F-NEXT: imulq %r9, %rsi -; AVX512F-NEXT: mulq %rdi +; AVX512F-NEXT: movq %rax, %rdi +; AVX512F-NEXT: imulq %rsi, %rdi +; AVX512F-NEXT: mulq %rbp ; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: movq %rdx, %r11 -; AVX512F-NEXT: addq %rsi, %r11 -; AVX512F-NEXT: addq %rax, %r11 -; AVX512F-NEXT: addq %r14, %rbx -; AVX512F-NEXT: adcq %r10, %r11 -; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r14 +; AVX512F-NEXT: addq %rdi, %r14 +; AVX512F-NEXT: addq %rax, %r14 +; AVX512F-NEXT: addq %r11, %rbx +; AVX512F-NEXT: adcq %r10, %r14 +; AVX512F-NEXT: movq %rbp, %rax +; AVX512F-NEXT: mulq %r8 +; AVX512F-NEXT: movq %rdx, %r11 ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: movq %rsi, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r8 ; AVX512F-NEXT: movq %rax, %r15 -; AVX512F-NEXT: addq %r14, %r15 +; AVX512F-NEXT: addq %r11, %r15 ; AVX512F-NEXT: adcq $0, %r8 -; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: mulq %rbp +; AVX512F-NEXT: movq %rbp, %rax +; AVX512F-NEXT: mulq %r9 ; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: movq %rax, %r14 -; AVX512F-NEXT: addq %r15, %r14 +; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: addq %r15, %r11 ; AVX512F-NEXT: adcq %r8, %rdi ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %esi -; AVX512F-NEXT: movq %r9, %rax -; AVX512F-NEXT: mulq %rbp +; AVX512F-NEXT: movzbl %al, %r8d +; AVX512F-NEXT: movq %rsi, %rax +; AVX512F-NEXT: mulq %r9 ; AVX512F-NEXT: addq %rdi, %rax -; AVX512F-NEXT: adcq %rsi, %rdx +; AVX512F-NEXT: adcq %r8, %rdx ; AVX512F-NEXT: addq %rbx, %rax -; AVX512F-NEXT: adcq %r11, %rdx -; 
AVX512F-NEXT: movq %r14, 8(%r12) -; AVX512F-NEXT: sarq $63, %r14 -; AVX512F-NEXT: xorq %r14, %rdx -; AVX512F-NEXT: xorq %rax, %r14 -; AVX512F-NEXT: orq %rdx, %r14 +; AVX512F-NEXT: adcq %r14, %rdx +; AVX512F-NEXT: movq %r11, 8(%r12) +; AVX512F-NEXT: sarq $63, %r11 +; AVX512F-NEXT: xorq %r11, %rdx +; AVX512F-NEXT: xorq %rax, %r11 +; AVX512F-NEXT: orq %rdx, %r11 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: kmovw %eax, %k1 @@ -3967,114 +3966,113 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: pushq %r13 ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx -; AVX512BW-NEXT: movq %r9, %rbp -; AVX512BW-NEXT: movq %rcx, %r11 -; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rsi, %r9 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512BW-NEXT: movq %rcx, %r10 +; AVX512BW-NEXT: movq %rdx, %r11 +; AVX512BW-NEXT: movq %rdi, %rbp +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-NEXT: sarq $63, %rcx -; AVX512BW-NEXT: movq %rsi, %rbx -; AVX512BW-NEXT: imulq %rcx, %rbx -; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: movq %rdi, %r15 +; AVX512BW-NEXT: imulq %rcx, %r15 +; AVX512BW-NEXT: movq %rbx, %rax ; AVX512BW-NEXT: mulq %rcx ; AVX512BW-NEXT: movq %rdx, %rcx -; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: movq %rax, %r14 ; AVX512BW-NEXT: addq %rax, %rcx -; AVX512BW-NEXT: addq %rbx, %rcx -; AVX512BW-NEXT: movq %rsi, %rax +; AVX512BW-NEXT: addq %r15, %rcx +; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: sarq $63, %rax ; AVX512BW-NEXT: movq %rax, %r13 -; AVX512BW-NEXT: imulq %r11, %r13 -; AVX512BW-NEXT: mulq %r10 -; AVX512BW-NEXT: movq %rax, %r14 -; AVX512BW-NEXT: movq %rdx, %rbx -; AVX512BW-NEXT: addq %r13, %rbx -; AVX512BW-NEXT: addq %rax, %rbx -; AVX512BW-NEXT: addq %r12, %r14 -; AVX512BW-NEXT: adcq %rcx, %rbx -; AVX512BW-NEXT: movq %r10, %rax -; AVX512BW-NEXT: mulq %r15 +; AVX512BW-NEXT: imulq %r10, %r13 +; AVX512BW-NEXT: mulq %r11 +; AVX512BW-NEXT: movq %rax, %r15 ; AVX512BW-NEXT: movq %rdx, %r12 -; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: addq %r13, %r12 +; AVX512BW-NEXT: addq %rax, %r12 +; AVX512BW-NEXT: addq %r14, %r15 +; AVX512BW-NEXT: adcq %rcx, %r12 ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %r15 -; AVX512BW-NEXT: movq %rdx, %r15 -; AVX512BW-NEXT: movq %rax, %r13 -; AVX512BW-NEXT: addq %r12, %r13 -; AVX512BW-NEXT: adcq $0, %r15 +; AVX512BW-NEXT: mulq %rbx +; AVX512BW-NEXT: movq %rdx, %r14 +; AVX512BW-NEXT: movq %rax, %rcx ; AVX512BW-NEXT: movq %r10, %rax -; AVX512BW-NEXT: mulq %rsi -; AVX512BW-NEXT: movq %rdx, %r12 -; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %r13, %r10 -; AVX512BW-NEXT: adcq %r15, %r12 -; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %r15d +; AVX512BW-NEXT: mulq %rbx +; AVX512BW-NEXT: movq %rdx, %rbx +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: addq %r14, %r13 +; AVX512BW-NEXT: adcq $0, %rbx ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %rsi -; AVX512BW-NEXT: addq %r12, %rax -; AVX512BW-NEXT: adcq %r15, %rdx -; AVX512BW-NEXT: addq %r14, %rax +; AVX512BW-NEXT: mulq %rdi +; AVX512BW-NEXT: movq %rdx, %r11 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: addq %r13, %r14 +; AVX512BW-NEXT: adcq %rbx, %r11 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: movzbl %al, %ebx +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: mulq %rdi +; AVX512BW-NEXT: addq %r11, %rax ; AVX512BW-NEXT: adcq %rbx, %rdx +; AVX512BW-NEXT: addq %r15, %rax 
+; AVX512BW-NEXT: adcq %r12, %rdx ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512BW-NEXT: movq %r10, 24(%r12) -; AVX512BW-NEXT: sarq $63, %r10 -; AVX512BW-NEXT: xorq %r10, %rdx -; AVX512BW-NEXT: xorq %rax, %r10 -; AVX512BW-NEXT: orq %rdx, %r10 +; AVX512BW-NEXT: movq %r14, 24(%r12) +; AVX512BW-NEXT: sarq $63, %r14 +; AVX512BW-NEXT: xorq %r14, %rdx +; AVX512BW-NEXT: xorq %rax, %r14 +; AVX512BW-NEXT: orq %rdx, %r14 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: movq %r9, %rdx +; AVX512BW-NEXT: movq %rsi, %rdx ; AVX512BW-NEXT: sarq $63, %rdx -; AVX512BW-NEXT: movq %rbp, %rsi -; AVX512BW-NEXT: imulq %rdx, %rsi +; AVX512BW-NEXT: movq %r9, %rdi +; AVX512BW-NEXT: imulq %rdx, %rdi ; AVX512BW-NEXT: movq %r8, %rax ; AVX512BW-NEXT: mulq %rdx ; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %rax, %r11 ; AVX512BW-NEXT: addq %rax, %r10 -; AVX512BW-NEXT: addq %rsi, %r10 -; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: addq %rdi, %r10 +; AVX512BW-NEXT: movq %r9, %rax ; AVX512BW-NEXT: sarq $63, %rax -; AVX512BW-NEXT: movq %rax, %rsi -; AVX512BW-NEXT: imulq %r9, %rsi -; AVX512BW-NEXT: mulq %rdi +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: imulq %rsi, %rdi +; AVX512BW-NEXT: mulq %rbp ; AVX512BW-NEXT: movq %rax, %rbx -; AVX512BW-NEXT: movq %rdx, %r11 -; AVX512BW-NEXT: addq %rsi, %r11 -; AVX512BW-NEXT: addq %rax, %r11 -; AVX512BW-NEXT: addq %r14, %rbx -; AVX512BW-NEXT: adcq %r10, %r11 -; AVX512BW-NEXT: movq %rdi, %rax -; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r14 +; AVX512BW-NEXT: addq %rdi, %r14 +; AVX512BW-NEXT: addq %rax, %r14 +; AVX512BW-NEXT: addq %r11, %rbx +; AVX512BW-NEXT: adcq %r10, %r14 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: mulq %r8 +; AVX512BW-NEXT: movq %rdx, %r11 ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: movq %rsi, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r8 ; AVX512BW-NEXT: movq %rax, %r15 -; AVX512BW-NEXT: addq %r14, %r15 +; AVX512BW-NEXT: addq %r11, %r15 ; AVX512BW-NEXT: adcq $0, %r8 -; AVX512BW-NEXT: movq %rdi, %rax -; AVX512BW-NEXT: mulq %rbp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: mulq %r9 ; AVX512BW-NEXT: movq %rdx, %rdi -; AVX512BW-NEXT: movq %rax, %r14 -; AVX512BW-NEXT: addq %r15, %r14 +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: addq %r15, %r11 ; AVX512BW-NEXT: adcq %r8, %rdi ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %esi -; AVX512BW-NEXT: movq %r9, %rax -; AVX512BW-NEXT: mulq %rbp +; AVX512BW-NEXT: movzbl %al, %r8d +; AVX512BW-NEXT: movq %rsi, %rax +; AVX512BW-NEXT: mulq %r9 ; AVX512BW-NEXT: addq %rdi, %rax -; AVX512BW-NEXT: adcq %rsi, %rdx +; AVX512BW-NEXT: adcq %r8, %rdx ; AVX512BW-NEXT: addq %rbx, %rax -; AVX512BW-NEXT: adcq %r11, %rdx -; AVX512BW-NEXT: movq %r14, 8(%r12) -; AVX512BW-NEXT: sarq $63, %r14 -; AVX512BW-NEXT: xorq %r14, %rdx -; AVX512BW-NEXT: xorq %rax, %r14 -; AVX512BW-NEXT: orq %rdx, %r14 +; AVX512BW-NEXT: adcq %r14, %rdx +; AVX512BW-NEXT: movq %r11, 8(%r12) +; AVX512BW-NEXT: sarq $63, %r11 +; AVX512BW-NEXT: xorq %r11, %rdx +; AVX512BW-NEXT: xorq %rax, %r11 +; AVX512BW-NEXT: orq %rdx, %r11 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: andl $1, %eax ; AVX512BW-NEXT: kmovw %eax, %k1 diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index d634457069c0da..e84da8cebbf259 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -199,39 +199,39 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> 
%a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %edx, %xmm0 ; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %r9d, %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm4 +; SSE2-NEXT: psubd %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubd %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm3, 16(%rcx) ; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movq %xmm2, 16(%rdi) -; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movq %xmm1, 16(%rdi) +; SSE2-NEXT: movdqa %xmm2, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: ssubo_v6i32: @@ -240,50 +240,50 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %edx, %xmm0 ; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %r9d, %xmm0 ; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: psubd %xmm0, %xmm4 +; SSSE3-NEXT: psubd %xmm2, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 ; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: psubd %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psubd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSSE3-NEXT: movq %xmm3, 16(%rcx) ; SSSE3-NEXT: movdqa %xmm4, (%rcx) -; SSSE3-NEXT: movq %xmm2, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movq %xmm1, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm2, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ssubo_v6i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: movd %esi, %xmm1 -; SSE41-NEXT: pinsrd $1, %edx, %xmm1 -; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm1 -; SSE41-NEXT: movd %r9d, %xmm0 -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: movd %r9d, %xmm1 +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero @@ -291,20 +291,20 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psubd %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psubd %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movq %xmm1, 
16(%rcx) +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: movq %xmm0, 16(%rcx) ; SSE41-NEXT: movdqa %xmm4, (%rcx) -; SSE41-NEXT: movq %xmm0, 16(%rdi) +; SSE41-NEXT: movq %xmm1, 16(%rdi) ; SSE41-NEXT: movdqa %xmm3, (%rdi) ; SSE41-NEXT: retq ; @@ -427,31 +427,31 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind { ; SSE-LABEL: ssubo_v16i32: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: psubd %xmm4, %xmm8 -; SSE-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psubd %xmm4, %xmm9 +; SSE-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: psubd %xmm5, %xmm4 -; SSE-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE-NEXT: pcmpgtd %xmm8, %xmm5 ; SSE-NEXT: pcmpgtd %xmm4, %xmm1 ; SSE-NEXT: pxor %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: psubd %xmm6, %xmm5 -; SSE-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE-NEXT: pcmpgtd %xmm8, %xmm6 ; SSE-NEXT: pcmpgtd %xmm5, %xmm2 ; SSE-NEXT: pxor %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: psubd %xmm7, %xmm6 -; SSE-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE-NEXT: pcmpgtd %xmm8, %xmm7 ; SSE-NEXT: pcmpgtd %xmm6, %xmm3 ; SSE-NEXT: pxor %xmm7, %xmm3 ; SSE-NEXT: movdqa %xmm6, 48(%rdi) ; SSE-NEXT: movdqa %xmm5, 32(%rdi) ; SSE-NEXT: movdqa %xmm4, 16(%rdi) -; SSE-NEXT: movdqa %xmm8, (%rdi) +; SSE-NEXT: movdqa %xmm9, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: ssubo_v16i32: @@ -816,28 +816,27 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-LABEL: ssubo_v4i24: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pslld $8, %xmm1 ; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: pslld $8, %xmm0 +; SSE2-NEXT: psrad $8, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pslld $8, %xmm2 ; SSE2-NEXT: psrad $8, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pslld $8, %xmm1 -; SSE2-NEXT: psrad $8, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: movw %cx, 9(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movd %xmm2, %edx ; SSE2-NEXT: movw %dx, 6(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movd %xmm0, %esi ; SSE2-NEXT: movw %si, 3(%rdi) ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: movb %al, 2(%rdi) @@ -847,32 +846,32 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movb %dl, 8(%rdi) ; SSE2-NEXT: shrl $16, %esi ; SSE2-NEXT: movb %sil, 5(%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: ssubo_v4i24: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa 
%xmm0, %xmm2 ; SSSE3-NEXT: pslld $8, %xmm1 ; SSSE3-NEXT: psrad $8, %xmm1 +; SSSE3-NEXT: pslld $8, %xmm0 +; SSSE3-NEXT: psrad $8, %xmm0 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pslld $8, %xmm2 ; SSSE3-NEXT: psrad $8, %xmm2 -; SSSE3-NEXT: psubd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pslld $8, %xmm1 -; SSSE3-NEXT: psrad $8, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSSE3-NEXT: movd %xmm2, %ecx ; SSSE3-NEXT: movw %cx, 9(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSSE3-NEXT: movd %xmm2, %edx ; SSSE3-NEXT: movw %dx, 6(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSSE3-NEXT: movd %xmm0, %esi ; SSSE3-NEXT: movw %si, 3(%rdi) ; SSSE3-NEXT: shrl $16, %eax ; SSSE3-NEXT: movb %al, 2(%rdi) @@ -882,6 +881,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movb %dl, 8(%rdi) ; SSSE3-NEXT: shrl $16, %esi ; SSSE3-NEXT: movb %sil, 5(%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ssubo_v4i24: diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 653c3a99691511..b28814dad4f356 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -222,78 +222,78 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movd %r9d, %xmm2 +; SSE2-NEXT: movd 
%r9d, %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, 16(%rcx) ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: movq %xmm3, 16(%rcx) -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: movq %xmm2, 16(%rdi) -; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movq %xmm0, 16(%rdi) +; SSE2-NEXT: movdqa %xmm3, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: uaddo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: movd %r9d, %xmm2 +; SSSE3-NEXT: movd %r9d, %xmm0 ; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: paddd %xmm0, %xmm1 +; SSSE3-NEXT: paddd %xmm3, %xmm2 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm1, (%rcx) +; SSSE3-NEXT: movdqa %xmm2, (%rcx) +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: paddd %xmm0, %xmm1 +; SSSE3-NEXT: movq %xmm1, 16(%rcx) ; SSSE3-NEXT: pxor %xmm4, %xmm1 ; SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: paddd %xmm2, %xmm3 -; SSSE3-NEXT: movq %xmm3, 16(%rcx) -; SSSE3-NEXT: 
pxor %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: movq %xmm2, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movq %xmm0, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm3, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: uaddo_v6i32: @@ -891,25 +891,24 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-LABEL: uaddo_v4i24: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: movw %cx, 9(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movd %xmm2, %edx ; SSE2-NEXT: movw %dx, 6(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movd %xmm0, %esi ; SSE2-NEXT: movw %si, 3(%rdi) ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: movb %al, 2(%rdi) @@ -919,29 +918,29 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movb %dl, 8(%rdi) ; SSE2-NEXT: shrl $16, %esi ; SSE2-NEXT: movb %sil, 5(%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: uaddo_v4i24: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: paddd %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSSE3-NEXT: movd %xmm2, %ecx ; SSSE3-NEXT: movw %cx, 9(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSSE3-NEXT: movd %xmm2, %edx ; SSSE3-NEXT: movw %dx, 6(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 
+; SSSE3-NEXT: movd %xmm0, %esi ; SSSE3-NEXT: movw %si, 3(%rdi) ; SSSE3-NEXT: shrl $16, %eax ; SSSE3-NEXT: movb %al, 2(%rdi) @@ -951,6 +950,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movb %dl, 8(%rdi) ; SSSE3-NEXT: shrl $16, %esi ; SSSE3-NEXT: movb %sil, 5(%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: uaddo_v4i24: diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index e929499c92cbdb..1241cfb0d44aff 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -376,21 +376,21 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd %esi, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: movd %r9d, %xmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -398,15 +398,15 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0] ; SSE2-NEXT: pmuludq %xmm2, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: movq %xmm0, 16(%rcx) +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE2-NEXT: movq %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: movq %xmm7, 16(%rdi) -; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movdqa %xmm4, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v6i32: @@ -426,21 +426,21 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd %esi, %xmm3 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: movd %r9d, %xmm0 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm1, %xmm0 -; SSSE3-NEXT: 
pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: pmuludq %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 +; SSSE3-NEXT: pmuludq %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4 ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -448,15 +448,15 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0] ; SSSE3-NEXT: pmuludq %xmm2, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm7 ; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSSE3-NEXT: movq %xmm0, 16(%rcx) +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSSE3-NEXT: movq %xmm1, 16(%rcx) ; SSSE3-NEXT: movdqa %xmm3, (%rcx) ; SSSE3-NEXT: movq %xmm7, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v6i32: @@ -583,18 +583,17 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; SSE2-LABEL: umulo_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] @@ -604,31 +603,31 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] ; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} 
xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: movdqa %xmm1, 16(%rdi) -; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] +; SSSE3-NEXT: pmuludq %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 -; SSSE3-NEXT: pxor %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm7, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm3, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] @@ -638,14 +637,15 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 ; SSSE3-NEXT: pxor %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: retq ; @@ -766,49 +766,49 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 ; SSE2-NEXT: pxor %xmm12, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm6, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm14, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm6 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm5 = xmm6[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; SSE2-NEXT: pcmpeqd %xmm11, %xmm8 ; SSE2-NEXT: pxor %xmm12, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm14, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm5 +; SSE2-NEXT: pxor %xmm12, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; SSE2-NEXT: movdqa %xmm3, 48(%rdi) ; SSE2-NEXT: movdqa %xmm2, 32(%rdi) ; SSE2-NEXT: movdqa %xmm1, 16(%rdi) ; SSE2-NEXT: movdqa %xmm9, (%rdi) ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v16i32: @@ -828,49 +828,49 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm5, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm8, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 ; SSSE3-NEXT: pxor %xmm12, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm6, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm14, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; SSSE3-NEXT: pcmpeqd 
%xmm11, %xmm8 ; SSSE3-NEXT: pxor %xmm12, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm7, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm14, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 -; SSSE3-NEXT: pxor %xmm12, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm5 +; SSSE3-NEXT: pxor %xmm12, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) ; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) ; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) ; SSSE3-NEXT: movdqa %xmm9, (%rdi) ; SSSE3-NEXT: movdqa %xmm4, %xmm1 ; SSSE3-NEXT: movdqa %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm6, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v16i32: @@ -1045,20 +1045,20 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; SSE2-NEXT: pmullw %xmm3, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE2-NEXT: pmullw %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE2-NEXT: 
pmullw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm3, %xmm4 -; SSE2-NEXT: psrlw $8, %xmm5 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: packuswb %xmm3, %xmm5 +; SSE2-NEXT: psrlw $8, %xmm4 ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -1078,7 +1078,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v16i8: @@ -1086,20 +1086,20 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; SSSE3-NEXT: pmullw %xmm3, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: movdqa %xmm5, %xmm3 -; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSSE3-NEXT: pmullw %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm3 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSSE3-NEXT: pmullw %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm4 -; SSSE3-NEXT: packuswb %xmm3, %xmm4 -; SSSE3-NEXT: psrlw $8, %xmm5 +; SSSE3-NEXT: pand %xmm0, %xmm5 +; SSSE3-NEXT: packuswb %xmm3, %xmm5 +; SSSE3-NEXT: psrlw $8, %xmm4 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: packuswb %xmm5, %xmm0 +; SSSE3-NEXT: packuswb %xmm4, %xmm0 ; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 @@ -1119,7 +1119,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: movdqa %xmm5, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v16i8: @@ -1127,21 +1127,21 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pmullw %xmm3, %xmm5 -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: packuswb %xmm1, %xmm4 +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pmullw %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: packuswb %xmm1, %xmm5 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm5 -; SSE41-NEXT: packuswb %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm5 +; SSE41-NEXT: psrlw $8, %xmm4 +; SSE41-NEXT: packuswb %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqb %xmm2, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 ; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -1155,7 +1155,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: movdqa %xmm5, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: umulo_v16i8: @@ -1627,62 +1627,62 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; SSE2-NEXT: pmullw %xmm8, %xmm11 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pand %xmm9, %xmm12 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packuswb %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm9, %xmm8 +; SSE2-NEXT: packuswb %xmm12, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm12 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = 
xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; SSE2-NEXT: pmullw %xmm9, %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE2-NEXT: pmullw %xmm12, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm13 +; SSE2-NEXT: pand %xmm9, %xmm13 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: packuswb %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: pand %xmm9, %xmm12 +; SSE2-NEXT: packuswb %xmm13, %xmm12 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] ; SSE2-NEXT: movdqa %xmm2, %xmm13 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] -; SSE2-NEXT: pmullw %xmm9, %xmm13 -; SSE2-NEXT: movdqa %xmm13, %xmm14 -; SSE2-NEXT: pand %xmm8, %xmm14 +; SSE2-NEXT: pmullw %xmm5, %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm5 +; SSE2-NEXT: pand %xmm9, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm9 -; SSE2-NEXT: packuswb %xmm14, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: packuswb %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] ; SSE2-NEXT: movdqa %xmm3, %xmm14 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] -; SSE2-NEXT: pmullw %xmm6, %xmm14 +; SSE2-NEXT: pmullw %xmm5, %xmm14 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = 
xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm14, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm6, %xmm8 +; SSE2-NEXT: movdqa %xmm14, %xmm5 +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pand %xmm3, %xmm9 +; SSE2-NEXT: packuswb %xmm5, %xmm9 ; SSE2-NEXT: psrlw $8, %xmm14 ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: packuswb %xmm14, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm13 ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: packuswb %xmm13, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm12 +; SSE2-NEXT: psrlw $8, %xmm4 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm12, %xmm1 +; SSE2-NEXT: packuswb %xmm4, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm11 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm11, %xmm0 @@ -1690,20 +1690,20 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pcmpeqb %xmm10, %xmm2 ; SSE2-NEXT: pcmpeqb %xmm10, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm10, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm6, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: movdqa %xmm8, 48(%rsi) -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: movdqa %xmm9, 32(%rsi) -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm5, 16(%rsi) -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm4, (%rsi) -; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm9, 48(%rsi) +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm6, 32(%rsi) +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm12, 16(%rsi) +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm8, (%rsi) +; SSE2-NEXT: movdqa %xmm3, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm3, 192(%rdi) @@ -1716,25 +1716,25 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm1, 64(%rdi) ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm0, (%rdi) -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 224(%rdi) +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: movdqa %xmm8, 224(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 240(%rdi) ; 
SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm5, 208(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm7, 208(%rdi) ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm3 @@ -1745,11 +1745,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 176(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, 144(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: movdqa %xmm6, 144(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 @@ -1760,11 +1760,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 112(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: movdqa %xmm7, 80(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa %xmm5, 80(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm1 @@ -1774,11 +1774,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 48(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: movdqa %xmm6, 16(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 16(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v64i8: @@ -1790,62 +1790,62 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movdqa %xmm0, %xmm11 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; SSSE3-NEXT: pmullw %xmm8, %xmm11 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 
= [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: movdqa %xmm11, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pand %xmm9, %xmm12 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: packuswb %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSSE3-NEXT: movdqa %xmm1, %xmm12 +; SSSE3-NEXT: movdqa %xmm0, %xmm8 +; SSSE3-NEXT: pand %xmm9, %xmm8 +; SSSE3-NEXT: packuswb %xmm12, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm12 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; SSSE3-NEXT: pmullw %xmm9, %xmm12 -; SSSE3-NEXT: movdqa %xmm12, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSSE3-NEXT: pmullw %xmm12, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm13 +; SSSE3-NEXT: pand %xmm9, %xmm13 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: packuswb %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSSE3-NEXT: movdqa %xmm1, %xmm12 +; SSSE3-NEXT: pand %xmm9, %xmm12 +; SSSE3-NEXT: packuswb %xmm13, %xmm12 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] ; SSSE3-NEXT: movdqa %xmm2, %xmm13 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] -; SSSE3-NEXT: pmullw %xmm9, %xmm13 -; SSSE3-NEXT: movdqa %xmm13, %xmm14 -; SSSE3-NEXT: pand %xmm8, %xmm14 +; SSSE3-NEXT: pmullw %xmm5, %xmm13 +; SSSE3-NEXT: movdqa %xmm13, %xmm5 +; SSSE3-NEXT: pand %xmm9, %xmm5 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm9 -; SSSE3-NEXT: packuswb %xmm14, %xmm9 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: packuswb %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] ; SSSE3-NEXT: movdqa %xmm3, %xmm14 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] -; SSSE3-NEXT: pmullw %xmm6, %xmm14 +; SSSE3-NEXT: pmullw %xmm5, %xmm14 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm14, %xmm6 -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: packuswb %xmm6, %xmm8 +; SSSE3-NEXT: movdqa %xmm14, %xmm5 +; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm3, %xmm9 +; SSSE3-NEXT: packuswb %xmm5, %xmm9 ; SSSE3-NEXT: psrlw $8, %xmm14 ; SSSE3-NEXT: psrlw $8, %xmm3 ; SSSE3-NEXT: packuswb %xmm14, %xmm3 ; SSSE3-NEXT: psrlw $8, %xmm13 ; SSSE3-NEXT: psrlw $8, %xmm2 ; SSSE3-NEXT: packuswb %xmm13, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm12 +; SSSE3-NEXT: psrlw $8, %xmm4 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: packuswb %xmm12, %xmm1 +; SSSE3-NEXT: packuswb %xmm4, %xmm1 ; SSSE3-NEXT: psrlw $8, %xmm11 ; SSSE3-NEXT: psrlw $8, %xmm0 ; SSSE3-NEXT: packuswb %xmm11, %xmm0 @@ -1853,20 +1853,20 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pcmpeqb %xmm10, %xmm2 ; SSSE3-NEXT: pcmpeqb %xmm10, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm10, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pxor %xmm6, %xmm3 -; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm1 -; SSSE3-NEXT: pxor %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: movdqa %xmm8, 48(%rsi) -; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: movdqa %xmm9, 32(%rsi) -; SSSE3-NEXT: movdqa %xmm2, %xmm8 -; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, (%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: movdqa %xmm9, 48(%rsi) +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, 32(%rsi) +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm12, 16(%rsi) +; SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSSE3-NEXT: movdqa %xmm8, (%rsi) +; SSSE3-NEXT: movdqa %xmm3, %xmm8 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: 
punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm3, 192(%rdi) @@ -1879,25 +1879,25 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm0, (%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) +; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm8 +; SSSE3-NEXT: psrad $31, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, 224(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 240(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 208(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm7 +; SSSE3-NEXT: psrad $31, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, 208(%rdi) ; SSSE3-NEXT: movdqa %xmm3, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm3 @@ -1908,11 +1908,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 176(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm8 -; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, 144(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, 144(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm2 @@ -1923,11 +1923,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 112(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm7 -; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm5 +; 
SSSE3-NEXT: movdqa %xmm5, 80(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm1 @@ -1937,61 +1937,61 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, 16(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 16(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v64i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: pxor %xmm13, %xmm13 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE41-NEXT: pmullw %xmm4, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm9, %xmm4 -; SSE41-NEXT: pmullw %xmm10, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm10 -; SSE41-NEXT: pand %xmm9, %xmm10 -; SSE41-NEXT: packuswb %xmm4, %xmm10 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; SSE41-NEXT: pand %xmm10, %xmm4 +; SSE41-NEXT: pmullw %xmm11, %xmm8 +; SSE41-NEXT: movdqa %xmm8, %xmm11 +; SSE41-NEXT: pand %xmm10, %xmm11 +; SSE41-NEXT: packuswb %xmm4, %xmm11 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] ; SSE41-NEXT: pmullw %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pand %xmm9, %xmm5 -; SSE41-NEXT: pmullw %xmm11, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm11 -; SSE41-NEXT: pand %xmm9, %xmm11 -; SSE41-NEXT: packuswb %xmm5, %xmm11 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pmullw %xmm12, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm12 +; SSE41-NEXT: pand %xmm10, %xmm12 +; SSE41-NEXT: packuswb %xmm5, %xmm12 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm13 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] ; SSE41-NEXT: pmullw %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pand %xmm9, %xmm6 -; SSE41-NEXT: pmullw %xmm12, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm12 -; SSE41-NEXT: pand %xmm9, %xmm12 -; SSE41-NEXT: packuswb %xmm6, %xmm12 +; SSE41-NEXT: pand %xmm10, %xmm6 +; SSE41-NEXT: pmullw %xmm13, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm13 +; SSE41-NEXT: pand %xmm10, %xmm13 +; SSE41-NEXT: packuswb %xmm6, %xmm13 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] ; SSE41-NEXT: pmullw %xmm7, 
%xmm3 ; SSE41-NEXT: pmullw %xmm14, %xmm6 ; SSE41-NEXT: movdqa %xmm3, %xmm7 -; SSE41-NEXT: pand %xmm9, %xmm7 -; SSE41-NEXT: pand %xmm6, %xmm9 -; SSE41-NEXT: packuswb %xmm7, %xmm9 +; SSE41-NEXT: pand %xmm10, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm10 +; SSE41-NEXT: packuswb %xmm7, %xmm10 ; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm6 ; SSE41-NEXT: packuswb %xmm3, %xmm6 @@ -2004,19 +2004,19 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm8 ; SSE41-NEXT: packuswb %xmm0, %xmm8 -; SSE41-NEXT: pcmpeqb %xmm13, %xmm6 -; SSE41-NEXT: pcmpeqb %xmm13, %xmm5 -; SSE41-NEXT: pcmpeqb %xmm13, %xmm4 -; SSE41-NEXT: pcmpeqb %xmm13, %xmm8 +; SSE41-NEXT: pcmpeqb %xmm9, %xmm6 +; SSE41-NEXT: pcmpeqb %xmm9, %xmm5 +; SSE41-NEXT: pcmpeqb %xmm9, %xmm4 +; SSE41-NEXT: pcmpeqb %xmm9, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE41-NEXT: pxor %xmm0, %xmm6 ; SSE41-NEXT: pxor %xmm0, %xmm5 ; SSE41-NEXT: pxor %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm8 -; SSE41-NEXT: movdqa %xmm9, 48(%rsi) -; SSE41-NEXT: movdqa %xmm12, 32(%rsi) -; SSE41-NEXT: movdqa %xmm11, 16(%rsi) -; SSE41-NEXT: movdqa %xmm10, (%rsi) +; SSE41-NEXT: movdqa %xmm10, 48(%rsi) +; SSE41-NEXT: movdqa %xmm13, 32(%rsi) +; SSE41-NEXT: movdqa %xmm12, 16(%rsi) +; SSE41-NEXT: movdqa %xmm11, (%rsi) ; SSE41-NEXT: pmovsxbd %xmm6, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 192(%rdi) ; SSE41-NEXT: pmovsxbd %xmm5, %xmm0 @@ -2090,115 +2090,115 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-LABEL: umulo_v64i8: ; AVX1: # %bb.0: ; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm5 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpmullw %xmm7, %xmm9, %xmm7 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm9 -; AVX1-NEXT: vpackuswb %xmm4, %xmm9, %xmm4 +; AVX1-NEXT: vpackuswb %xmm5, %xmm9, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX1-NEXT: vpmullw %xmm9, %xmm10, %xmm9 -; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm11 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; AVX1-NEXT: vpmullw %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm9 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm10 -; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm0 -; AVX1-NEXT: vpackuswb %xmm11, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; AVX1-NEXT: vpmullw %xmm2, %xmm11, %xmm11 -; AVX1-NEXT: vpand %xmm7, %xmm11, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm10, %xmm10 +; AVX1-NEXT: vpand %xmm8, %xmm10, %xmm2 +; AVX1-NEXT: vpackuswb %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX1-NEXT: vpmullw %xmm9, %xmm11, %xmm11 +; AVX1-NEXT: vpand %xmm8, %xmm11, %xmm9 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm12, %xmm13, %xmm12 -; AVX1-NEXT: vpand %xmm7, %xmm12, %xmm13 -; AVX1-NEXT: vpackuswb %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vpand %xmm8, %xmm12, %xmm13 +; AVX1-NEXT: vpackuswb %xmm9, %xmm13, %xmm9 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm14 = 
xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] ; AVX1-NEXT: vpmullw %xmm13, %xmm14, %xmm13 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm7, %xmm13, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm7 -; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm3 +; AVX1-NEXT: vpand %xmm8, %xmm13, %xmm3 +; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm8 +; AVX1-NEXT: vpackuswb %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm8 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm3 +; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm8 ; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm11 -; AVX1-NEXT: vpackuswb %xmm3, %xmm11, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm9 +; AVX1-NEXT: vpackuswb %xmm8, %xmm11, %xmm8 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm10, %xmm10 -; AVX1-NEXT: vpackuswb %xmm9, %xmm10, %xmm9 +; AVX1-NEXT: vpackuswb %xmm0, %xmm10, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 -; AVX1-NEXT: vpackuswb %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm8 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpeqd %xmm10, %xmm10, %xmm10 -; AVX1-NEXT: vpxor %xmm1, %xmm10, %xmm6 -; AVX1-NEXT: vpxor %xmm3, %xmm10, %xmm5 -; AVX1-NEXT: vpxor %xmm10, %xmm8, %xmm3 -; AVX1-NEXT: vpxor %xmm10, %xmm9, %xmm1 -; AVX1-NEXT: vmovdqa %xmm7, 48(%rsi) -; AVX1-NEXT: vmovdqa %xmm2, 32(%rsi) -; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) -; AVX1-NEXT: vmovdqa %xmm4, (%rsi) -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi) -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 128(%rdi) -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 112(%rdi) -; AVX1-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 80(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 +; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm8, %xmm7 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm8 +; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm0 +; AVX1-NEXT: vmovdqa %xmm3, 48(%rsi) +; AVX1-NEXT: vmovdqa %xmm9, 32(%rsi) +; AVX1-NEXT: vmovdqa %xmm2, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm5, (%rsi) +; AVX1-NEXT: vpmovsxbd %xmm8, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 192(%rdi) +; AVX1-NEXT: vpmovsxbd %xmm7, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 128(%rdi) +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi) +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, (%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 224(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 240(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 208(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 160(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 176(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 144(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 96(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 112(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 80(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) ; AVX1-NEXT: vzeroupper @@ -2270,17 +2270,17 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm2 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k2 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -2301,9 +2301,9 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpmovdb %zmm5, 48(%rdi) ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero ; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi) ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = 
ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero ; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi) @@ -2614,38 +2614,37 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-LABEL: umulo_v4i24: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE2-NEXT: pxor %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: psrld $24, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: psrld $24, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: movw %cx, 6(%rdi) -; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: movd %xmm4, %edx ; SSE2-NEXT: movw %dx, 3(%rdi) ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: movb %al, 2(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, 9(%rdi) ; SSE2-NEXT: shrl $16, %ecx ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -2653,42 +2652,42 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movb %dl, 5(%rdi) ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: movb %al, 11(%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v4i24: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm1, %xmm2 -; SSSE3-NEXT: 
pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 ; SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: psrld $24, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: psrld $24, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSSE3-NEXT: movd %xmm0, %ecx ; SSSE3-NEXT: movw %cx, 6(%rdi) -; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: movd %xmm4, %edx ; SSSE3-NEXT: movw %dx, 3(%rdi) ; SSSE3-NEXT: shrl $16, %eax ; SSSE3-NEXT: movb %al, 2(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, 9(%rdi) ; SSSE3-NEXT: shrl $16, %ecx ; SSSE3-NEXT: movb %cl, 8(%rdi) @@ -2696,6 +2695,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movb %dl, 5(%rdi) ; SSSE3-NEXT: shrl $16, %eax ; SSSE3-NEXT: movb %al, 11(%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v4i24: @@ -2905,16 +2905,15 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: pushq %rbp ; SSE2-NEXT: pushq %r15 ; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %r9, %r11 -; SSE2-NEXT: movq %rcx, %r10 +; SSE2-NEXT: movq %r9, %r10 +; SSE2-NEXT: movq %rcx, %r11 ; SSE2-NEXT: movq %rdx, %rcx ; SSE2-NEXT: movq %rsi, %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSE2-NEXT: testq %r11, %r11 +; SSE2-NEXT: testq %r10, %r10 ; SSE2-NEXT: setne %dl ; SSE2-NEXT: testq %rsi, %rsi ; SSE2-NEXT: setne %bpl @@ -2922,33 +2921,33 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: seto %r15b -; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %rdi -; SSE2-NEXT: seto %r12b -; SSE2-NEXT: orb %r15b, %r12b -; SSE2-NEXT: orb %bpl, %r12b -; SSE2-NEXT: leaq 
(%rsi,%rax), %r11 +; SSE2-NEXT: seto %r10b +; SSE2-NEXT: orb %r15b, %r10b +; SSE2-NEXT: orb %bpl, %r10b +; SSE2-NEXT: leaq (%rsi,%rax), %r15 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rdi ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: addq %r11, %rsi -; SSE2-NEXT: setb %r11b -; SSE2-NEXT: orb %r12b, %r11b +; SSE2-NEXT: addq %r15, %rsi +; SSE2-NEXT: setb %bpl +; SSE2-NEXT: orb %r10b, %bpl ; SSE2-NEXT: testq %r9, %r9 ; SSE2-NEXT: setne %al -; SSE2-NEXT: testq %r10, %r10 -; SSE2-NEXT: setne %bpl -; SSE2-NEXT: andb %al, %bpl -; SSE2-NEXT: movq %r10, %rax +; SSE2-NEXT: testq %r11, %r11 +; SSE2-NEXT: setne %r10b +; SSE2-NEXT: andb %al, %r10b +; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r14 ; SSE2-NEXT: movq %rax, %r8 -; SSE2-NEXT: seto %r10b +; SSE2-NEXT: seto %r11b ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: mulq %rcx ; SSE2-NEXT: seto %r9b +; SSE2-NEXT: orb %r11b, %r9b ; SSE2-NEXT: orb %r10b, %r9b -; SSE2-NEXT: orb %bpl, %r9b ; SSE2-NEXT: addq %rax, %r8 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %r14 @@ -2958,7 +2957,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: movzbl %r11b, %ecx +; SSE2-NEXT: movzbl %bpl, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2967,7 +2966,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: movq %rdx, 24(%rbx) ; SSE2-NEXT: movq %rsi, 8(%rbx) ; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r14 ; SSE2-NEXT: popq %r15 ; SSE2-NEXT: popq %rbp @@ -2978,16 +2976,15 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: pushq %rbp ; SSSE3-NEXT: pushq %r15 ; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq %r9, %r11 -; SSSE3-NEXT: movq %rcx, %r10 +; SSSE3-NEXT: movq %r9, %r10 +; SSSE3-NEXT: movq %rcx, %r11 ; SSSE3-NEXT: movq %rdx, %rcx ; SSSE3-NEXT: movq %rsi, %rax ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSSE3-NEXT: testq %r11, %r11 +; SSSE3-NEXT: testq %r10, %r10 ; SSSE3-NEXT: setne %dl ; SSSE3-NEXT: testq %rsi, %rsi ; SSSE3-NEXT: setne %bpl @@ -2995,33 +2992,33 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rsi ; SSSE3-NEXT: seto %r15b -; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %rdi -; SSSE3-NEXT: seto %r12b -; SSSE3-NEXT: orb %r15b, %r12b -; SSSE3-NEXT: orb %bpl, %r12b -; SSSE3-NEXT: leaq (%rsi,%rax), %r11 +; SSSE3-NEXT: seto %r10b +; SSSE3-NEXT: orb %r15b, %r10b +; SSSE3-NEXT: orb %bpl, %r10b +; SSSE3-NEXT: leaq (%rsi,%rax), %r15 ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rdi ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: addq %r11, %rsi -; SSSE3-NEXT: setb %r11b -; SSSE3-NEXT: orb %r12b, %r11b +; SSSE3-NEXT: addq %r15, %rsi +; SSSE3-NEXT: setb %bpl +; SSSE3-NEXT: orb %r10b, %bpl ; SSSE3-NEXT: testq %r9, %r9 ; SSSE3-NEXT: setne %al -; SSSE3-NEXT: testq %r10, %r10 -; SSSE3-NEXT: setne %bpl -; SSSE3-NEXT: andb %al, %bpl -; SSSE3-NEXT: movq %r10, %rax +; SSSE3-NEXT: testq %r11, %r11 +; SSSE3-NEXT: setne %r10b +; SSSE3-NEXT: andb %al, %r10b +; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r14 ; SSSE3-NEXT: movq %rax, %r8 -; 
SSSE3-NEXT: seto %r10b +; SSSE3-NEXT: seto %r11b ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: mulq %rcx ; SSSE3-NEXT: seto %r9b +; SSSE3-NEXT: orb %r11b, %r9b ; SSSE3-NEXT: orb %r10b, %r9b -; SSSE3-NEXT: orb %bpl, %r9b ; SSSE3-NEXT: addq %rax, %r8 ; SSSE3-NEXT: movq %rcx, %rax ; SSSE3-NEXT: mulq %r14 @@ -3031,7 +3028,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: movzbl %r11b, %ecx +; SSSE3-NEXT: movzbl %bpl, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -3040,7 +3037,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: movq %rdx, 24(%rbx) ; SSSE3-NEXT: movq %rsi, 8(%rbx) ; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r14 ; SSSE3-NEXT: popq %r15 ; SSSE3-NEXT: popq %rbp @@ -3051,16 +3047,15 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: pushq %rbp ; SSE41-NEXT: pushq %r15 ; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %r9, %r11 -; SSE41-NEXT: movq %rcx, %r10 +; SSE41-NEXT: movq %r9, %r10 +; SSE41-NEXT: movq %rcx, %r11 ; SSE41-NEXT: movq %rdx, %rcx ; SSE41-NEXT: movq %rsi, %rax ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSE41-NEXT: testq %r11, %r11 +; SSE41-NEXT: testq %r10, %r10 ; SSE41-NEXT: setne %dl ; SSE41-NEXT: testq %rsi, %rsi ; SSE41-NEXT: setne %bpl @@ -3068,33 +3063,33 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rsi ; SSE41-NEXT: seto %r15b -; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %rdi -; SSE41-NEXT: seto %r12b -; SSE41-NEXT: orb %r15b, %r12b -; SSE41-NEXT: orb %bpl, %r12b -; SSE41-NEXT: leaq (%rsi,%rax), %r11 +; SSE41-NEXT: seto %r10b +; SSE41-NEXT: orb %r15b, %r10b +; SSE41-NEXT: orb %bpl, %r10b +; SSE41-NEXT: leaq (%rsi,%rax), %r15 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rdi ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: addq %r11, %rsi -; SSE41-NEXT: setb %r11b -; SSE41-NEXT: orb %r12b, %r11b +; SSE41-NEXT: addq %r15, %rsi +; SSE41-NEXT: setb %bpl +; SSE41-NEXT: orb %r10b, %bpl ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %al -; SSE41-NEXT: testq %r10, %r10 -; SSE41-NEXT: setne %bpl -; SSE41-NEXT: andb %al, %bpl -; SSE41-NEXT: movq %r10, %rax +; SSE41-NEXT: testq %r11, %r11 +; SSE41-NEXT: setne %r10b +; SSE41-NEXT: andb %al, %r10b +; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r14 ; SSE41-NEXT: movq %rax, %r8 -; SSE41-NEXT: seto %r10b +; SSE41-NEXT: seto %r11b ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: mulq %rcx ; SSE41-NEXT: seto %r9b +; SSE41-NEXT: orb %r11b, %r9b ; SSE41-NEXT: orb %r10b, %r9b -; SSE41-NEXT: orb %bpl, %r9b ; SSE41-NEXT: addq %rax, %r8 ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %r14 @@ -3103,7 +3098,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: orb %r9b, %cl ; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: negl %ecx -; SSE41-NEXT: movzbl %r11b, %r8d +; SSE41-NEXT: movzbl %bpl, %r8d ; SSE41-NEXT: negl %r8d ; SSE41-NEXT: movd %r8d, %xmm0 ; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 @@ -3112,7 +3107,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr 
%p2) nounwind ; SSE41-NEXT: movq %rdx, 24(%rbx) ; SSE41-NEXT: movq %rsi, 8(%rbx) ; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r14 ; SSE41-NEXT: popq %r15 ; SSE41-NEXT: popq %rbp @@ -3123,16 +3117,15 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %r15 ; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %r9, %r11 -; AVX-NEXT: movq %rcx, %r10 +; AVX-NEXT: movq %r9, %r10 +; AVX-NEXT: movq %rcx, %r11 ; AVX-NEXT: movq %rdx, %rcx ; AVX-NEXT: movq %rsi, %rax ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; AVX-NEXT: testq %r11, %r11 +; AVX-NEXT: testq %r10, %r10 ; AVX-NEXT: setne %dl ; AVX-NEXT: testq %rsi, %rsi ; AVX-NEXT: setne %bpl @@ -3140,33 +3133,33 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rsi ; AVX-NEXT: seto %r15b -; AVX-NEXT: movq %r11, %rax +; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %rdi -; AVX-NEXT: seto %r12b -; AVX-NEXT: orb %r15b, %r12b -; AVX-NEXT: orb %bpl, %r12b -; AVX-NEXT: leaq (%rsi,%rax), %r11 +; AVX-NEXT: seto %r10b +; AVX-NEXT: orb %r15b, %r10b +; AVX-NEXT: orb %bpl, %r10b +; AVX-NEXT: leaq (%rsi,%rax), %r15 ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rdi ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: addq %r11, %rsi -; AVX-NEXT: setb %r11b -; AVX-NEXT: orb %r12b, %r11b +; AVX-NEXT: addq %r15, %rsi +; AVX-NEXT: setb %bpl +; AVX-NEXT: orb %r10b, %bpl ; AVX-NEXT: testq %r9, %r9 ; AVX-NEXT: setne %al -; AVX-NEXT: testq %r10, %r10 -; AVX-NEXT: setne %bpl -; AVX-NEXT: andb %al, %bpl -; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: testq %r11, %r11 +; AVX-NEXT: setne %r10b +; AVX-NEXT: andb %al, %r10b +; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r14 ; AVX-NEXT: movq %rax, %r8 -; AVX-NEXT: seto %r10b +; AVX-NEXT: seto %r11b ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: mulq %rcx ; AVX-NEXT: seto %r9b +; AVX-NEXT: orb %r11b, %r9b ; AVX-NEXT: orb %r10b, %r9b -; AVX-NEXT: orb %bpl, %r9b ; AVX-NEXT: addq %rax, %r8 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %r14 @@ -3175,7 +3168,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: orb %r9b, %cl ; AVX-NEXT: movzbl %cl, %ecx ; AVX-NEXT: negl %ecx -; AVX-NEXT: movzbl %r11b, %r8d +; AVX-NEXT: movzbl %bpl, %r8d ; AVX-NEXT: negl %r8d ; AVX-NEXT: vmovd %r8d, %xmm0 ; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 @@ -3184,7 +3177,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: movq %rdx, 24(%rbx) ; AVX-NEXT: movq %rsi, 8(%rbx) ; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r14 ; AVX-NEXT: popq %r15 ; AVX-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index a58c3dd0d53073..86ca8ea3584545 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -240,37 +240,37 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %r9d, %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psubd %xmm2, %xmm0 -; SSE2-NEXT: movq %xmm0, 16(%rcx) -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: movq %xmm0, 16(%rdi) +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm3 +; SSE2-NEXT: movq %xmm3, 16(%rcx) +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: movq %xmm3, 16(%rdi) ; SSE2-NEXT: movdqa %xmm4, (%rdi) ; SSE2-NEXT: retq ; @@ -281,37 +281,37 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSSE3-NEXT: movd 
{{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %r9d, %xmm0 ; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: psubd %xmm3, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: psubd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm4, (%rcx) -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: psubd %xmm2, %xmm0 -; SSSE3-NEXT: movq %xmm0, 16(%rcx) -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: movq %xmm0, 16(%rdi) +; SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psubd %xmm1, %xmm3 +; SSSE3-NEXT: movq %xmm3, 16(%rcx) +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: movq %xmm3, 16(%rdi) ; SSSE3-NEXT: movdqa %xmm4, (%rdi) ; SSSE3-NEXT: retq ; @@ -938,25 +938,24 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-LABEL: usubo_v4i24: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: movw %cx, 9(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movd %xmm2, %edx ; SSE2-NEXT: movw %dx, 6(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movd %xmm0, %esi ; SSE2-NEXT: movw %si, 3(%rdi) ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: movb %al, 2(%rdi) @@ -966,29 +965,29 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movb %dl, 8(%rdi) ; SSE2-NEXT: shrl $16, %esi ; SSE2-NEXT: movb %sil, 5(%rdi) +; SSE2-NEXT: 
movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: usubo_v4i24: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: psubd %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSSE3-NEXT: movd %xmm2, %ecx ; SSSE3-NEXT: movw %cx, 9(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSSE3-NEXT: movd %xmm2, %edx ; SSSE3-NEXT: movw %dx, 6(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSSE3-NEXT: movd %xmm0, %esi ; SSSE3-NEXT: movw %si, 3(%rdi) ; SSSE3-NEXT: shrl $16, %eax ; SSSE3-NEXT: movb %al, 2(%rdi) @@ -998,6 +997,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movb %dl, 8(%rdi) ; SSSE3-NEXT: shrl $16, %esi ; SSSE3-NEXT: movb %sil, 5(%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: usubo_v4i24: diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index d3f357cd179525..ae6e676c624c60 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -843,43 +843,43 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { ; SSE2-LABEL: test_bitreverse_v32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrlw $4, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psrlw $1, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, 
%xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrlw $4, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: psllw $4, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: psllw $4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -1012,47 +1012,47 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: psllw $8, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrlw $4, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psrlw $1, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrlw $8, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: psllw $8, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrlw $4, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: psllw $4, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: psllw $4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa 
%xmm1, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -1208,51 +1208,51 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psrlw $2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrlw $4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrlw $1, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrlw $1, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm6, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: psllw $4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm2 -; 
SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: psllw $2, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -1410,53 +1410,53 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psrlw $2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrlw $4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrlw $1, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrlw $1, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm6, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm3, 
%xmm1 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: psllw $4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: psllw $2, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -1682,45 +1682,46 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; ; SSSE3-LABEL: test_bitreverse_v64i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pshufb %xmm4, %xmm6 +; SSSE3-NEXT: psrlw $4, %xmm0 ; SSSE3-NEXT: pand %xmm8, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm0, %xmm6 -; SSSE3-NEXT: psrlw $4, %xmm5 -; SSSE3-NEXT: pand %xmm8, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pshufb %xmm5, %xmm0 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm5, %xmm6 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm8, %xmm1 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pshufb %xmm1, %xmm5 +; SSSE3-NEXT: pshufb %xmm0, %xmm5 ; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pshufb %xmm0, %xmm7 +; SSSE3-NEXT: psrlw $4, %xmm1 ; SSSE3-NEXT: pand %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm7, %xmm9 -; SSSE3-NEXT: pshufb %xmm1, %xmm9 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: por %xmm7, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm2 ; SSSE3-NEXT: pand %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSSE3-NEXT: pshufb %xmm2, %xmm6 -; SSSE3-NEXT: por %xmm9, %xmm6 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm8, %xmm1 -; SSSE3-NEXT: pshufb %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pshufb %xmm2, %xmm7 +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm3 ; SSSE3-NEXT: pand %xmm8, %xmm3 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm6, %xmm2 +; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: retq ; @@ -1973,51 +1974,51 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; ; SSSE3-LABEL: test_bitreverse_v32i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm1, %xmm5 
-; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; SSSE3-NEXT: pshufb %xmm8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; SSSE3-NEXT: pshufb %xmm10, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pshufb %xmm4, %xmm6 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pshufb %xmm0, %xmm5 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pshufb %xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pshufb %xmm0, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pshufb %xmm0, %xmm7 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm9, %xmm0 -; SSSE3-NEXT: pshufb %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pshufb %xmm1, %xmm9 -; SSSE3-NEXT: psrlw $4, %xmm5 -; SSSE3-NEXT: pand %xmm7, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pshufb %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm9, %xmm1 -; SSSE3-NEXT: pshufb %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm7, %xmm5 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pshufb %xmm5, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: por %xmm7, %xmm6 +; SSSE3-NEXT: pshufb %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm2 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm9, %xmm5 -; SSSE3-NEXT: pshufb %xmm8, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pshufb %xmm2, %xmm7 +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pshufb %xmm10, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm3 -; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm8, %xmm3 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: retq ; @@ -2331,51 +2332,51 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; ; SSSE3-LABEL: test_bitreverse_v16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: 
movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; SSSE3-NEXT: pshufb %xmm8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; SSSE3-NEXT: pshufb %xmm10, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pshufb %xmm4, %xmm6 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pshufb %xmm0, %xmm5 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pshufb %xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pshufb %xmm0, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pshufb %xmm0, %xmm7 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm9, %xmm0 -; SSSE3-NEXT: pshufb %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pshufb %xmm1, %xmm9 -; SSSE3-NEXT: psrlw $4, %xmm5 -; SSSE3-NEXT: pand %xmm7, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pshufb %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm9, %xmm1 -; SSSE3-NEXT: pshufb %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm7, %xmm5 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pshufb %xmm5, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: por %xmm7, %xmm6 +; SSSE3-NEXT: pshufb %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm2 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm9, %xmm5 -; SSSE3-NEXT: pshufb %xmm8, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pshufb %xmm2, %xmm7 +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pshufb %xmm10, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm3 -; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm8, %xmm3 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: retq ; @@ -2697,51 +2698,51 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; ; SSSE3-LABEL: test_bitreverse_v8i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = 
[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; SSSE3-NEXT: pshufb %xmm8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; SSSE3-NEXT: pshufb %xmm10, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pshufb %xmm4, %xmm6 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pshufb %xmm0, %xmm5 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pshufb %xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pshufb %xmm0, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pshufb %xmm0, %xmm7 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm9, %xmm0 -; SSSE3-NEXT: pshufb %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pshufb %xmm1, %xmm9 -; SSSE3-NEXT: psrlw $4, %xmm5 -; SSSE3-NEXT: pand %xmm7, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pshufb %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm9, %xmm1 -; SSSE3-NEXT: pshufb %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm7, %xmm5 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pshufb %xmm5, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: por %xmm7, %xmm6 +; SSSE3-NEXT: pshufb %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm2 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm9, %xmm5 -; SSSE3-NEXT: pshufb %xmm8, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pshufb %xmm2, %xmm7 +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pshufb %xmm10, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm3 -; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm8, %xmm3 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 6d71564dd57f9f..4cd4e0571dd3d0 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -134,19 +134,19 @@ define <16 x float> @fadd_v16f32_swap(<16 x i1> 
%b, <16 x float> noundef %x, <16 ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm10, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 -; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: pandn %xmm7, %xmm8 -; SSE2-NEXT: por %xmm10, %xmm8 +; SSE2-NEXT: movdqa %xmm10, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: pandn %xmm7, %xmm9 +; SSE2-NEXT: por %xmm10, %xmm9 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm10 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] @@ -163,41 +163,41 @@ define <16 x float> @fadd_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: addps %xmm2, %xmm7 -; SSE2-NEXT: addps %xmm3, %xmm8 -; SSE2-NEXT: addps %xmm4, %xmm9 +; SSE2-NEXT: addps %xmm3, %xmm9 +; SSE2-NEXT: addps %xmm4, %xmm8 ; SSE2-NEXT: movaps %xmm7, %xmm1 -; SSE2-NEXT: movaps %xmm8, %xmm2 -; SSE2-NEXT: movaps %xmm9, %xmm3 +; SSE2-NEXT: movaps %xmm9, %xmm2 +; SSE2-NEXT: movaps %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fadd_v16f32_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movaps %xmm3, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm9 -; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 ; SSE42-NEXT: addps %xmm1, %xmm5 ; SSE42-NEXT: addps %xmm2, %xmm6 -; SSE42-NEXT: addps %xmm8, %xmm7 -; SSE42-NEXT: addps %xmm4, %xmm3 +; SSE42-NEXT: addps %xmm3, %xmm7 +; SSE42-NEXT: addps %xmm4, %xmm9 ; 
SSE42-NEXT: movaps %xmm5, %xmm0 ; SSE42-NEXT: movaps %xmm6, %xmm1 ; SSE42-NEXT: movaps %xmm7, %xmm2 +; SSE42-NEXT: movaps %xmm9, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fadd_v16f32_swap: @@ -232,19 +232,19 @@ define <16 x float> @fadd_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm10, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 -; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: pandn %xmm7, %xmm8 -; SSE2-NEXT: por %xmm10, %xmm8 +; SSE2-NEXT: movdqa %xmm10, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: pandn %xmm7, %xmm9 +; SSE2-NEXT: por %xmm10, %xmm9 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm10 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] @@ -261,41 +261,41 @@ define <16 x float> @fadd_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: addps %xmm2, %xmm7 -; SSE2-NEXT: addps %xmm3, %xmm8 -; SSE2-NEXT: addps %xmm4, %xmm9 +; SSE2-NEXT: addps %xmm3, %xmm9 +; SSE2-NEXT: addps %xmm4, %xmm8 ; SSE2-NEXT: movaps %xmm7, %xmm1 -; SSE2-NEXT: movaps %xmm8, %xmm2 -; SSE2-NEXT: movaps %xmm9, %xmm3 +; SSE2-NEXT: movaps %xmm9, %xmm2 +; SSE2-NEXT: movaps %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fadd_v16f32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movaps %xmm3, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm9 -; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld 
$31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 ; SSE42-NEXT: addps %xmm1, %xmm5 ; SSE42-NEXT: addps %xmm2, %xmm6 -; SSE42-NEXT: addps %xmm8, %xmm7 -; SSE42-NEXT: addps %xmm4, %xmm3 +; SSE42-NEXT: addps %xmm3, %xmm7 +; SSE42-NEXT: addps %xmm4, %xmm9 ; SSE42-NEXT: movaps %xmm5, %xmm0 ; SSE42-NEXT: movaps %xmm6, %xmm1 ; SSE42-NEXT: movaps %xmm7, %xmm2 +; SSE42-NEXT: movaps %xmm9, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fadd_v16f32_commute_swap: @@ -435,17 +435,17 @@ define <8 x float> @fsub_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x define <16 x float> @fsub_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { ; SSE2-LABEL: fsub_v16f32_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm9 ; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: pandn %xmm7, %xmm9 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn %xmm7, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] @@ -458,8 +458,8 @@ define <16 x float> @fsub_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; SSE2-NEXT: pandn %xmm5, %xmm0 ; SSE2-NEXT: subps %xmm0, %xmm1 ; SSE2-NEXT: subps %xmm7, %xmm2 -; SSE2-NEXT: subps %xmm9, %xmm3 -; SSE2-NEXT: subps %xmm8, %xmm4 +; SSE2-NEXT: subps %xmm8, %xmm3 +; SSE2-NEXT: subps %xmm9, %xmm4 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: movaps %xmm3, %xmm2 @@ -530,18 +530,17 @@ define <16 x float> @fsub_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 define <16 x float> @fsub_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { ; SSE2-LABEL: fsub_v16f32_commute_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm8, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm9 ; SSE2-NEXT: psrad $31, %xmm9 ; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pandn %xmm7, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn %xmm7, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] @@ -553,21 +552,21 @@ define <16 x float> @fsub_v16f32_commute_swap(<16 
x i1> %b, <16 x float> noundef ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm0 ; SSE2-NEXT: subps %xmm1, %xmm0 -; SSE2-NEXT: subps %xmm8, %xmm7 -; SSE2-NEXT: subps %xmm3, %xmm2 +; SSE2-NEXT: subps %xmm2, %xmm7 +; SSE2-NEXT: subps %xmm3, %xmm8 ; SSE2-NEXT: subps %xmm4, %xmm9 ; SSE2-NEXT: movaps %xmm7, %xmm1 +; SSE2-NEXT: movaps %xmm8, %xmm2 ; SSE2-NEXT: movaps %xmm9, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fsub_v16f32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movaps %xmm2, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm2 -; SSE42-NEXT: psrad $31, %xmm2 -; SSE42-NEXT: pandn %xmm7, %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm8 +; SSE42-NEXT: psrad $31, %xmm8 +; SSE42-NEXT: pandn %xmm7, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 @@ -583,11 +582,12 @@ define <16 x float> @fsub_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; SSE42-NEXT: psrad $31, %xmm5 ; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 ; SSE42-NEXT: subps %xmm1, %xmm6 -; SSE42-NEXT: subps %xmm8, %xmm7 -; SSE42-NEXT: subps %xmm3, %xmm2 +; SSE42-NEXT: subps %xmm2, %xmm7 +; SSE42-NEXT: subps %xmm3, %xmm8 ; SSE42-NEXT: subps %xmm4, %xmm5 ; SSE42-NEXT: movaps %xmm6, %xmm0 ; SSE42-NEXT: movaps %xmm7, %xmm1 +; SSE42-NEXT: movaps %xmm8, %xmm2 ; SSE42-NEXT: movaps %xmm5, %xmm3 ; SSE42-NEXT: retq ; @@ -746,76 +746,76 @@ define <8 x float> @fmul_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x define <16 x float> @fmul_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { ; SSE2-LABEL: fmul_v16f32_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm2, %xmm8 +; SSE2-NEXT: movaps %xmm2, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: psrad $31, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm10, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10 -; SSE2-NEXT: por %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm8, %xmm11 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: por %xmm11, %xmm8 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm2, %xmm11 ; SSE2-NEXT: pandn %xmm7, %xmm2 -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSE2-NEXT: 
pslld $31, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm7, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm11, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: mulps %xmm1, %xmm0 -; SSE2-NEXT: mulps %xmm8, %xmm9 +; SSE2-NEXT: mulps %xmm9, %xmm7 ; SSE2-NEXT: mulps %xmm3, %xmm2 -; SSE2-NEXT: mulps %xmm4, %xmm10 -; SSE2-NEXT: movaps %xmm9, %xmm1 -; SSE2-NEXT: movaps %xmm10, %xmm3 +; SSE2-NEXT: mulps %xmm4, %xmm8 +; SSE2-NEXT: movaps %xmm7, %xmm1 +; SSE2-NEXT: movaps %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fmul_v16f32_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movaps %xmm3, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm9 -; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 ; SSE42-NEXT: mulps %xmm1, %xmm5 ; SSE42-NEXT: mulps %xmm2, %xmm6 -; SSE42-NEXT: mulps %xmm8, %xmm7 -; SSE42-NEXT: mulps %xmm4, %xmm3 +; SSE42-NEXT: mulps %xmm3, %xmm7 +; SSE42-NEXT: mulps %xmm4, %xmm9 ; SSE42-NEXT: movaps %xmm5, %xmm0 ; SSE42-NEXT: movaps %xmm6, %xmm1 ; SSE42-NEXT: movaps %xmm7, %xmm2 +; SSE42-NEXT: movaps %xmm9, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fmul_v16f32_swap: @@ -848,76 +848,76 @@ define <16 x float> @fmul_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 define <16 x float> @fmul_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { ; SSE2-LABEL: fmul_v16f32_commute_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm2, %xmm8 +; SSE2-NEXT: movaps %xmm2, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: 
movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: psrad $31, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm10, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10 -; SSE2-NEXT: por %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm8, %xmm11 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: por %xmm11, %xmm8 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm2, %xmm11 ; SSE2-NEXT: pandn %xmm7, %xmm2 -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm7, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm11, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: mulps %xmm1, %xmm0 -; SSE2-NEXT: mulps %xmm8, %xmm9 +; SSE2-NEXT: mulps %xmm9, %xmm7 ; SSE2-NEXT: mulps %xmm3, %xmm2 -; SSE2-NEXT: mulps %xmm4, %xmm10 -; SSE2-NEXT: movaps %xmm9, %xmm1 -; SSE2-NEXT: movaps %xmm10, %xmm3 +; SSE2-NEXT: mulps %xmm4, %xmm8 +; SSE2-NEXT: movaps %xmm7, %xmm1 +; SSE2-NEXT: movaps %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fmul_v16f32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movaps %xmm3, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm9 -; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; 
SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 ; SSE42-NEXT: mulps %xmm1, %xmm5 ; SSE42-NEXT: mulps %xmm2, %xmm6 -; SSE42-NEXT: mulps %xmm8, %xmm7 -; SSE42-NEXT: mulps %xmm4, %xmm3 +; SSE42-NEXT: mulps %xmm3, %xmm7 +; SSE42-NEXT: mulps %xmm4, %xmm9 ; SSE42-NEXT: movaps %xmm5, %xmm0 ; SSE42-NEXT: movaps %xmm6, %xmm1 ; SSE42-NEXT: movaps %xmm7, %xmm2 +; SSE42-NEXT: movaps %xmm9, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fmul_v16f32_commute_swap: @@ -1076,24 +1076,24 @@ define <8 x float> @fdiv_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x define <16 x float> @fdiv_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { ; SSE2-LABEL: fdiv_v16f32_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pand %xmm8, %xmm11 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 -; SSE2-NEXT: por %xmm11, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm9 ; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SSE2-NEXT: movdqa %xmm10, %xmm11 ; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pandn %xmm7, %xmm9 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 ; SSE2-NEXT: por %xmm11, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm8, %xmm11 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm11, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] @@ -1111,8 +1111,8 @@ define <16 x float> @fdiv_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: divps %xmm0, %xmm1 ; SSE2-NEXT: divps %xmm7, %xmm2 -; SSE2-NEXT: divps %xmm9, %xmm3 -; SSE2-NEXT: divps %xmm8, %xmm4 +; SSE2-NEXT: divps %xmm8, %xmm3 +; SSE2-NEXT: divps %xmm9, %xmm4 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: movaps %xmm3, %xmm2 @@ -1179,76 +1179,76 @@ define <16 x float> @fdiv_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 define <16 x float> @fdiv_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { ; SSE2-LABEL: fdiv_v16f32_commute_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm2, %xmm8 +; SSE2-NEXT: movaps %xmm2, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = 
xmm10[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: psrad $31, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm10, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10 -; SSE2-NEXT: por %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm8, %xmm11 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: por %xmm11, %xmm8 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm2, %xmm11 ; SSE2-NEXT: pandn %xmm7, %xmm2 -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm7, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm11, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: divps %xmm1, %xmm0 -; SSE2-NEXT: divps %xmm8, %xmm9 +; SSE2-NEXT: divps %xmm9, %xmm7 ; SSE2-NEXT: divps %xmm3, %xmm2 -; SSE2-NEXT: divps %xmm4, %xmm10 -; SSE2-NEXT: movaps %xmm9, %xmm1 -; SSE2-NEXT: movaps %xmm10, %xmm3 +; SSE2-NEXT: divps %xmm4, %xmm8 +; SSE2-NEXT: movaps %xmm7, %xmm1 +; SSE2-NEXT: movaps %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fdiv_v16f32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movaps %xmm3, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm9 -; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, 
%xmm10, %xmm6 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 ; SSE42-NEXT: divps %xmm1, %xmm5 ; SSE42-NEXT: divps %xmm2, %xmm6 -; SSE42-NEXT: divps %xmm8, %xmm7 -; SSE42-NEXT: divps %xmm4, %xmm3 +; SSE42-NEXT: divps %xmm3, %xmm7 +; SSE42-NEXT: divps %xmm4, %xmm9 ; SSE42-NEXT: movaps %xmm5, %xmm0 ; SSE42-NEXT: movaps %xmm6, %xmm1 ; SSE42-NEXT: movaps %xmm7, %xmm2 +; SSE42-NEXT: movaps %xmm9, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fdiv_v16f32_commute_swap: @@ -1404,34 +1404,34 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; ; SSE42-LABEL: fadd_v8f64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movapd %xmm0, %xmm9 +; SSE42-NEXT: movapd %xmm0, %xmm8 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0] ; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm8 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: pand %xmm0, %xmm9 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm9 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 -; SSE42-NEXT: addpd %xmm9, %xmm10 +; SSE42-NEXT: addpd %xmm8, %xmm10 ; SSE42-NEXT: addpd %xmm6, %xmm1 ; SSE42-NEXT: addpd %xmm7, %xmm2 ; SSE42-NEXT: addpd %xmm11, %xmm3 @@ -1518,67 +1518,67 @@ define <8 x double> @fsub_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; SSE2-LABEL: fsub_v8f64_cast_cond: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: pand %xmm10, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] -; SSE2-NEXT: pand %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm10, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,128] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,0,3,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = 
[16,32] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pand %xmm7, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,0,3,2] +; SSE2-NEXT: pand %xmm6, %xmm9 +; SSE2-NEXT: pand %xmm7, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: subpd %xmm9, %xmm0 -; SSE2-NEXT: subpd %xmm10, %xmm1 -; SSE2-NEXT: subpd %xmm7, %xmm2 -; SSE2-NEXT: subpd %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,0,3,2] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] +; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,0,3,2] +; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: subpd %xmm8, %xmm0 +; SSE2-NEXT: subpd %xmm7, %xmm1 +; SSE2-NEXT: subpd %xmm9, %xmm2 +; SSE2-NEXT: subpd %xmm10, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fsub_v8f64_cast_cond: ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE42-NEXT: movdqa %xmm9, %xmm8 -; SSE42-NEXT: pand %xmm10, %xmm8 -; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 -; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm9, %xmm10 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128] +; SSE42-NEXT: movdqa %xmm8, %xmm10 +; SSE42-NEXT: pand %xmm9, %xmm10 +; SSE42-NEXT: pcmpeqq %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm7, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 -; SSE42-NEXT: pand %xmm6, %xmm10 +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa %xmm8, %xmm9 +; SSE42-NEXT: pand %xmm7, %xmm9 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm9 +; SSE42-NEXT: pand %xmm6, %xmm9 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm9, %xmm7 +; SSE42-NEXT: movdqa %xmm8, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 ; SSE42-NEXT: pand %xmm5, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] -; SSE42-NEXT: pand %xmm5, %xmm9 -; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 -; SSE42-NEXT: pand %xmm4, %xmm9 -; SSE42-NEXT: subpd %xmm9, %xmm0 +; SSE42-NEXT: pand %xmm5, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm5, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm8 +; SSE42-NEXT: subpd %xmm8, %xmm0 ; SSE42-NEXT: subpd %xmm7, %xmm1 -; SSE42-NEXT: subpd %xmm10, %xmm2 -; SSE42-NEXT: subpd %xmm8, %xmm3 +; SSE42-NEXT: subpd %xmm9, %xmm2 +; SSE42-NEXT: subpd %xmm10, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fsub_v8f64_cast_cond: @@ -1734,34 +1734,34 @@ define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; ; SSE42-LABEL: fmul_v8f64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movapd %xmm0, %xmm9 +; SSE42-NEXT: movapd %xmm0, %xmm8 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; 
SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1.0E+0,1.0E+0] ; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm8 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: pand %xmm0, %xmm9 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm9 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 -; SSE42-NEXT: mulpd %xmm9, %xmm10 +; SSE42-NEXT: mulpd %xmm8, %xmm10 ; SSE42-NEXT: mulpd %xmm6, %xmm1 ; SSE42-NEXT: mulpd %xmm7, %xmm2 ; SSE42-NEXT: mulpd %xmm11, %xmm3 @@ -1922,38 +1922,38 @@ define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; ; SSE42-LABEL: fdiv_v8f64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movapd %xmm0, %xmm9 +; SSE42-NEXT: movapd %xmm0, %xmm8 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 -; SSE42-NEXT: movapd {{.*#+}} xmm11 = [1.0E+0,1.0E+0] -; SSE42-NEXT: movapd %xmm11, %xmm10 -; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm10 +; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1.0E+0,1.0E+0] +; SSE42-NEXT: movapd %xmm10, %xmm11 +; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 -; SSE42-NEXT: movapd %xmm11, %xmm7 +; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 -; SSE42-NEXT: movapd %xmm11, %xmm6 +; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm8 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 -; SSE42-NEXT: movdqa %xmm8, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm11 -; SSE42-NEXT: divpd %xmm11, %xmm9 +; SSE42-NEXT: pand %xmm0, %xmm9 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm9 +; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE42-NEXT: divpd %xmm10, %xmm8 ; SSE42-NEXT: divpd %xmm6, %xmm1 ; SSE42-NEXT: divpd %xmm7, %xmm2 -; SSE42-NEXT: divpd %xmm10, %xmm3 -; SSE42-NEXT: movapd %xmm9, %xmm0 +; SSE42-NEXT: divpd %xmm11, %xmm3 +; SSE42-NEXT: movapd %xmm8, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fdiv_v8f64_cast_cond: @@ -2140,67 +2140,67 @@ define <8 x i64> @add_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-LABEL: add_v8i64_cast_cond: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; 
SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: pand %xmm10, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] -; SSE2-NEXT: pand %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm10, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,128] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,0,3,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pand %xmm7, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,0,3,2] +; SSE2-NEXT: pand %xmm6, %xmm9 +; SSE2-NEXT: pand %xmm7, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,0,3,2] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: paddq %xmm9, %xmm0 -; SSE2-NEXT: paddq %xmm10, %xmm1 -; SSE2-NEXT: paddq %xmm7, %xmm2 -; SSE2-NEXT: paddq %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,0,3,2] +; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: paddq %xmm8, %xmm0 +; SSE2-NEXT: paddq %xmm7, %xmm1 +; SSE2-NEXT: paddq %xmm9, %xmm2 +; SSE2-NEXT: paddq %xmm10, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: add_v8i64_cast_cond: ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE42-NEXT: movdqa %xmm9, %xmm8 -; SSE42-NEXT: pand %xmm10, %xmm8 -; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 -; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm9, %xmm10 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128] +; SSE42-NEXT: movdqa %xmm8, %xmm10 +; SSE42-NEXT: pand %xmm9, %xmm10 +; SSE42-NEXT: pcmpeqq %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm7, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 -; SSE42-NEXT: pand %xmm6, %xmm10 +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa %xmm8, %xmm9 +; SSE42-NEXT: pand %xmm7, %xmm9 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm9 +; SSE42-NEXT: pand %xmm6, %xmm9 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm9, %xmm7 +; SSE42-NEXT: movdqa %xmm8, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 ; SSE42-NEXT: pand %xmm5, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] -; SSE42-NEXT: pand %xmm5, %xmm9 -; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 -; SSE42-NEXT: pand %xmm4, %xmm9 -; SSE42-NEXT: 
paddq %xmm9, %xmm0 +; SSE42-NEXT: pand %xmm5, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm5, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm8 +; SSE42-NEXT: paddq %xmm8, %xmm0 ; SSE42-NEXT: paddq %xmm7, %xmm1 -; SSE42-NEXT: paddq %xmm10, %xmm2 -; SSE42-NEXT: paddq %xmm8, %xmm3 +; SSE42-NEXT: paddq %xmm9, %xmm2 +; SSE42-NEXT: paddq %xmm10, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: add_v8i64_cast_cond: @@ -2340,17 +2340,17 @@ define <8 x i32> @sub_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { ; SSE2-LABEL: sub_v16i32_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm9 ; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: pandn %xmm7, %xmm9 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn %xmm7, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] @@ -2363,8 +2363,8 @@ define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i3 ; SSE2-NEXT: pandn %xmm5, %xmm0 ; SSE2-NEXT: psubd %xmm0, %xmm1 ; SSE2-NEXT: psubd %xmm7, %xmm2 -; SSE2-NEXT: psubd %xmm9, %xmm3 -; SSE2-NEXT: psubd %xmm8, %xmm4 +; SSE2-NEXT: psubd %xmm8, %xmm3 +; SSE2-NEXT: psubd %xmm9, %xmm4 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm2 @@ -2435,18 +2435,17 @@ define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i3 define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { ; SSE2-LABEL: sub_v16i32_commute_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm8, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm9 ; SSE2-NEXT: psrad $31, %xmm9 ; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pandn %xmm7, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn %xmm7, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] @@ -2458,21 +2457,21 @@ define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pandn 
%xmm5, %xmm0 ; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: psubd %xmm8, %xmm7 -; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: psubd %xmm2, %xmm7 +; SSE2-NEXT: psubd %xmm3, %xmm8 ; SSE2-NEXT: psubd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm8, %xmm2 ; SSE2-NEXT: movdqa %xmm9, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: sub_v16i32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm2, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm2 -; SSE42-NEXT: psrad $31, %xmm2 -; SSE42-NEXT: pandn %xmm7, %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm8 +; SSE42-NEXT: psrad $31, %xmm8 +; SSE42-NEXT: pandn %xmm7, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm7 @@ -2488,11 +2487,12 @@ define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE42-NEXT: psrad $31, %xmm5 ; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 ; SSE42-NEXT: psubd %xmm1, %xmm6 -; SSE42-NEXT: psubd %xmm8, %xmm7 -; SSE42-NEXT: psubd %xmm3, %xmm2 +; SSE42-NEXT: psubd %xmm2, %xmm7 +; SSE42-NEXT: psubd %xmm3, %xmm8 ; SSE42-NEXT: psubd %xmm4, %xmm5 ; SSE42-NEXT: movdqa %xmm6, %xmm0 ; SSE42-NEXT: movdqa %xmm7, %xmm1 +; SSE42-NEXT: movdqa %xmm8, %xmm2 ; SSE42-NEXT: movdqa %xmm5, %xmm3 ; SSE42-NEXT: retq ; @@ -2576,67 +2576,67 @@ define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-LABEL: sub_v8i64_cast_cond: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: pand %xmm10, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] -; SSE2-NEXT: pand %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm10, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,128] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,0,3,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pand %xmm7, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,0,3,2] +; SSE2-NEXT: pand %xmm6, %xmm9 +; SSE2-NEXT: pand %xmm7, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,0,3,2] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pand 
%xmm6, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: psubq %xmm9, %xmm0 -; SSE2-NEXT: psubq %xmm10, %xmm1 -; SSE2-NEXT: psubq %xmm7, %xmm2 -; SSE2-NEXT: psubq %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,0,3,2] +; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: psubq %xmm8, %xmm0 +; SSE2-NEXT: psubq %xmm7, %xmm1 +; SSE2-NEXT: psubq %xmm9, %xmm2 +; SSE2-NEXT: psubq %xmm10, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: sub_v8i64_cast_cond: ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE42-NEXT: movdqa %xmm9, %xmm8 -; SSE42-NEXT: pand %xmm10, %xmm8 -; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 -; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm9, %xmm10 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128] +; SSE42-NEXT: movdqa %xmm8, %xmm10 +; SSE42-NEXT: pand %xmm9, %xmm10 +; SSE42-NEXT: pcmpeqq %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm7, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 -; SSE42-NEXT: pand %xmm6, %xmm10 +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa %xmm8, %xmm9 +; SSE42-NEXT: pand %xmm7, %xmm9 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm9 +; SSE42-NEXT: pand %xmm6, %xmm9 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm9, %xmm7 +; SSE42-NEXT: movdqa %xmm8, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 ; SSE42-NEXT: pand %xmm5, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] -; SSE42-NEXT: pand %xmm5, %xmm9 -; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 -; SSE42-NEXT: pand %xmm4, %xmm9 -; SSE42-NEXT: psubq %xmm9, %xmm0 +; SSE42-NEXT: pand %xmm5, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm5, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm8 +; SSE42-NEXT: psubq %xmm8, %xmm0 ; SSE42-NEXT: psubq %xmm7, %xmm1 -; SSE42-NEXT: psubq %xmm10, %xmm2 -; SSE42-NEXT: psubq %xmm8, %xmm3 +; SSE42-NEXT: psubq %xmm9, %xmm2 +; SSE42-NEXT: psubq %xmm10, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: sub_v8i64_cast_cond: @@ -2984,41 +2984,41 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE42-NEXT: movdqa %xmm8, %xmm0 -; SSE42-NEXT: pand %xmm10, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 -; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1,1] -; SSE42-NEXT: movapd %xmm10, %xmm11 +; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [64,128] +; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: pand %xmm8, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm8, %xmm0 +; SSE42-NEXT: movapd {{.*#+}} xmm8 = [1,1] +; SSE42-NEXT: movapd %xmm8, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm10, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 -; SSE42-NEXT: movapd %xmm10, %xmm7 +; SSE42-NEXT: movapd %xmm8, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: 
movdqa %xmm10, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 -; SSE42-NEXT: movapd %xmm10, %xmm6 +; SSE42-NEXT: movapd %xmm8, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm8 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 -; SSE42-NEXT: movdqa %xmm8, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE42-NEXT: pand %xmm0, %xmm10 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm10 +; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: psrlq $32, %xmm0 -; SSE42-NEXT: pmuludq %xmm10, %xmm0 -; SSE42-NEXT: movdqa %xmm10, %xmm4 +; SSE42-NEXT: pmuludq %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm4 ; SSE42-NEXT: psrlq $32, %xmm4 ; SSE42-NEXT: pmuludq %xmm9, %xmm4 ; SSE42-NEXT: paddq %xmm0, %xmm4 ; SSE42-NEXT: psllq $32, %xmm4 -; SSE42-NEXT: pmuludq %xmm9, %xmm10 -; SSE42-NEXT: paddq %xmm4, %xmm10 +; SSE42-NEXT: pmuludq %xmm9, %xmm8 +; SSE42-NEXT: paddq %xmm4, %xmm8 ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: psrlq $32, %xmm0 ; SSE42-NEXT: pmuludq %xmm6, %xmm0 @@ -3049,7 +3049,7 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: psllq $32, %xmm4 ; SSE42-NEXT: pmuludq %xmm11, %xmm3 ; SSE42-NEXT: paddq %xmm4, %xmm3 -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: mul_v8i64_cast_cond: @@ -3527,30 +3527,30 @@ define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE2-LABEL: shl_v8i32_cast_cond: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pslld $23, %xmm5 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pslld $23, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm2, %xmm5 -; SSE2-NEXT: cvttps2dq %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: paddd %xmm2, %xmm4 +; SSE2-NEXT: cvttps2dq %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm3 +; SSE2-NEXT: pmuludq %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: pslld $23, %xmm4 -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: cvttps2dq %xmm4, %xmm2 +; SSE2-NEXT: pslld $23, %xmm6 +; SSE2-NEXT: paddd %xmm2, %xmm6 +; SSE2-NEXT: cvttps2dq %xmm6, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -3626,20 +3626,20 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = 
xmm8[1,0,3,2] ; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm10, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,0,3,2] ; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,0,3,2] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] ; SSE2-NEXT: pand %xmm5, %xmm9 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 @@ -3652,13 +3652,13 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-NEXT: psllq %xmm5, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psllq %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE2-NEXT: psllq %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] ; SSE2-NEXT: psllq %xmm5, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psllq %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; SSE2-NEXT: psllq %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE2-NEXT: psllq %xmm5, %xmm2 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm3, %xmm4 @@ -3677,18 +3677,18 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32] -; SSE42-NEXT: movdqa %xmm9, %xmm7 -; SSE42-NEXT: pand %xmm10, %xmm7 -; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 -; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm10 +; SSE42-NEXT: pand %xmm7, %xmm10 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm6, %xmm10 -; SSE42-NEXT: pand %xmm5, %xmm10 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] -; SSE42-NEXT: pand %xmm5, %xmm9 +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa %xmm9, %xmm7 +; SSE42-NEXT: pand %xmm6, %xmm7 +; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 +; SSE42-NEXT: pand %xmm5, %xmm7 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 ; SSE42-NEXT: movdqa %xmm0, %xmm4 @@ -3697,13 +3697,13 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: psllq %xmm5, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: psllq %xmm10, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE42-NEXT: psllq %xmm7, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] ; SSE42-NEXT: psllq %xmm5, %xmm1 ; SSE42-NEXT: 
pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm4 -; SSE42-NEXT: psllq %xmm7, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; SSE42-NEXT: psllq %xmm10, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE42-NEXT: psllq %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm4 @@ -4141,9 +4141,9 @@ define <16 x i32> @lshr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { ; SSE2-LABEL: lshr_v16i32_commute_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm10 ; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm8 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm2, %xmm3 @@ -4165,7 +4165,7 @@ define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: psrld %xmm6, %xmm7 @@ -4173,10 +4173,10 @@ define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: psrld %xmm5, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: psrld %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] @@ -4195,7 +4195,7 @@ define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE2-NEXT: psrld %xmm5, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: psrld %xmm6, %xmm7 @@ -4203,10 +4203,10 @@ define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: psrld %xmm5, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: psrld %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[0,3] @@ -4229,9 +4229,9 @@ define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; ; SSE42-LABEL: lshr_v16i32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm3, %xmm10 -; SSE42-NEXT: movdqa %xmm2, %xmm9 -; SSE42-NEXT: movdqa %xmm1, %xmm8 +; SSE42-NEXT: movdqa %xmm3, %xmm9 +; SSE42-NEXT: 
movdqa %xmm2, %xmm8 +; SSE42-NEXT: movdqa %xmm1, %xmm10 ; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -4252,48 +4252,48 @@ define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE42-NEXT: pslld $31, %xmm3 ; SSE42-NEXT: psrad $31, %xmm3 ; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 ; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm11 ; SSE42-NEXT: psrld %xmm6, %xmm11 ; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm5, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm7 ; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm8 -; SSE42-NEXT: psrld %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm10 +; SSE42-NEXT: psrld %xmm6, %xmm10 +; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm5, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm7 ; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm8 ; SSE42-NEXT: psrld %xmm6, %xmm8 ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] @@ -4346,39 +4346,39 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> 
noundef ; SSE2-LABEL: lshr_v8i32_cast_cond: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrld %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld %xmm6, %xmm2 +; SSE2-NEXT: psrld %xmm5, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm3, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrld %xmm0, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrld %xmm0, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrld %xmm0, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psrld %xmm4, %xmm5 @@ -4393,46 +4393,46 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42-LABEL: lshr_v8i32_cast_cond: ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] -; SSE42-NEXT: movdqa %xmm5, %xmm4 -; SSE42-NEXT: pand %xmm6, %xmm4 -; SSE42-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pand %xmm5, %xmm6 +; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE42-NEXT: pand %xmm3, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE42-NEXT: pand %xmm3, %xmm5 -; SSE42-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE42-NEXT: pand %xmm2, %xmm5 -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] +; 
SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE42-NEXT: pand %xmm2, %xmm4 +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: psrld %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrld %xmm6, %xmm7 +; SSE42-NEXT: psrld %xmm5, %xmm7 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm3[0,1,2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: psrld %xmm3, %xmm5 +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: psrld %xmm3, %xmm4 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm2, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: psrld %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm6 -; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: psrld %xmm4, %xmm5 +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm4 ; SSE42-NEXT: psrld %xmm3, %xmm4 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm2, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] ; SSE42-NEXT: retq ; ; AVX2-LABEL: lshr_v8i32_cast_cond: @@ -4477,20 +4477,20 @@ define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] ; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm10, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,0,3,2] ; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm6 = xmm7[1,0,3,2] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] ; SSE2-NEXT: pand %xmm5, %xmm9 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 @@ -4503,13 +4503,13 @@ define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-NEXT: psrlq %xmm5, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlq %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE2-NEXT: psrlq %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] ; SSE2-NEXT: psrlq %xmm5, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlq %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; SSE2-NEXT: psrlq %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE2-NEXT: psrlq %xmm5, %xmm2 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm3, %xmm4 @@ -4528,16 +4528,16 @@ define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32] -; SSE42-NEXT: movdqa %xmm9, %xmm7 -; SSE42-NEXT: pand %xmm10, %xmm7 -; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 -; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm10 +; SSE42-NEXT: pand %xmm7, %xmm10 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm6, %xmm10 -; SSE42-NEXT: pand %xmm5, %xmm10 +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa %xmm9, %xmm7 +; SSE42-NEXT: pand %xmm6, %xmm7 +; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 +; SSE42-NEXT: pand %xmm5, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 @@ -4548,13 +4548,13 @@ define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: psrlq %xmm5, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: psrlq %xmm10, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE42-NEXT: psrlq %xmm7, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] ; SSE42-NEXT: psrlq %xmm5, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm4 -; SSE42-NEXT: psrlq %xmm7, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; SSE42-NEXT: psrlq %xmm10, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE42-NEXT: psrlq %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm4 @@ -4992,9 +4992,9 @@ define <16 x i32> @ashr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { ; SSE2-LABEL: ashr_v16i32_commute_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm10 ; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm8 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm2, %xmm3 @@ -5016,7 +5016,7 @@ define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE2-NEXT: pslld $31, %xmm0 
; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: psrad %xmm6, %xmm7 @@ -5024,10 +5024,10 @@ define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: psrad %xmm5, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: psrad %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm5, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] @@ -5046,7 +5046,7 @@ define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE2-NEXT: psrad %xmm5, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: psrad %xmm6, %xmm7 @@ -5054,10 +5054,10 @@ define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: psrad %xmm5, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: psrad %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm5, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[0,3] @@ -5080,9 +5080,9 @@ define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; ; SSE42-LABEL: ashr_v16i32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm3, %xmm10 -; SSE42-NEXT: movdqa %xmm2, %xmm9 -; SSE42-NEXT: movdqa %xmm1, %xmm8 +; SSE42-NEXT: movdqa %xmm3, %xmm9 +; SSE42-NEXT: movdqa %xmm2, %xmm8 +; SSE42-NEXT: movdqa %xmm1, %xmm10 ; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -5103,48 +5103,48 @@ define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; SSE42-NEXT: pslld $31, %xmm3 ; SSE42-NEXT: psrad $31, %xmm3 ; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 ; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm11 ; SSE42-NEXT: psrad %xmm6, %xmm11 ; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: 
pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm5, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm7 ; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm8 -; SSE42-NEXT: psrad %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm10 +; SSE42-NEXT: psrad %xmm6, %xmm10 +; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm5, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm7 ; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm8 ; SSE42-NEXT: psrad %xmm6, %xmm8 ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] @@ -5197,39 +5197,39 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE2-LABEL: ashr_v8i32_cast_cond: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrad %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,1,4,5,6,7] ; 
SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad %xmm6, %xmm2 +; SSE2-NEXT: psrad %xmm5, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrad %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrad %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm3, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrad %xmm0, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrad %xmm0, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrad %xmm0, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psrad %xmm4, %xmm5 @@ -5244,46 +5244,46 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42-LABEL: ashr_v8i32_cast_cond: ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] -; SSE42-NEXT: movdqa %xmm5, %xmm4 -; SSE42-NEXT: pand %xmm6, %xmm4 -; SSE42-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pand %xmm5, %xmm6 +; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE42-NEXT: pand %xmm3, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE42-NEXT: pand %xmm3, %xmm5 -; SSE42-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE42-NEXT: pand %xmm2, %xmm5 -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE42-NEXT: pand %xmm2, %xmm4 +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: psrad %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrad %xmm6, %xmm7 +; SSE42-NEXT: psrad %xmm5, %xmm7 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm3[0,1,2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: psrad %xmm3, %xmm5 +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: psrad %xmm3, %xmm4 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm2, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; 
SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: psrad %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm6 -; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: psrad %xmm4, %xmm5 +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm4 ; SSE42-NEXT: psrad %xmm3, %xmm4 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm2, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] ; SSE42-NEXT: retq ; ; AVX2-LABEL: ashr_v8i32_cast_cond: @@ -5335,13 +5335,13 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] ; SSE2-NEXT: pand %xmm6, %xmm7 ; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4,8] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm6, %xmm10 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] +; SSE2-NEXT: pand %xmm5, %xmm10 +; SSE2-NEXT: pand %xmm6, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] ; SSE2-NEXT: pand %xmm5, %xmm9 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 @@ -5351,28 +5351,28 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrlq %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm11 -; SSE2-NEXT: psrlq %xmm10, %xmm11 +; SSE2-NEXT: psrlq %xmm6, %xmm11 ; SSE2-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psrlq %xmm9, %xmm5 -; SSE2-NEXT: psrlq %xmm10, %xmm0 +; SSE2-NEXT: psrlq %xmm6, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE2-NEXT: xorpd %xmm11, %xmm0 ; SSE2-NEXT: psubq %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: psrlq %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: psrlq %xmm9, %xmm10 -; SSE2-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1] +; SSE2-NEXT: psrlq %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,3,2,3] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: psrlq %xmm6, %xmm9 +; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrlq %xmm6, %xmm5 -; SSE2-NEXT: psrlq %xmm9, %xmm1 +; SSE2-NEXT: psrlq %xmm10, %xmm5 +; SSE2-NEXT: psrlq %xmm6, %xmm1 ; 
SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] -; SSE2-NEXT: xorpd %xmm10, %xmm1 -; SSE2-NEXT: psubq %xmm10, %xmm1 +; SSE2-NEXT: xorpd %xmm9, %xmm1 +; SSE2-NEXT: psubq %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrlq %xmm7, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] @@ -5412,11 +5412,11 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: pand %xmm10, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [4,8] -; SSE42-NEXT: movdqa %xmm9, %xmm6 -; SSE42-NEXT: pand %xmm10, %xmm6 -; SSE42-NEXT: pcmpeqq %xmm10, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa %xmm9, %xmm10 +; SSE42-NEXT: pand %xmm6, %xmm10 +; SSE42-NEXT: pcmpeqq %xmm6, %xmm10 +; SSE42-NEXT: pand %xmm5, %xmm10 ; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 @@ -5424,28 +5424,28 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] ; SSE42-NEXT: movdqa %xmm4, %xmm5 ; SSE42-NEXT: psrlq %xmm9, %xmm5 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,3,2,3] ; SSE42-NEXT: movdqa %xmm4, %xmm11 -; SSE42-NEXT: psrlq %xmm10, %xmm11 +; SSE42-NEXT: psrlq %xmm6, %xmm11 ; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm5[0,1,2,3],xmm11[4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm5 ; SSE42-NEXT: psrlq %xmm9, %xmm5 -; SSE42-NEXT: psrlq %xmm10, %xmm0 +; SSE42-NEXT: psrlq %xmm6, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] ; SSE42-NEXT: pxor %xmm11, %xmm0 ; SSE42-NEXT: psubq %xmm11, %xmm0 ; SSE42-NEXT: movdqa %xmm4, %xmm5 -; SSE42-NEXT: psrlq %xmm6, %xmm5 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] -; SSE42-NEXT: movdqa %xmm4, %xmm10 -; SSE42-NEXT: psrlq %xmm9, %xmm10 -; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm5[0,1,2,3],xmm10[4,5,6,7] +; SSE42-NEXT: psrlq %xmm10, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,3,2,3] +; SSE42-NEXT: movdqa %xmm4, %xmm9 +; SSE42-NEXT: psrlq %xmm6, %xmm9 +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm5[0,1,2,3],xmm9[4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm5 -; SSE42-NEXT: psrlq %xmm6, %xmm5 -; SSE42-NEXT: psrlq %xmm9, %xmm1 +; SSE42-NEXT: psrlq %xmm10, %xmm5 +; SSE42-NEXT: psrlq %xmm6, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pxor %xmm10, %xmm1 -; SSE42-NEXT: psubq %xmm10, %xmm1 +; SSE42-NEXT: pxor %xmm9, %xmm1 +; SSE42-NEXT: psubq %xmm9, %xmm1 ; SSE42-NEXT: movdqa %xmm4, %xmm5 ; SSE42-NEXT: psrlq %xmm7, %xmm5 ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] @@ -5513,37 +5513,37 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] ; SSE2-NEXT: pslld $31, %xmm8 ; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: pandn %xmm7, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,1] -; SSE2-NEXT: pand %xmm9, %xmm8 -; SSE2-NEXT: por %xmm10, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pandn %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1,1] +; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] ; SSE2-NEXT: pslld $31, %xmm7 ; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: 
movdqa %xmm7, %xmm10 -; SSE2-NEXT: pandn %xmm6, %xmm10 -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: por %xmm10, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm9 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: por %xmm9, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] ; SSE2-NEXT: pslld $31, %xmm6 ; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pandn %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: por %xmm10, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: por %xmm9, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] ; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: pand %xmm5, %xmm10 ; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: por %xmm9, %xmm5 +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: movq %xmm6, %rcx ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: cqto diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index f4d6b52377f574..0425585115c257 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -523,21 +523,21 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE2-NEXT: pcmpgtd %xmm6, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: packssdw %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm7, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 @@ -548,8 +548,8 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE2-NEXT: pand %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: packssdw %xmm3, %xmm0 ; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: test_cmp_v8i64: @@ -878,12 +878,11 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind { ; SSE-LABEL: test_cmp_v16f64: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 ; 
SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 @@ -891,17 +890,18 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind ; SSE-NEXT: cmpltpd %xmm6, %xmm14 ; SSE-NEXT: packssdw %xmm15, %xmm14 ; SSE-NEXT: cmpltpd %xmm5, %xmm13 -; SSE-NEXT: cmpltpd %xmm4, %xmm9 -; SSE-NEXT: packssdw %xmm13, %xmm9 -; SSE-NEXT: packssdw %xmm14, %xmm9 -; SSE-NEXT: cmpltpd %xmm3, %xmm12 +; SSE-NEXT: cmpltpd %xmm4, %xmm12 +; SSE-NEXT: packssdw %xmm13, %xmm12 +; SSE-NEXT: packssdw %xmm14, %xmm12 +; SSE-NEXT: cmpltpd %xmm3, %xmm11 ; SSE-NEXT: cmpltpd %xmm2, %xmm10 -; SSE-NEXT: packssdw %xmm12, %xmm10 -; SSE-NEXT: cmpltpd %xmm1, %xmm11 -; SSE-NEXT: cmpltpd %xmm8, %xmm0 -; SSE-NEXT: packssdw %xmm11, %xmm0 -; SSE-NEXT: packssdw %xmm10, %xmm0 -; SSE-NEXT: packsswb %xmm9, %xmm0 +; SSE-NEXT: packssdw %xmm11, %xmm10 +; SSE-NEXT: cmpltpd %xmm1, %xmm9 +; SSE-NEXT: cmpltpd %xmm0, %xmm8 +; SSE-NEXT: packssdw %xmm9, %xmm8 +; SSE-NEXT: packssdw %xmm10, %xmm8 +; SSE-NEXT: packsswb %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: test_cmp_v16f64: @@ -979,8 +979,8 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 @@ -995,12 +995,12 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind { ; SSE-NEXT: packsswb %xmm14, %xmm12 ; SSE-NEXT: pmovmskb %xmm12, %ecx ; SSE-NEXT: cmpltps %xmm7, %xmm11 -; SSE-NEXT: cmpltps %xmm6, %xmm9 -; SSE-NEXT: packssdw %xmm11, %xmm9 -; SSE-NEXT: cmpltps %xmm5, %xmm10 +; SSE-NEXT: cmpltps %xmm6, %xmm10 +; SSE-NEXT: packssdw %xmm11, %xmm10 +; SSE-NEXT: cmpltps %xmm5, %xmm9 ; SSE-NEXT: cmpltps %xmm4, %xmm8 -; SSE-NEXT: packssdw %xmm10, %xmm8 -; SSE-NEXT: packsswb %xmm9, %xmm8 +; SSE-NEXT: packssdw %xmm9, %xmm8 +; SSE-NEXT: packsswb %xmm10, %xmm8 ; SSE-NEXT: pmovmskb %xmm8, %edx ; SSE-NEXT: shll $16, %edx ; SSE-NEXT: orl %ecx, %edx @@ -1096,22 +1096,22 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: packssdw %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: packssdw %xmm9, %xmm7 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 ; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd 
%xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 @@ -1123,8 +1123,8 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; SSE2-NEXT: pand %xmm10, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: packssdw %xmm7, %xmm4 ; SSE2-NEXT: packssdw %xmm6, %xmm4 +; SSE2-NEXT: packssdw %xmm7, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm3 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 @@ -1143,22 +1143,22 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: packssdw %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pxor {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: movdqa %xmm0, %xmm1 @@ -1169,8 +1169,8 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: packssdw %xmm3, %xmm0 ; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 ; SSE2-NEXT: packsswb %xmm4, %xmm0 ; SSE2-NEXT: retq ; @@ -1530,29 +1530,29 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind { ; SSE-NEXT: shll $16, %edx ; SSE-NEXT: orl %ecx, %edx ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: pmovmskb %xmm2, %esi +; SSE-NEXT: pmovmskb %xmm2, %ecx ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pmovmskb %xmm3, %ecx -; SSE-NEXT: shll $16, %ecx -; SSE-NEXT: orl %esi, %ecx -; SSE-NEXT: shlq $32, %rcx -; SSE-NEXT: orq %rdx, %rcx +; SSE-NEXT: pmovmskb %xmm3, %esi +; SSE-NEXT: shll $16, %esi +; SSE-NEXT: orl %ecx, %esi +; SSE-NEXT: shlq $32, %rsi +; SSE-NEXT: orq %rdx, %rsi ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: pmovmskb %xmm4, %edx +; SSE-NEXT: pmovmskb %xmm4, %ecx ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pmovmskb %xmm5, %esi -; SSE-NEXT: shll $16, %esi -; SSE-NEXT: orl %edx, %esi +; SSE-NEXT: pmovmskb %xmm5, %edx +; SSE-NEXT: shll $16, %edx +; SSE-NEXT: orl %ecx, %edx ; SSE-NEXT: pcmpgtb 
{{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: pmovmskb %xmm6, %edx +; SSE-NEXT: pmovmskb %xmm6, %ecx ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pmovmskb %xmm7, %edi ; SSE-NEXT: shll $16, %edi -; SSE-NEXT: orl %edx, %edi +; SSE-NEXT: orl %ecx, %edi ; SSE-NEXT: shlq $32, %rdi -; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: orq %rdx, %rdi ; SSE-NEXT: movq %rdi, 8(%rax) -; SSE-NEXT: movq %rcx, (%rax) +; SSE-NEXT: movq %rsi, (%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: test_cmp_v128i8: @@ -1567,35 +1567,35 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind { ; AVX1-NEXT: shll $16, %edx ; AVX1-NEXT: orl %ecx, %edx ; AVX1-NEXT: vpcmpgtb %xmm5, %xmm1, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %esi +; AVX1-NEXT: vpmovmskb %xmm0, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: shll $16, %ecx -; AVX1-NEXT: orl %esi, %ecx -; AVX1-NEXT: shlq $32, %rcx -; AVX1-NEXT: orq %rdx, %rcx +; AVX1-NEXT: vpmovmskb %xmm0, %esi +; AVX1-NEXT: shll $16, %esi +; AVX1-NEXT: orl %ecx, %esi +; AVX1-NEXT: shlq $32, %rsi +; AVX1-NEXT: orq %rdx, %rsi ; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %edx +; AVX1-NEXT: vpmovmskb %xmm0, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %esi -; AVX1-NEXT: shll $16, %esi -; AVX1-NEXT: orl %edx, %esi -; AVX1-NEXT: vpcmpgtb %xmm7, %xmm3, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %edx +; AVX1-NEXT: shll $16, %edx +; AVX1-NEXT: orl %ecx, %edx +; AVX1-NEXT: vpcmpgtb %xmm7, %xmm3, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %edi ; AVX1-NEXT: shll $16, %edi -; AVX1-NEXT: orl %edx, %edi +; AVX1-NEXT: orl %ecx, %edi ; AVX1-NEXT: shlq $32, %rdi -; AVX1-NEXT: orq %rsi, %rdi +; AVX1-NEXT: orq %rdx, %rdi ; AVX1-NEXT: movq %rdi, 8(%rax) -; AVX1-NEXT: movq %rcx, (%rax) +; AVX1-NEXT: movq %rsi, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1742,35 +1742,35 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind ; SSE-NEXT: packssdw %xmm4, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: cmpltpd %xmm1, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: cmpltpd %xmm0, %xmm4 -; SSE-NEXT: packssdw %xmm2, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: packssdw %xmm3, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: packsswb %xmm5, %xmm4 +; SSE-NEXT: cmpltpd %xmm0, %xmm1 +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: packssdw %xmm3, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: pmovmskb %xmm4, %ecx +; SSE-NEXT: packsswb %xmm5, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pmovmskb %xmm1, %ecx +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: packssdw %xmm1, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: packssdw %xmm1, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: cmpltpd 
{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: packssdw %xmm3, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: packssdw %xmm2, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: packssdw %xmm2, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: packssdw %xmm3, %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %edx ; SSE-NEXT: shll $16, %edx ; SSE-NEXT: orl %ecx, %edx @@ -1935,22 +1935,22 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: packssdw %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: packssdw %xmm9, %xmm7 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 ; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 @@ -1962,8 +1962,8 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE2-NEXT: pand %xmm10, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: packssdw %xmm7, %xmm4 ; SSE2-NEXT: packssdw %xmm6, %xmm4 +; SSE2-NEXT: packssdw %xmm7, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm3 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 @@ -1982,22 +1982,22 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: packssdw %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, 
%xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 @@ -2009,8 +2009,8 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: packsswb %xmm4, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %ecx ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 @@ -2023,21 +2023,8 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 @@ -2050,6 +2037,19 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: packssdw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 @@ -2062,8 +2062,8 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 @@ -2084,36 +2084,36 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 -; 
SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: packssdw %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 ; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pxor {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: packssdw %xmm2, %xmm4 ; SSE2-NEXT: packssdw %xmm3, %xmm4 -; SSE2-NEXT: packssdw %xmm1, %xmm4 ; SSE2-NEXT: packsswb %xmm0, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %edx ; SSE2-NEXT: shll $16, %edx @@ -2125,12 +2125,12 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE42: # %bb.0: ; SSE42-NEXT: movq %rdi, %rax ; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 ; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 ; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 ; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 ; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm7 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm6 @@ -2149,20 +2149,20 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE42-NEXT: packsswb %xmm4, %xmm0 ; SSE42-NEXT: pmovmskb %xmm0, %ecx ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm15 -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm13 -; SSE42-NEXT: packssdw %xmm15, %xmm13 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm14 -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9 -; SSE42-NEXT: packssdw %xmm14, %xmm9 -; SSE42-NEXT: packssdw %xmm13, %xmm9 +; SSE42-NEXT: packssdw %xmm15, %xmm14 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm13 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11 +; SSE42-NEXT: packssdw %xmm13, %xmm11 +; SSE42-NEXT: packssdw %xmm14, %xmm11 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm12 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9 +; SSE42-NEXT: packssdw %xmm12, %xmm9 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10 -; SSE42-NEXT: 
packssdw %xmm12, %xmm10 -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8 -; SSE42-NEXT: packssdw %xmm11, %xmm8 ; SSE42-NEXT: packssdw %xmm10, %xmm8 -; SSE42-NEXT: packsswb %xmm9, %xmm8 +; SSE42-NEXT: packssdw %xmm9, %xmm8 +; SSE42-NEXT: packsswb %xmm11, %xmm8 ; SSE42-NEXT: pmovmskb %xmm8, %edx ; SSE42-NEXT: shll $16, %edx ; SSE42-NEXT: orl %ecx, %edx diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index cdabd7fab081cb..76a02500a34e26 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -4479,14 +4479,14 @@ define <1 x i64> @constrained_vector_fptoui_v1i64_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm0, %xmm2 -; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm0, %xmm1 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: ja .LBB115_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm2 ; CHECK-NEXT: .LBB115_2: # %entry -; CHECK-NEXT: subss %xmm1, %xmm0 +; CHECK-NEXT: subss %xmm2, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %rcx ; CHECK-NEXT: setbe %al ; CHECK-NEXT: movzbl %al, %eax @@ -4527,13 +4527,13 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm2, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm2, %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: xorps %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB116_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: .LBB116_2: # %entry ; CHECK-NEXT: subss %xmm3, %xmm2 ; CHECK-NEXT: cvttss2si %xmm2, %rax @@ -4543,12 +4543,12 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 { ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm2 ; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm3, %xmm1 +; CHECK-NEXT: comiss %xmm3, %xmm0 ; CHECK-NEXT: ja .LBB116_4 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: .LBB116_4: # %entry -; CHECK-NEXT: subss %xmm0, %xmm3 +; CHECK-NEXT: subss %xmm1, %xmm3 ; CHECK-NEXT: cvttss2si %xmm3, %rax ; CHECK-NEXT: setbe %cl ; CHECK-NEXT: movzbl %cl, %ecx @@ -4618,13 +4618,13 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm2, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm2, %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: xorps %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB117_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: .LBB117_2: # %entry ; CHECK-NEXT: subss %xmm3, 
%xmm2 ; CHECK-NEXT: cvttss2si %xmm2, %rcx @@ -4633,11 +4633,11 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; CHECK-NEXT: shlq $63, %rax ; CHECK-NEXT: xorq %rcx, %rax ; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm2, %xmm1 +; CHECK-NEXT: comiss %xmm2, %xmm0 ; CHECK-NEXT: xorps %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB117_4 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: .LBB117_4: # %entry ; CHECK-NEXT: subss %xmm3, %xmm2 ; CHECK-NEXT: cvttss2si %xmm2, %rcx @@ -4646,12 +4646,12 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; CHECK-NEXT: shlq $63, %rdx ; CHECK-NEXT: xorq %rcx, %rdx ; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm2, %xmm1 +; CHECK-NEXT: comiss %xmm2, %xmm0 ; CHECK-NEXT: ja .LBB117_6 ; CHECK-NEXT: # %bb.5: # %entry -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: .LBB117_6: # %entry -; CHECK-NEXT: subss %xmm0, %xmm2 +; CHECK-NEXT: subss %xmm1, %xmm2 ; CHECK-NEXT: cvttss2si %xmm2, %rsi ; CHECK-NEXT: setbe %cl ; CHECK-NEXT: movzbl %cl, %ecx @@ -4731,13 +4731,13 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm0, %xmm2 -; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm0, %xmm1 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: xorps %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB118_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: movaps %xmm1, %xmm3 ; CHECK-NEXT: .LBB118_2: # %entry ; CHECK-NEXT: subss %xmm3, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -4746,11 +4746,11 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-NEXT: shlq $63, %rax ; CHECK-NEXT: xorq %rcx, %rax ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm0, %xmm2 +; CHECK-NEXT: comiss %xmm0, %xmm1 ; CHECK-NEXT: xorps %xmm4, %xmm4 ; CHECK-NEXT: ja .LBB118_4 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: movaps %xmm2, %xmm4 +; CHECK-NEXT: movaps %xmm1, %xmm4 ; CHECK-NEXT: .LBB118_4: # %entry ; CHECK-NEXT: movq %rax, %xmm3 ; CHECK-NEXT: subss %xmm4, %xmm0 @@ -4761,11 +4761,11 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm0 ; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm4, %xmm2 +; CHECK-NEXT: comiss %xmm4, %xmm1 ; CHECK-NEXT: xorps %xmm5, %xmm5 ; CHECK-NEXT: ja .LBB118_6 ; CHECK-NEXT: # %bb.5: # %entry -; CHECK-NEXT: movaps %xmm2, %xmm5 +; CHECK-NEXT: movaps %xmm1, %xmm5 ; CHECK-NEXT: .LBB118_6: # %entry ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; CHECK-NEXT: subss %xmm5, %xmm4 @@ -4776,12 +4776,12 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm3 ; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: comiss %xmm4, %xmm2 +; CHECK-NEXT: comiss %xmm4, %xmm1 ; CHECK-NEXT: ja .LBB118_8 ; CHECK-NEXT: # %bb.7: # %entry -; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm2 ; CHECK-NEXT: .LBB118_8: # %entry -; CHECK-NEXT: subss %xmm1, %xmm4 +; CHECK-NEXT: subss %xmm2, %xmm4 ; CHECK-NEXT: cvttss2si 
%xmm4, %rax ; CHECK-NEXT: setbe %cl ; CHECK-NEXT: movzbl %cl, %ecx @@ -5033,14 +5033,14 @@ define <1 x i64> @constrained_vector_fptoui_v1i64_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: comisd %xmm0, %xmm2 -; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: comisd %xmm0, %xmm1 +; CHECK-NEXT: xorpd %xmm2, %xmm2 ; CHECK-NEXT: ja .LBB123_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movapd %xmm2, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm2 ; CHECK-NEXT: .LBB123_2: # %entry -; CHECK-NEXT: subsd %xmm1, %xmm0 +; CHECK-NEXT: subsd %xmm2, %xmm0 ; CHECK-NEXT: cvttsd2si %xmm0, %rcx ; CHECK-NEXT: setbe %al ; CHECK-NEXT: movzbl %al, %eax @@ -5081,13 +5081,13 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: comisd %xmm2, %xmm1 -; CHECK-NEXT: xorpd %xmm0, %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: comisd %xmm2, %xmm0 +; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: xorpd %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB124_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movapd %xmm1, %xmm3 +; CHECK-NEXT: movapd %xmm0, %xmm3 ; CHECK-NEXT: .LBB124_2: # %entry ; CHECK-NEXT: subsd %xmm3, %xmm2 ; CHECK-NEXT: cvttsd2si %xmm2, %rax @@ -5097,12 +5097,12 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm2 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; CHECK-NEXT: comisd %xmm3, %xmm1 +; CHECK-NEXT: comisd %xmm3, %xmm0 ; CHECK-NEXT: ja .LBB124_4 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: .LBB124_4: # %entry -; CHECK-NEXT: subsd %xmm0, %xmm3 +; CHECK-NEXT: subsd %xmm1, %xmm3 ; CHECK-NEXT: cvttsd2si %xmm3, %rax ; CHECK-NEXT: setbe %cl ; CHECK-NEXT: movzbl %cl, %ecx @@ -5173,13 +5173,13 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: comisd %xmm2, %xmm1 -; CHECK-NEXT: xorpd %xmm0, %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: comisd %xmm2, %xmm0 +; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: xorpd %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB125_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movapd %xmm1, %xmm3 +; CHECK-NEXT: movapd %xmm0, %xmm3 ; CHECK-NEXT: .LBB125_2: # %entry ; CHECK-NEXT: subsd %xmm3, %xmm2 ; CHECK-NEXT: cvttsd2si %xmm2, %rcx @@ -5188,11 +5188,11 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; CHECK-NEXT: shlq $63, %rax ; CHECK-NEXT: xorq %rcx, %rax ; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: comisd %xmm2, %xmm1 +; CHECK-NEXT: comisd %xmm2, %xmm0 ; CHECK-NEXT: xorpd %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB125_4 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: movapd %xmm1, %xmm3 +; CHECK-NEXT: movapd %xmm0, %xmm3 ; CHECK-NEXT: .LBB125_4: # %entry ; CHECK-NEXT: subsd %xmm3, %xmm2 ; CHECK-NEXT: cvttsd2si %xmm2, %rcx @@ -5201,12 +5201,12 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; CHECK-NEXT: shlq $63, %rdx ; CHECK-NEXT: xorq %rcx, %rdx ; CHECK-NEXT: 
movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: comisd %xmm2, %xmm1 +; CHECK-NEXT: comisd %xmm2, %xmm0 ; CHECK-NEXT: ja .LBB125_6 ; CHECK-NEXT: # %bb.5: # %entry -; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: .LBB125_6: # %entry -; CHECK-NEXT: subsd %xmm0, %xmm2 +; CHECK-NEXT: subsd %xmm1, %xmm2 ; CHECK-NEXT: cvttsd2si %xmm2, %rsi ; CHECK-NEXT: setbe %cl ; CHECK-NEXT: movzbl %cl, %ecx @@ -5286,13 +5286,13 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: comisd %xmm0, %xmm2 -; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: comisd %xmm0, %xmm1 +; CHECK-NEXT: xorpd %xmm2, %xmm2 ; CHECK-NEXT: xorpd %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB126_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movapd %xmm2, %xmm3 +; CHECK-NEXT: movapd %xmm1, %xmm3 ; CHECK-NEXT: .LBB126_2: # %entry ; CHECK-NEXT: subsd %xmm3, %xmm0 ; CHECK-NEXT: cvttsd2si %xmm0, %rcx @@ -5301,11 +5301,11 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-NEXT: shlq $63, %rax ; CHECK-NEXT: xorq %rcx, %rax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: comisd %xmm0, %xmm2 +; CHECK-NEXT: comisd %xmm0, %xmm1 ; CHECK-NEXT: xorpd %xmm4, %xmm4 ; CHECK-NEXT: ja .LBB126_4 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: movapd %xmm2, %xmm4 +; CHECK-NEXT: movapd %xmm1, %xmm4 ; CHECK-NEXT: .LBB126_4: # %entry ; CHECK-NEXT: movq %rax, %xmm3 ; CHECK-NEXT: subsd %xmm4, %xmm0 @@ -5316,11 +5316,11 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm0 ; CHECK-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; CHECK-NEXT: comisd %xmm4, %xmm2 +; CHECK-NEXT: comisd %xmm4, %xmm1 ; CHECK-NEXT: xorpd %xmm5, %xmm5 ; CHECK-NEXT: ja .LBB126_6 ; CHECK-NEXT: # %bb.5: # %entry -; CHECK-NEXT: movapd %xmm2, %xmm5 +; CHECK-NEXT: movapd %xmm1, %xmm5 ; CHECK-NEXT: .LBB126_6: # %entry ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; CHECK-NEXT: subsd %xmm5, %xmm4 @@ -5331,12 +5331,12 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm3 ; CHECK-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; CHECK-NEXT: comisd %xmm4, %xmm2 +; CHECK-NEXT: comisd %xmm4, %xmm1 ; CHECK-NEXT: ja .LBB126_8 ; CHECK-NEXT: # %bb.7: # %entry -; CHECK-NEXT: movapd %xmm2, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm2 ; CHECK-NEXT: .LBB126_8: # %entry -; CHECK-NEXT: subsd %xmm1, %xmm4 +; CHECK-NEXT: subsd %xmm2, %xmm4 ; CHECK-NEXT: cvttsd2si %xmm4, %rax ; CHECK-NEXT: setbe %cl ; CHECK-NEXT: movzbl %cl, %ecx @@ -7717,30 +7717,30 @@ entry: define <16 x float> @vpaddd_mask_test(<16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone strictfp { ; CHECK-LABEL: vpaddd_mask_test: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm10, %xmm10 -; CHECK-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; CHECK-NEXT: pcmpeqd %xmm10, %xmm8 +; CHECK-NEXT: pxor %xmm8, %xmm8 ; CHECK-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; CHECK-NEXT: pcmpeqd %xmm10, %xmm9 +; CHECK-NEXT: pcmpeqd %xmm8, %xmm9 +; CHECK-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; CHECK-NEXT: pcmpeqd %xmm8, %xmm10 ; CHECK-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 -; CHECK-NEXT: pcmpeqd %xmm10, %xmm11 -; CHECK-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm10 +; CHECK-NEXT: pcmpeqd %xmm8, %xmm11 +; CHECK-NEXT: pcmpeqd 
{{[0-9]+}}(%rsp), %xmm8 ; CHECK-NEXT: addps %xmm3, %xmm7 ; CHECK-NEXT: addps %xmm2, %xmm6 ; CHECK-NEXT: addps %xmm1, %xmm5 ; CHECK-NEXT: addps %xmm0, %xmm4 -; CHECK-NEXT: andps %xmm10, %xmm0 -; CHECK-NEXT: andnps %xmm4, %xmm10 -; CHECK-NEXT: orps %xmm10, %xmm0 +; CHECK-NEXT: andps %xmm8, %xmm0 +; CHECK-NEXT: andnps %xmm4, %xmm8 +; CHECK-NEXT: orps %xmm8, %xmm0 ; CHECK-NEXT: andps %xmm11, %xmm1 ; CHECK-NEXT: andnps %xmm5, %xmm11 ; CHECK-NEXT: orps %xmm11, %xmm1 -; CHECK-NEXT: andps %xmm9, %xmm2 -; CHECK-NEXT: andnps %xmm6, %xmm9 -; CHECK-NEXT: orps %xmm9, %xmm2 -; CHECK-NEXT: andps %xmm8, %xmm3 -; CHECK-NEXT: andnps %xmm7, %xmm8 -; CHECK-NEXT: orps %xmm8, %xmm3 +; CHECK-NEXT: andps %xmm10, %xmm2 +; CHECK-NEXT: andnps %xmm6, %xmm10 +; CHECK-NEXT: orps %xmm10, %xmm2 +; CHECK-NEXT: andps %xmm9, %xmm3 +; CHECK-NEXT: andnps %xmm7, %xmm9 +; CHECK-NEXT: orps %xmm9, %xmm3 ; CHECK-NEXT: retq ; ; AVX1-LABEL: vpaddd_mask_test: diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 2b97280113bb63..7ca97f38f66d04 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -180,22 +180,22 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; X86-SSE2-LABEL: var_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlq $1, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: psrlq %xmm5, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 -; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] -; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] +; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm3, %xmm0 +; X86-SSE2-NEXT: orpd %xmm5, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ret <2 x i64> %res @@ -204,26 +204,26 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psrld %xmm3, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld %xmm7, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: 
psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: psrld %xmm5, %xmm7 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm4, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,3] +; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 @@ -234,7 +234,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE2-NEXT: pmuludq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v4i32: @@ -616,24 +616,24 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pslld $23, %xmm5 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm4, %xmm5 -; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pslld $23, %xmm4 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE2-NEXT: paddd %xmm5, %xmm4 +; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm4 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm5, %xmm3 +; X86-SSE2-NEXT: pmuludq %xmm4, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm6, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm6, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; X86-SSE2-NEXT: psrad $16, %xmm3 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; X86-SSE2-NEXT: pslld $23, %xmm2 -; X86-SSE2-NEXT: paddd %xmm4, %xmm2 +; X86-SSE2-NEXT: paddd %xmm5, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 @@ -653,47 +653,47 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; SSE2-LABEL: var_funnnel_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm6 -; SSE2-NEXT: cvttps2dq %xmm6, %xmm6 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $23, %xmm4 -; SSE2-NEXT: paddd %xmm3, %xmm4 -; SSE2-NEXT: cvttps2dq %xmm4, %xmm7 -; SSE2-NEXT: pslld $16, %xmm7 -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: packssdw %xmm6, %xmm7 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm7, %xmm4 -; SSE2-NEXT: psrlw $8, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $23, %xmm5 -; SSE2-NEXT: paddd %xmm3, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm6, %xmm5 ; SSE2-NEXT: cvttps2dq %xmm5, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $23, %xmm4 +; SSE2-NEXT: paddd %xmm6, %xmm4 +; SSE2-NEXT: cvttps2dq %xmm4, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: psrlw $8, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm2 +; SSE2-NEXT: paddd %xmm6, %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm5, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: pmullw %xmm1, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 ; SSE2-NEXT: 
movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -916,10 +916,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; X86-SSE2-LABEL: var_funnnel_v16i8: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: pxor %xmm5, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pslld $23, %xmm6 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] @@ -927,36 +927,36 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; X86-SSE2-NEXT: cvttps2dq %xmm6, %xmm6 ; X86-SSE2-NEXT: pslld $16, %xmm6 ; X86-SSE2-NEXT: psrad $16, %xmm6 -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; X86-SSE2-NEXT: pslld $23, %xmm4 -; X86-SSE2-NEXT: paddd %xmm3, %xmm4 -; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm7 +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: pslld $23, %xmm5 +; X86-SSE2-NEXT: paddd %xmm3, %xmm5 +; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm7 ; X86-SSE2-NEXT: pslld $16, %xmm7 ; X86-SSE2-NEXT: psrad $16, %xmm7 ; X86-SSE2-NEXT: packssdw %xmm6, %xmm7 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; X86-SSE2-NEXT: pmullw %xmm7, %xmm4 -; X86-SSE2-NEXT: psrlw $8, %xmm4 -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pslld $23, %xmm5 -; X86-SSE2-NEXT: paddd %xmm3, %xmm5 -; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5 -; X86-SSE2-NEXT: pslld $16, %xmm5 -; X86-SSE2-NEXT: psrad $16, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; X86-SSE2-NEXT: pmullw %xmm7, %xmm5 +; X86-SSE2-NEXT: psrlw $8, %xmm5 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pslld $23, %xmm4 +; X86-SSE2-NEXT: paddd %xmm3, %xmm4 +; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm4 +; X86-SSE2-NEXT: pslld $16, %xmm4 +; X86-SSE2-NEXT: psrad $16, %xmm4 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd %xmm3, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 -; X86-SSE2-NEXT: packssdw %xmm5, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm4, %xmm2 ; X86-SSE2-NEXT: 
punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X86-SSE2-NEXT: pmullw %xmm1, %xmm2 ; X86-SSE2-NEXT: psrlw $8, %xmm2 -; X86-SSE2-NEXT: packuswb %xmm4, %xmm2 +; X86-SSE2-NEXT: packuswb %xmm5, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 37d4f3b3dff54d..12410e71d3e3ec 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -524,37 +524,36 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; ; SSE41-LABEL: var_funnnel_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlw $4, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psllw $4, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: psllw $5, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psrlw $6, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psllw $2, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psrlw $7, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm2, %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index 0b6361ffd4fae3..68b30e85e89476 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -24,26 +24,26 @@ declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psrld %xmm3, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = 
xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld %xmm7, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: psrld %xmm5, %xmm7 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm4, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,3] +; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 @@ -54,7 +54,7 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; SSE2-NEXT: pmuludq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v2i32: diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index ea54d0567eccf1..9eca4335e1d85c 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -181,22 +181,22 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; X86-SSE2-LABEL: var_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pand %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: psrlq %xmm5, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 -; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] -; X86-SSE2-NEXT: pandn %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm3, %xmm0 +; X86-SSE2-NEXT: orpd %xmm5, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ret <2 x i64> %res @@ -205,25 +205,25 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 
= [31,31,31,31] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psrld %xmm3, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld %xmm7, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: psrld %xmm5, %xmm7 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm4, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,3] +; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 @@ -235,7 +235,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE2-NEXT: pmuludq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v4i32: diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index b39b7c140a451d..8ae44e951267f7 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -510,54 +510,54 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX1-LABEL: var_funnnel_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm5 -; AVX1-NEXT: vpsllw $4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm4 +; AVX1-NEXT: vpsllw $4, %xmm4, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm6 +; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm8 ; AVX1-NEXT: vpsllw $5, %xmm8, %xmm8 -; AVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpsllw $2, %xmm6, %xmm9 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm9 +; AVX1-NEXT: vpblendvb %xmm8, 
%xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsllw $2, %xmm4, %xmm5 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5 ; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6 -; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm9 +; AVX1-NEXT: vpblendvb %xmm8, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5 ; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 -; AVX1-NEXT: vpsrlw $4, %xmm8, %xmm9 +; AVX1-NEXT: vpblendvb %xmm8, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm8 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm10, %xmm9, %xmm9 -; AVX1-NEXT: vpsllw $5, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8 -; AVX1-NEXT: vpsrlw $2, %xmm8, %xmm9 +; AVX1-NEXT: vpand %xmm10, %xmm8, %xmm8 +; AVX1-NEXT: vpsllw $5, %xmm6, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw $2, %xmm5, %xmm8 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX1-NEXT: vpand %xmm11, %xmm9, %xmm9 -; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8 -; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm9 +; AVX1-NEXT: vpand %xmm11, %xmm8, %xmm8 +; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm8 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm12, %xmm9, %xmm9 -; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm12, %xmm8, %xmm8 +; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $4, %xmm0, %xmm7 -; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm4 -; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 -; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5 +; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm7, %xmm2, %xmm5 +; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3 +; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3 -; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm3 ; AVX1-NEXT: vpand %xmm3, %xmm10, %xmm3 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 @@ -571,7 +571,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v32i8: diff --git 
a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 2bd03507a0249c..ff521cf4d63097 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -223,56 +223,56 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512F-LABEL: var_funnnel_v64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 -; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6 +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpxor %ymm7, %ymm3, %ymm8 +; AVX512F-NEXT: vpxor %ymm7, %ymm6, %ymm8 ; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6 +; AVX512F-NEXT: vpand %ymm4, %ymm9, %ymm4 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 -; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm6 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm9, %ymm4 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm5 -; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5 -; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm5 -; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, 
%ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm5 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm4 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5 +; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm4 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 +; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4 @@ -290,56 +290,56 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VL-LABEL: var_funnnel_v64i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6 +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm6 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpxor %ymm7, %ymm3, %ymm8 +; AVX512VL-NEXT: vpxor %ymm7, %ymm6, %ymm8 ; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6 +; AVX512VL-NEXT: vpand %ymm4, %ymm9, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, 
%ymm3, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 -; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5 -; AVX512VL-NEXT: vpxor %ymm7, %ymm2, %ymm6 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpxor %ymm7, %ymm2, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm9, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm4 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5 +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm4 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 +; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll 
b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 56896927e7e5ad..9e83058a8f04ee 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -24,25 +24,25 @@ declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psrld %xmm3, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld %xmm7, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: psrld %xmm5, %xmm7 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm4, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,3] +; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 @@ -54,7 +54,7 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; SSE2-NEXT: pmuludq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v2i32: @@ -482,12 +482,12 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: psrld $5, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: psrld $4, %xmm2 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; X86-SSE2-NEXT: psrld $5, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: psrld $4, %xmm3 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3] ; X86-SSE2-NEXT: movl $268435456, %eax # imm = 0x10000000 ; X86-SSE2-NEXT: movd %eax, %xmm1 ; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 @@ -496,7 +496,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll index b4cffcd171b332..b1662913def574 100644 --- a/llvm/test/CodeGen/X86/vector-gep.ll +++ b/llvm/test/CodeGen/X86/vector-gep.ll @@ -123,65 +123,65 @@ define <64 x ptr> @AGEP9(ptr %param, <64 x i32> %off) nounwind { ; CHECK-NEXT: andl $-32, %esp ; CHECK-NEXT: subl $160, %esp ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 -; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm5 -; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm6 +; CHECK-NEXT: vpaddd %xmm3, %xmm6, %xmm3 ; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovdqa 40(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovdqa 56(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovdqa 72(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill ; CHECK-NEXT: vmovdqa 88(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm2 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm2 ; CHECK-NEXT: vmovdqa 104(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm1 ; CHECK-NEXT: vmovdqa 120(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 -; CHECK-NEXT: vmovdqa 136(%ebp), %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6 +; CHECK-NEXT: vpaddd %xmm0, %xmm6, %xmm0 +; CHECK-NEXT: vmovdqa 136(%ebp), %xmm5 +; CHECK-NEXT: vpaddd %xmm5, %xmm5, %xmm5 +; CHECK-NEXT: vpaddd %xmm5, %xmm6, %xmm5 ; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7 ; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm5, 
%xmm7 +; CHECK-NEXT: vpaddd %xmm7, %xmm6, %xmm7 ; CHECK-NEXT: vmovdqa 168(%ebp), %xmm4 ; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpaddd %xmm4, %xmm6, %xmm4 ; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3 ; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm6, %xmm3 ; CHECK-NEXT: movl 8(%ebp), %eax ; CHECK-NEXT: vmovdqa %xmm3, 240(%eax) ; CHECK-NEXT: vmovdqa %xmm4, 224(%eax) ; CHECK-NEXT: vmovdqa %xmm7, 208(%eax) -; CHECK-NEXT: vmovdqa %xmm6, 192(%eax) +; CHECK-NEXT: vmovdqa %xmm5, 192(%eax) ; CHECK-NEXT: vmovdqa %xmm0, 176(%eax) ; CHECK-NEXT: vmovdqa %xmm1, 160(%eax) ; CHECK-NEXT: vmovdqa %xmm2, 144(%eax) diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index ed6f5007b77e32..c76846ee461f3b 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -3132,13 +3132,13 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; AVX-LABEL: cvt_2f64_to_2i16: ; AVX: # %bb.0: ; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: callq __truncdfhf2@PLT -; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __truncdfhf2@PLT -; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: retq @@ -3177,14 +3177,14 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -3208,14 +3208,14 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; 
AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -3309,14 +3309,14 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -3340,14 +3340,14 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -3442,14 +3442,14 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -3473,14 +3473,14 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -4000,14 +4000,14 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -4035,14 +4035,14 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 
(%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -4150,14 +4150,14 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -4185,14 +4185,14 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 34c584e8eb7add..3cae6e09d887b0 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -670,81 +670,81 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE2-LABEL: test_remconstant_16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: psrlw $8, %xmm3 -; SSE2-NEXT: packuswb %xmm2, %xmm3 -; SSE2-NEXT: psrlw $7, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: paddb %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: packuswb %xmm1, %xmm3 +; SSE2-NEXT: psrlw $7, %xmm2 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: paddb %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: psubb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_remconstant_16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE41-NEXT: psraw $8, %xmm2 -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255] +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: paddb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE41-NEXT: psraw $8, %xmm1 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE41-NEXT: psraw $8, %xmm3 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm2, %xmm3 -; SSE41-NEXT: psrlw $7, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm3 +; 
SSE41-NEXT: psrlw $7, %xmm2 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: paddb %xmm3, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: packuswb %xmm1, %xmm2 -; SSE41-NEXT: psubb %xmm2, %xmm0 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: packuswb %xmm2, %xmm1 +; SSE41-NEXT: psubb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_remconstant_16i8: diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll index f15f5cba290307..38a6482a063271 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -649,17 +649,17 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-LABEL: test_remconstant_32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5 @@ -673,45 +673,45 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = 
xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpsubb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_remconstant_32i8: diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 1b55a401f401d2..3754bb94068cbe 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -530,17 +530,17 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_remconstant_64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31] -; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm3 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm5 ; AVX512F-NEXT: vpaddb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm5 @@ -554,45 +554,45 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; 
AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm7 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm7, %ymm7 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 -; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpackuswb %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpsubb %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm5 -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm5, %ymm2 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm4, 
%ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3 +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_remconstant_64i8: diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 1ce21cb39b2e8f..cca90db6110152 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -820,29 +820,29 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE41-NEXT: psrlw $8, %xmm4 ; SSE41-NEXT: packuswb %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psubb %xmm4, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: paddb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psubb %xmm4, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: packuswb %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: packuswb %xmm2, %xmm3 -; SSE41-NEXT: psubb %xmm3, %xmm0 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm2 +; SSE41-NEXT: psubb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_remconstant_16i8: diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll index 10a840218c8640..bd7120179d4ca0 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll @@ -131,27 +131,27 @@ define void @test_sdiv7_v2i32(ptr %x, ptr %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: pmuludq %xmm2, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-NEXT: movdqa %xmm1, %xmm3 +; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X86-NEXT: movdqa %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86-NEXT: movdqa %xmm0, %xmm3 ; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] -; X86-NEXT: pmuludq %xmm2, %xmm3 +; X86-NEXT: pmuludq %xmm1, %xmm3 ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-NEXT: pxor %xmm3, %xmm3 -; X86-NEXT: pcmpgtd %xmm1, %xmm3 -; X86-NEXT: pand %xmm2, %xmm3 -; X86-NEXT: paddd %xmm1, %xmm3 -; X86-NEXT: psubd %xmm3, %xmm0 -; X86-NEXT: paddd %xmm1, %xmm0 -; X86-NEXT: movdqa %xmm0, %xmm1 -; X86-NEXT: psrld $31, %xmm1 -; X86-NEXT: psrad $2, %xmm0 -; X86-NEXT: paddd %xmm1, %xmm0 -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-NEXT: pand %xmm1, %xmm3 +; X86-NEXT: paddd %xmm0, %xmm3 +; X86-NEXT: psubd %xmm3, %xmm2 +; X86-NEXT: paddd %xmm0, %xmm2 +; X86-NEXT: movdqa %xmm2, %xmm0 +; X86-NEXT: psrld $31, %xmm0 +; X86-NEXT: psrad $2, %xmm2 +; X86-NEXT: paddd %xmm0, %xmm2 +; X86-NEXT: movq %xmm2, (%eax) ; X86-NEXT: retl %a = load <2 x i32>, ptr %x %b = sdiv <2 x i32> %a, diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll index 63ca7c6a005732..fefa17b26fc05d 100644 --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -14,17 +14,17 @@ define <64 
x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x ; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] @@ -32,15 +32,15 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] @@ -51,8 +51,8 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x ; SSE-NEXT: movdqa %xmm4, 96(%rdi) ; 
SSE-NEXT: movdqa %xmm0, 80(%rdi) ; SSE-NEXT: movdqa %xmm7, 64(%rdi) -; SSE-NEXT: movdqa %xmm1, 48(%rdi) -; SSE-NEXT: movdqa %xmm2, 32(%rdi) +; SSE-NEXT: movdqa %xmm2, 48(%rdi) +; SSE-NEXT: movdqa %xmm1, 32(%rdi) ; SSE-NEXT: movdqa %xmm8, 16(%rdi) ; SSE-NEXT: movdqa %xmm5, (%rdi) ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index 900847c8a191fe..fa5b89f750c9e5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -134,17 +134,17 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i16_stride2_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pslld $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pslld $16, %xmm5 ; SSE-NEXT: psrad $16, %xmm5 ; SSE-NEXT: packssdw %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pslld $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 @@ -152,15 +152,15 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: psrad $16, %xmm6 ; SSE-NEXT: packssdw %xmm4, %xmm6 ; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm3, %xmm1 ; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm6, (%rsi) ; SSE-NEXT: movdqa %xmm5, 16(%rsi) ; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride2_vf16: @@ -265,55 +265,55 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 112(%rdi), %xmm5 ; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm7 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pslld $16, %xmm8 -; SSE-NEXT: psrad $16, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: psrad $16, %xmm5 -; SSE-NEXT: packssdw %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pslld $16, %xmm10 -; SSE-NEXT: psrad $16, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pslld $16, %xmm8 -; SSE-NEXT: psrad $16, %xmm8 -; SSE-NEXT: packssdw %xmm10, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pslld $16, %xmm11 -; SSE-NEXT: psrad $16, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pslld $16, %xmm9 +; SSE-NEXT: psrad $16, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pslld $16, %xmm6 +; SSE-NEXT: psrad $16, %xmm6 +; SSE-NEXT: packssdw %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pslld $16, %xmm9 +; SSE-NEXT: psrad $16, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pslld $16, %xmm10 ; SSE-NEXT: psrad $16, %xmm10 -; SSE-NEXT: packssdw %xmm11, %xmm10 -; 
SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: packssdw %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pslld $16, %xmm9 +; SSE-NEXT: psrad $16, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pslld $16, %xmm11 ; SSE-NEXT: psrad $16, %xmm11 +; SSE-NEXT: packssdw %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pslld $16, %xmm9 +; SSE-NEXT: psrad $16, %xmm9 ; SSE-NEXT: movdqa %xmm0, %xmm12 ; SSE-NEXT: pslld $16, %xmm12 ; SSE-NEXT: psrad $16, %xmm12 -; SSE-NEXT: packssdw %xmm11, %xmm12 -; SSE-NEXT: psrad $16, %xmm9 +; SSE-NEXT: packssdw %xmm9, %xmm12 +; SSE-NEXT: psrad $16, %xmm8 ; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: packssdw %xmm9, %xmm3 +; SSE-NEXT: packssdw %xmm8, %xmm3 ; SSE-NEXT: psrad $16, %xmm7 ; SSE-NEXT: psrad $16, %xmm2 ; SSE-NEXT: packssdw %xmm7, %xmm2 -; SSE-NEXT: psrad $16, %xmm6 +; SSE-NEXT: psrad $16, %xmm5 ; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm6, %xmm1 +; SSE-NEXT: packssdw %xmm5, %xmm1 ; SSE-NEXT: psrad $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm12, 32(%rsi) -; SSE-NEXT: movdqa %xmm10, 48(%rsi) -; SSE-NEXT: movdqa %xmm8, (%rsi) -; SSE-NEXT: movdqa %xmm5, 16(%rsi) +; SSE-NEXT: movdqa %xmm11, 48(%rsi) +; SSE-NEXT: movdqa %xmm10, (%rsi) +; SSE-NEXT: movdqa %xmm6, 16(%rsi) ; SSE-NEXT: movdqa %xmm0, 32(%rdx) ; SSE-NEXT: movdqa %xmm1, 48(%rdx) ; SSE-NEXT: movdqa %xmm2, (%rdx) @@ -501,132 +501,134 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i16_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movdqa 96(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: pslld $16, %xmm12 -; SSE-NEXT: psrad $16, %xmm12 -; SSE-NEXT: packssdw %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: pslld $16, %xmm10 -; SSE-NEXT: psrad $16, %xmm10 -; SSE-NEXT: packssdw %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pslld $16, %xmm11 +; SSE-NEXT: psrad $16, %xmm11 +; SSE-NEXT: packssdw %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm9 ; SSE-NEXT: psrad $16, %xmm9 ; SSE-NEXT: packssdw %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: pslld $16, %xmm10 +; SSE-NEXT: psrad $16, %xmm10 +; SSE-NEXT: packssdw %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: pslld $16, %xmm11 -; SSE-NEXT: psrad $16, %xmm11 -; SSE-NEXT: packssdw %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pslld $16, %xmm12 +; SSE-NEXT: psrad $16, %xmm12 +; SSE-NEXT: packssdw %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: pslld $16, %xmm13 -; SSE-NEXT: psrad $16, %xmm13 -; SSE-NEXT: packssdw %xmm0, %xmm13 -; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm14 +; SSE-NEXT: psrad $16, %xmm14 +; SSE-NEXT: packssdw %xmm0, %xmm14 +; SSE-NEXT: movdqa 240(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: movdqa 224(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: movdqa 224(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm15 ; SSE-NEXT: pslld $16, %xmm15 ; SSE-NEXT: psrad $16, %xmm15 ; SSE-NEXT: packssdw %xmm0, %xmm15 -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pslld $16, %xmm3 +; SSE-NEXT: psrad $16, %xmm3 +; SSE-NEXT: packssdw %xmm1, %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pslld $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm4 -; SSE-NEXT: packssdw %xmm1, %xmm4 -; SSE-NEXT: movdqa 208(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pslld $16, %xmm6 -; SSE-NEXT: psrad $16, %xmm6 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm6, %xmm1 -; SSE-NEXT: psrad $16, %xmm14 +; SSE-NEXT: packssdw %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm4 +; SSE-NEXT: packssdw %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: psrad $16, %xmm6 -; SSE-NEXT: packssdw %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm4 +; SSE-NEXT: 
packssdw %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: psrad $16, %xmm14 -; SSE-NEXT: packssdw %xmm0, %xmm14 -; SSE-NEXT: psrad $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm4 +; SSE-NEXT: packssdw %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrad $16, %xmm5 -; SSE-NEXT: packssdw %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm6 -; SSE-NEXT: packssdw %xmm0, %xmm6 +; SSE-NEXT: packssdw %xmm5, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: packssdw %xmm0, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm4 +; SSE-NEXT: packssdw %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm7 -; SSE-NEXT: packssdw %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm5 +; SSE-NEXT: packssdw %xmm0, %xmm5 +; SSE-NEXT: psrad $16, %xmm13 ; SSE-NEXT: psrad $16, %xmm8 +; SSE-NEXT: packssdw %xmm13, %xmm8 +; SSE-NEXT: psrad $16, %xmm7 ; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: packssdw %xmm8, %xmm2 +; SSE-NEXT: packssdw %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm1, 96(%rsi) -; SSE-NEXT: movdqa %xmm4, 32(%rsi) +; SSE-NEXT: movdqa %xmm3, 32(%rsi) ; SSE-NEXT: movdqa %xmm15, 112(%rsi) -; SSE-NEXT: movdqa %xmm13, 48(%rsi) -; SSE-NEXT: movdqa %xmm11, 64(%rsi) -; SSE-NEXT: movdqa %xmm9, (%rsi) -; SSE-NEXT: movdqa %xmm10, 80(%rsi) -; SSE-NEXT: movdqa %xmm12, 16(%rsi) +; SSE-NEXT: movdqa %xmm14, 48(%rsi) +; SSE-NEXT: movdqa %xmm12, 64(%rsi) +; SSE-NEXT: movdqa %xmm10, (%rsi) +; SSE-NEXT: movdqa %xmm9, 80(%rsi) +; SSE-NEXT: movdqa %xmm11, 16(%rsi) ; SSE-NEXT: movdqa %xmm2, 96(%rdx) -; SSE-NEXT: movdqa %xmm7, 112(%rdx) -; SSE-NEXT: movdqa %xmm3, 64(%rdx) -; SSE-NEXT: movdqa %xmm6, 80(%rdx) -; SSE-NEXT: movdqa %xmm5, 32(%rdx) -; SSE-NEXT: movdqa %xmm14, 48(%rdx) +; SSE-NEXT: movdqa %xmm8, 112(%rdx) +; SSE-NEXT: movdqa %xmm5, 64(%rdx) +; SSE-NEXT: movdqa %xmm4, 80(%rdx) +; SSE-NEXT: movdqa %xmm6, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 1b47c0f3193d3a..2e54504d64819c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -282,27 +282,27 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i16_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), 
%xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,0] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,2,3,4,5,6,7] @@ -318,19 +318,19 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 ; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movdqa %xmm8, (%rdx) ; SSE-NEXT: movdqa %xmm6, (%rcx) ; SSE-NEXT: retq @@ -426,105 +426,105 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE: # %bb.0: ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm7 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; 
SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm9[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,0] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,0] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm12, %xmm9 -; SSE-NEXT: por %xmm11, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm9[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm10[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] +; 
SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm12, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm12 ; SSE-NEXT: por %xmm11, %xmm12 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm10, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: por %xmm11, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) -; SSE-NEXT: movdqa %xmm11, 16(%rdx) -; SSE-NEXT: movdqa %xmm9, (%rdx) -; SSE-NEXT: movdqa %xmm8, 16(%rcx) -; 
SSE-NEXT: movdqa %xmm10, (%rcx) +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movaps %xmm7, 16(%rsi) +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movdqa %xmm13, 16(%rdx) +; SSE-NEXT: movdqa %xmm10, (%rdx) +; SSE-NEXT: movdqa %xmm9, 16(%rcx) +; SSE-NEXT: movdqa %xmm8, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf16: @@ -616,11 +616,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i16_stride3_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm3 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm3 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] @@ -633,7 +633,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -643,9 +643,9 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] @@ -685,343 +685,331 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) 
nounwind { ; SSE-LABEL: load_i16_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movdqa 96(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm13 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 64(%rdi), %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] -; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa 176(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm13 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[0,1,2,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,0] +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = 
xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm6[2,0] +; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm12, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,0],xmm7[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm14 +; 
SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] -; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 128(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 
; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm13, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pandn %xmm7, %xmm13 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movdqa %xmm5, 32(%rdx) -; SSE-NEXT: movdqa %xmm13, 48(%rdx) -; SSE-NEXT: movdqa %xmm15, (%rdx) -; SSE-NEXT: movdqa %xmm10, 16(%rdx) -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm4, 16(%rcx) -; SSE-NEXT: addq $40, %rsp -; SSE-NEXT: retq -; -; AVX1-ONLY-LABEL: load_i16_stride3_vf32: -; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: por %xmm12, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm2, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 48(%rdx) +; SSE-NEXT: movdqa %xmm4, (%rdx) +; SSE-NEXT: movdqa %xmm14, 16(%rdx) +; SSE-NEXT: movdqa %xmm13, 32(%rcx) +; SSE-NEXT: movdqa %xmm1, 48(%rcx) +; SSE-NEXT: movdqa %xmm6, (%rcx) 
+; SSE-NEXT: movdqa %xmm7, 16(%rcx) +; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: retq +; +; AVX1-ONLY-LABEL: load_i16_stride3_vf32: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5,6],xmm9[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: 
vpshufb %xmm10, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm8[2],xmm5[3,4],xmm8[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm6[0,1],xmm9[2],xmm6[3,4],xmm9[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, 
%xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3],mem[4],xmm1[5,6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1029,23 +1017,23 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX2-ONLY-NEXT: 
vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15] +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6],ymm5[7],ymm2[8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14],ymm5[15] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15] @@ -1058,40 +1046,40 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm11 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm11, %ymm11 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX2-ONLY-NEXT: vpshufb 
%xmm14, %xmm13, %xmm13 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] @@ -1100,11 +1088,11 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; 
AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rsi) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1223,447 +1211,437 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i16_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: subq $344, %rsp # imm = 0x158 ; SSE-NEXT: movdqa 192(%rdi), %xmm14 -; SSE-NEXT: movdqa 272(%rdi), %xmm6 -; SSE-NEXT: movdqa 240(%rdi), %xmm5 -; SSE-NEXT: movdqa 256(%rdi), %xmm7 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 64(%rdi), %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,2,1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; 
SSE-NEXT: movdqa 256(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] -; SSE-NEXT: movdqa 208(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 224(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 368(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw 
{{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 320(%rdi), %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,2,1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,0],xmm1[2,0] +; SSE-NEXT: movdqa 208(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm6[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm7[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm14, %xmm5 +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa 368(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm9[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw 
{{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm10[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa 304(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm9[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm12, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: 
movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm8 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm13, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: por %xmm3, %xmm11 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm8 +; 
SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm5, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,1,2,0] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} 
xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,2] -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1680,10 +1658,10 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movdqa %xmm3, 96(%rdx) -; SSE-NEXT: movdqa %xmm6, 32(%rdx) -; SSE-NEXT: movdqa %xmm10, 112(%rdx) -; SSE-NEXT: movdqa %xmm15, 48(%rdx) +; SSE-NEXT: movdqa %xmm8, 96(%rdx) +; SSE-NEXT: movdqa %xmm15, 32(%rdx) +; SSE-NEXT: movdqa %xmm9, 112(%rdx) +; SSE-NEXT: movdqa %xmm13, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1692,210 +1670,226 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movdqa %xmm13, 96(%rcx) -; SSE-NEXT: movdqa %xmm9, 112(%rcx) -; SSE-NEXT: movdqa %xmm11, 64(%rcx) -; SSE-NEXT: movdqa %xmm14, 80(%rcx) -; SSE-NEXT: movdqa %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm5, 48(%rcx) -; SSE-NEXT: movdqa %xmm8, (%rcx) -; SSE-NEXT: movdqa %xmm4, 16(%rcx) -; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: movdqa %xmm4, 96(%rcx) +; SSE-NEXT: movdqa %xmm12, 112(%rcx) +; SSE-NEXT: movdqa %xmm5, 64(%rcx) +; SSE-NEXT: movdqa %xmm3, 80(%rcx) +; SSE-NEXT: movdqa %xmm6, 32(%rcx) +; SSE-NEXT: movdqa %xmm7, 48(%rcx) +; SSE-NEXT: movdqa %xmm10, (%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: addq $344, %rsp # imm = 0x158 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), 
%xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm15[2],xmm11[3,4],xmm15[5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = 
<0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3,4],xmm9[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1],xmm12[2,3],xmm3[4],xmm12[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm1[3,4,5,6,7] 
-; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm13[2],xmm8[3,4],xmm13[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm4[2],xmm8[3,4],xmm4[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm9[0,1],mem[2],xmm9[3,4],mem[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm1[0,1],mem[2],xmm1[3,4],mem[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm5[0,1],mem[2],xmm5[3,4],mem[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm6[0,1],mem[2],xmm6[3,4],mem[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = 
[0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1],xmm8[2],xmm12[3,4],xmm8[5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm11[2],xmm8[3,4],xmm11[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm15[2],xmm3[3,4],xmm15[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm4[2],mem[3,4],xmm4[5],mem[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3,4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw $219, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm12[2],xmm8[3,4],xmm12[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm3[1],xmm15[2,3],xmm3[4],xmm15[5,6],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3],xmm6[4],mem[5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] @@ -1903,50 +1897,42 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3],mem[4],xmm7[5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm8[1],mem[2,3],xmm8[4],mem[5,6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rdx) +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm7, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm7, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm7, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm7, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm7, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm7, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm7, 80(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%rcx) ; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper @@ -1955,172 +1941,171 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-LABEL: load_i16_stride3_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $136, %rsp -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 
96(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11 -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm8 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6],ymm10[7],ymm8[8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13,14],ymm10[15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm8, %ymm10 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm4, %ymm6, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm5, %ymm1, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm6, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm15 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm7, %ymm0, %ymm13 
+; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7],ymm10[8,9,10],ymm8[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15] -; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5,6],ymm8[7],ymm12[8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13,14],ymm8[15] +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm12 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm8[0,1,2],ymm12[3,4,5,6,7],ymm8[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5,6],ymm8[7],ymm11[8],ymm8[9],ymm11[10,11],ymm8[12],ymm11[13,14],ymm8[15] +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7],ymm8[8,9,10],ymm4[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm1, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm15[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6],ymm4[7],ymm15[8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14],ymm4[15] +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] ; 
AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = 
ymm5[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7,8,9],ymm0[10],ymm5[11,12],ymm0[13],ymm5[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7] -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1],ymm2[2],ymm13[3,4],ymm2[5],ymm13[6,7,8,9],ymm2[10],ymm13[11,12],ymm2[13],ymm13[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; 
AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7],ymm1[8],ymm3[9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7],ymm3[8],ymm7[9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm9[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 32(%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -2135,17 +2120,17 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm5 +; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm6 ; AVX512F-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512F-NEXT: vmovdqa %xmm2, %xmm14 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm17 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512F-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] +; AVX512F-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm22 ; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm23 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm6 @@ -2162,47 +2147,48 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] ; AVX512F-NEXT: vpshufb %xmm15, %xmm13, %xmm13 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm24 +; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm18 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm14 ; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 -; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512F-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512F-NEXT: vpternlogq $202, 
%ymm14, %ymm13, %ymm10 +; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13,14],ymm12[15] +; AVX512F-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm11 ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512F-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512F-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512F-NEXT: vpshufb %ymm3, %ymm10, %ymm2 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] +; AVX512F-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm10[3,4,5,6,7] +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm5 +; AVX512F-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13,14],ymm10[15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm2 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm5[2],xmm10[3,4],xmm5[5],xmm10[6,7] ; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-NEXT: vmovdqa64 %ymm2, %ymm27 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512F-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512F-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512F-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512F-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512F-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-NEXT: vmovdqa64 
%xmm2, %xmm28 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -2210,57 +2196,58 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512F-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512F-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512F-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512F-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX512F-NEXT: vmovdqa64 %xmm8, %xmm26 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm17 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512F-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512F-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512F-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512F-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm1 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512F-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX512F-NEXT: vmovdqa64 %xmm28, %xmm3 +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm2 +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm10[2],xmm5[3,4],xmm10[5],xmm5[6,7] +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512F-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 +; AVX512F-NEXT: vpternlogq $226, %ymm14, %ymm9, %ymm13 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512F-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512F-NEXT: vpternlogq $226, %ymm16, %ymm0, %ymm15 +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm15[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12,13],ymm3[14],ymm15[15] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512F-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6],xmm5[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512F-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512F-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512F-NEXT: vpshufb %xmm7, %xmm5, %xmm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 ; AVX512F-NEXT: vextracti32x4 
$2, %zmm5, %xmm5 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] @@ -2274,23 +2261,23 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] -; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3],xmm4[4],xmm6[5,6],xmm4[7] +; AVX512F-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm5 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512F-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512F-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 ; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm4 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 5a7abf8cdf9fbf..31fb4be1bcfff0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -248,50 +248,50 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i16_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: 
movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm0, (%rsi) -; SSE-NEXT: movapd %xmm7, (%rdx) -; SSE-NEXT: movapd %xmm8, (%rcx) -; SSE-NEXT: movapd %xmm1, (%r8) +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm10, (%rsi) +; SSE-NEXT: movapd %xmm5, (%rdx) +; SSE-NEXT: movapd %xmm7, (%rcx) +; SSE-NEXT: movapd %xmm2, (%r8) ; SSE-NEXT: retq ; ; 
AVX1-ONLY-LABEL: load_i16_stride4_vf8: @@ -516,60 +516,62 @@ define void @load_i16_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i16_stride4_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 96(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movdqa 112(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[3,1,2,3] @@ -831,13 +833,13 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] @@ -845,48 +847,48 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = 
[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm12 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq 
{{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -914,14 +916,14 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] @@ -932,14 +934,14 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} 
xmm7 = xmm6[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] @@ -958,8 +960,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] @@ -977,13 +979,13 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1138,10 +1140,10 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i16_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $248, %rsp -; SSE-NEXT: movdqa 224(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa 224(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm2 @@ -1162,54 +1164,54 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm1 @@ -1217,10 +1219,10 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $237, (%rsp), %xmm1 # 16-byte Folded Reload @@ -1228,51 +1230,49 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm15[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] -; 
SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,3,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1280,296 +1280,294 @@ define void 
@load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm15[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} 
xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movapd %xmm12, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm15[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movapd %xmm7, 32(%rdx) +; SSE-NEXT: movapd %xmm11, (%rdx) +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movapd %xmm1, 32(%rcx) -; SSE-NEXT: movapd %xmm8, (%rcx) -; SSE-NEXT: movapd %xmm15, 48(%rcx) -; SSE-NEXT: movapd %xmm10, 16(%rcx) -; SSE-NEXT: movapd %xmm3, 32(%r8) -; SSE-NEXT: movapd %xmm7, (%r8) -; SSE-NEXT: movapd %xmm14, 48(%r8) -; SSE-NEXT: movapd %xmm11, 16(%r8) +; SSE-NEXT: movapd %xmm13, (%rcx) +; SSE-NEXT: movapd %xmm5, 48(%rcx) +; SSE-NEXT: movapd %xmm6, 16(%rcx) +; SSE-NEXT: movapd %xmm4, 32(%r8) +; SSE-NEXT: movapd %xmm14, (%r8) +; SSE-NEXT: movapd %xmm0, 48(%r8) +; SSE-NEXT: movapd %xmm9, 16(%r8) ; SSE-NEXT: addq $248, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $280, %rsp # imm = 0x118 -; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: subq $264, %rsp # 
imm = 0x108 +; AVX1-ONLY-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm6[1,2,3],xmm4[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm0[1,2,3],xmm4[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm6[1,2,3],xmm11[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm0[1,2,3],xmm12[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm0[1,2,3],xmm13[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm0[1,2,3],xmm11[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm6[1,2,3],xmm10[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 
-; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm6[1,2,3],xmm12[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm6[1,2,3],xmm14[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm6[1,2,3],xmm13[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm0[1,2,3],xmm15[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0],xmm0[1,2,3],xmm8[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm15, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm0[1,2,3],xmm6[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = 
xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: 
vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = 
xmm14[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq 
{{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw 
{{.*#+}} xmm0 = xmm10[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] @@ -1578,33 +1576,33 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm7[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -1624,13 +1622,13 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: addq $280, %rsp # imm = 0x118 +; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $168, %rsp +; AVX2-SLOW-NEXT: subq $184, %rsp ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1672,89 +1670,90 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm14 -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; 
AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] @@ -1762,49 +1761,50 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1812,36 +1812,36 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,3,1,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1851,25 +1851,25 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $168, %rsp +; AVX2-SLOW-NEXT: addq $184, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $104, %rsp -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1877,152 +1877,152 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm8 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm9 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm8 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm15 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm10[0,1],xmm8[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm14 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 ; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: 
vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = 
<2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) ; 
AVX2-FAST-NEXT: addq $104, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: subq $168, %rsp ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -2064,179 +2064,177 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm9 ; AVX2-FAST-PERLANE-NEXT: 
vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm15, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm14 +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] ; 
AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps 
%ymm0, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: addq $168, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2247,121 +2245,121 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vpmovqw %ymm2, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpmovqw %ymm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = 
xmm4[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm7 +; AVX512F-SLOW-NEXT: vpmovqw %zmm7, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm9 ; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm13 -; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX512F-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm14 = xmm13[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm12 +; AVX512F-SLOW-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[0,1,2,0,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm5[2,0,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm12[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, 
%ymm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 @@ -2369,8 +2367,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -2378,43 +2376,43 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-LABEL: load_i16_stride4_vf32: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm10 +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm10 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] ; AVX512F-FAST-NEXT: vpermt2d %ymm7, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm7 +; AVX512F-FAST-NEXT: vpmovqw %zmm5, %xmm7 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa 
96(%rdi), %ymm10 -; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm13 +; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm4 -; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm11, %ymm4 +; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm3 +; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm11, %ymm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm3[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm5, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm0, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 @@ -2422,40 +2420,40 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm13 ; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm15 ; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm11, %ymm15 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm1, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm5, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 ; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; 
AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm11, %ymm3 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX512F-FAST-NEXT: vpermt2d %ymm1, %ymm11, %ymm2 +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $48, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -2610,7 +2608,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -2621,7 +2619,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] @@ -2820,17 +2818,17 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2838,35 +2836,36 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm15[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2874,49 +2873,48 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm7[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2924,416 +2922,418 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = 
mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm6[0],xmm13[1] -; SSE-NEXT: pshuflw $231, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; 
SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm15[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] +; SSE-NEXT: pshuflw $231, (%rsp), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 16(%rdx) ; SSE-NEXT: movapd %xmm2, 96(%rcx) ; SSE-NEXT: movapd %xmm5, 32(%rcx) -; SSE-NEXT: movapd %xmm9, 112(%rcx) -; SSE-NEXT: movapd %xmm10, 48(%rcx) -; SSE-NEXT: movapd %xmm14, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movapd %xmm7, 112(%r8) -; SSE-NEXT: movapd %xmm4, 96(%r8) -; SSE-NEXT: movapd %xmm3, 80(%r8) +; SSE-NEXT: movapd %xmm11, 112(%rcx) +; SSE-NEXT: movapd %xmm12, 48(%rcx) +; SSE-NEXT: movapd %xmm13, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movapd %xmm8, 112(%r8) +; SSE-NEXT: movapd %xmm9, 96(%r8) +; SSE-NEXT: movapd %xmm0, 80(%r8) ; SSE-NEXT: movapd %xmm15, 64(%r8) -; SSE-NEXT: movapd %xmm12, 48(%r8) -; SSE-NEXT: movapd %xmm13, 32(%r8) -; SSE-NEXT: movapd %xmm8, 16(%r8) -; SSE-NEXT: movapd %xmm0, (%r8) +; SSE-NEXT: movapd %xmm10, 48(%r8) +; SSE-NEXT: movapd %xmm1, 32(%r8) +; SSE-NEXT: movapd %xmm3, 16(%r8) +; SSE-NEXT: movapd %xmm4, (%r8) ; SSE-NEXT: addq $824, %rsp # imm = 0x338 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $776, %rsp # imm = 0x308 -; AVX1-ONLY-NEXT: vpxor %xmm10, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm10[1,2,3],xmm8[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: subq $824, %rsp # imm = 0x338 +; AVX1-ONLY-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm3[1,2,3],xmm5[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm3[1,2,3],xmm10[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 
336(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm10[1,2,3],xmm4[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm10[1,2,3],xmm7[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm10[1,2,3],xmm13[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm3[1,2,3],xmm9[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm3[1,2,3],xmm13[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm10[1,2,3],xmm14[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm10[1,2,3],xmm15[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm3[1,2,3],xmm15[4],xmm3[5,6,7] 
+; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1,2,3],xmm11[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm3[1,2,3],xmm8[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0],xmm3[1,2,3],xmm6[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0],xmm3[1,2,3],xmm7[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm10[1,2,3],xmm5[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw 
{{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm3[2,0,2,3,4,5,6,7] @@ -3364,9 +3364,9 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -3381,37 +3381,32 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = 
mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -3439,12 +3434,12 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 @@ -3457,7 +3452,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] @@ -3487,7 +3482,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw $116, (%rsp), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] @@ -3506,9 +3501,9 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3523,13 +3518,13 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3570,7 +3565,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r8) -; AVX1-ONLY-NEXT: addq $776, %rsp # imm = 0x308 +; AVX1-ONLY-NEXT: addq $824, %rsp # imm = 0x338 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3661,9 +3656,9 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 @@ -3724,17 +3719,17 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3774,43 +3769,43 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3837,18 +3832,18 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3876,25 +3871,25 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] @@ -3909,54 +3904,54 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; 
AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] @@ -3965,23 +3960,23 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] @@ -3990,39 +3985,39 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -4041,7 +4036,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq 
{{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] @@ -4071,8 +4066,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, (%r8) ; AVX2-SLOW-NEXT: addq $696, %rsp # imm = 0x2B8 ; AVX2-SLOW-NEXT: vzeroupper @@ -4081,243 +4076,246 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-LABEL: load_i16_stride4_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm3[1,2,3],mem[4],ymm3[5,6,7],mem[8],ymm3[9,10,11],mem[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm3[1,2,3],mem[4],ymm3[5,6,7],mem[8],ymm3[9,10,11],mem[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm3[1,2,3],mem[4],ymm3[5,6,7],mem[8],ymm3[9,10,11],mem[12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, 
%ymm0, %xmm2 ; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm3[1,2,3],mem[4],ymm3[5,6,7],mem[8],ymm3[9,10,11],mem[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm3[1,2,3],mem[4],ymm3[5,6,7],mem[8],ymm3[9,10,11],mem[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm3[1,2,3],mem[4],ymm3[5,6,7],mem[8],ymm3[9,10,11],mem[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm2, %xmm6 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = mem[0],ymm3[1,2,3],mem[4],ymm3[5,6,7],mem[8],ymm3[9,10,11],mem[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-NEXT: vpackusdw %xmm9, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm3[1,2,3],mem[4],ymm3[5,6,7],mem[8],ymm3[9,10,11],mem[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX2-FAST-NEXT: vpackusdw %xmm9, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, 
%ymm6 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm9 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm15 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm14 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm10 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm11 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm9 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm15 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, 
%ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm5 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm13 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm5 +; 
AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm14 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: 
vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshufd $231, (%rsp), %xmm7 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm7 = mem[3,1,2,3] @@ -4326,19 +4324,19 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 @@ -4350,36 +4348,36 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm10 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm2[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -4394,7 +4392,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -4515,168 +4514,167 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0 
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; 
AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4691,101 +4689,102 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: 
vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; 
AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] @@ -4794,23 +4793,23 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] @@ -4819,89 +4818,89 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, 
%ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, (%r8) ; AVX2-FAST-PERLANE-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -4909,439 +4908,439 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-SLOW-LABEL: load_i16_stride4_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $104, %rsp +; AVX512F-SLOW-NEXT: subq $136, %rsp ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm20 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm20[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm21[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm27[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufd 
{{.*#+}} xmm11 = xmm3[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 336(%rdi), %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 80(%rdi), %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 208(%rdi), %xmm25 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm19[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm21 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm21[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm24 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 336(%rdi), %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm25[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm24[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm26[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm19[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm26 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm29[0,2,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm22[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 
= xmm14[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm24[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm24 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm25 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm26[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm19[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm21[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm26 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm27[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm22[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = 
xmm12[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm21[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm16[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm17[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm29[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm29 -; AVX512F-SLOW-NEXT: vpmovqw %zmm29, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpmovqw %ymm4, %xmm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm25 -; 
AVX512F-SLOW-NEXT: vpmovqw %zmm25, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512F-SLOW-NEXT: vpmovqw %zmm18, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm29, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm25, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm18, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpmovqw %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm20 +; AVX512F-SLOW-NEXT: vpmovqw %zmm20, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512F-SLOW-NEXT: vpmovqw %ymm13, %xmm13 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm18 +; AVX512F-SLOW-NEXT: vpmovqw %zmm18, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpmovqw %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512F-SLOW-NEXT: vpmovqw %zmm17, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpmovqw %ymm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm13 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm13 +; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm20, %zmm2 +; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm29, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 -; 
AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm25, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm18, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm8[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm18, %zmm3 +; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm17, %zmm2 +; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm13, %zmm3 +; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm20, %zmm2 +; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; 
AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm18, %zmm6 +; AVX512F-SLOW-NEXT: vpmovqw %zmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm11 +; AVX512F-SLOW-NEXT: vpmovqw %zmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm13, %zmm11 +; AVX512F-SLOW-NEXT: vpmovqw %zmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm10[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm29, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm25, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, 
%xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm20, %zmm2 +; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm18, %zmm4 +; AVX512F-SLOW-NEXT: vpmovqw %zmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm17, %zmm4 +; AVX512F-SLOW-NEXT: vpmovqw %zmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm4, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512F-SLOW-NEXT: addq $104, %rsp +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, 
%xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm13, %zmm4 +; AVX512F-SLOW-NEXT: vpmovqw %zmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512F-SLOW-NEXT: addq $136, %rsp ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride4_vf64: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %zmm26 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm24 -; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm10 +; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm9, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> ; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm1, %ymm11 +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm9, %ymm11 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] ; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm7, %ymm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm27 -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm28 -; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm12 -; AVX512F-FAST-NEXT: vpermt2d %ymm9, %ymm7, %ymm12 -; AVX512F-FAST-NEXT: vpmovqw %zmm30, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm26 +; AVX512F-FAST-NEXT: vpermd %ymm26, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm9, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm13 +; AVX512F-FAST-NEXT: vpermt2d %ymm12, %ymm7, %ymm13 +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm12[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-FAST-NEXT: 
vmovdqa64 480(%rdi), %ymm16 -; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm17 -; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm1, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm13 -; AVX512F-FAST-NEXT: vpermt2d %ymm9, %ymm7, %ymm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm26, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm9, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm14 +; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm7, %ymm14 +; AVX512F-FAST-NEXT: vpmovqw %zmm28, %xmm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 ; AVX512F-FAST-NEXT: vmovdqa64 352(%rdi), %ymm18 -; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm1, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm14 +; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm9, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm15 ; AVX512F-FAST-NEXT: vmovdqa64 320(%rdi), %ymm20 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm15 -; AVX512F-FAST-NEXT: vpermt2d %ymm14, %ymm7, %ymm15 -; AVX512F-FAST-NEXT: vpmovqw %zmm23, %xmm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm9, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm9 +; AVX512F-FAST-NEXT: vpermt2d %ymm15, %ymm7, %ymm9 +; AVX512F-FAST-NEXT: vpmovqw %zmm23, %xmm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm9[0,1,2,3],zmm13[4,5,6,7] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm14 -; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm13 +; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm30, %zmm8 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm2, %zmm8 ; AVX512F-FAST-NEXT: vpmovqw %zmm8, %xmm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm11[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm3 ; AVX512F-FAST-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm28, %zmm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm23, %zmm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm15, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm0 -; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm15, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm15, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm14 -; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm7, %ymm14 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm30, %zmm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm15, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm11 -; AVX512F-FAST-NEXT: vpermt2d %ymm1, %ymm7, %ymm11 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm26, %zmm1 -; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm15, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm15, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm23, %zmm2 -; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm8 +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpermt2d %ymm8, %ymm7, %ymm12 +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm8 +; AVX512F-FAST-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vpermd %ymm26, %ymm1, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm14 +; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm1, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, 
%ymm15 +; AVX512F-FAST-NEXT: vpermt2d %ymm14, %ymm7, %ymm15 +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm2, %zmm14 +; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm14[0,1,2,3],zmm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm1, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm15 +; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm1, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm13 +; AVX512F-FAST-NEXT: vpermt2d %ymm15, %ymm7, %ymm13 +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm28, %zmm15 +; AVX512F-FAST-NEXT: vpmovqw %zmm15, %xmm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm1, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 +; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vpermt2d %ymm5, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm23, %zmm5 +; AVX512F-FAST-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm13[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm30, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm26, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm23, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm28, %zmm3 +; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm23, %zmm3 +; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 64(%rsi) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 64(%rdx) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rcx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 64(%rcx) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r8) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 8cfd55b3044f56..1b468a0a28f5f2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -450,62 +450,62 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 ; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,3] ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] +; SSE-NEXT: andps %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,1,0,1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3] +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,1,1,3] -; SSE-NEXT: psllq $48, %xmm6 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,1,3] +; SSE-NEXT: psllq $48, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0,1,3] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: pand %xmm13, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] @@ -522,29 +522,29 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: por %xmm13, %xmm7 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = 
xmm5[0,1],xmm8[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: por %xmm15, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[3,0] ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm7[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,0] ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] ; SSE-NEXT: movdqa %xmm2, (%rsi) ; SSE-NEXT: movdqa %xmm1, (%rdx) -; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps %xmm7, (%rcx) ; SSE-NEXT: movaps %xmm11, (%r8) ; SSE-NEXT: movaps %xmm3, (%r9) ; SSE-NEXT: retq @@ -613,42 +613,42 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-LABEL: load_i16_stride5_vf8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] ; AVX2-SLOW-NEXT: vpbroadcastw 70(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vpsllq $48, %xmm4, %xmm5 ; AVX2-SLOW-NEXT: 
vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,0] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rdx) ; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%r8) @@ -745,42 +745,42 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-LABEL: load_i16_stride5_vf8: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; 
AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] ; AVX512F-SLOW-NEXT: vpbroadcastw 70(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] ; AVX512F-SLOW-NEXT: vpsllq $48, %xmm3, %xmm5 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,2,0] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 
= xmm3[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%r8) @@ -869,12 +869,13 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 144(%rdi), %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa 96(%rdi), %xmm7 -; SSE-NEXT: movdqa 128(%rdi), %xmm15 +; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa 112(%rdi), %xmm12 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 ; SSE-NEXT: movdqa (%rdi), %xmm11 ; SSE-NEXT: movdqa 16(%rdi), %xmm9 ; SSE-NEXT: movdqa 32(%rdi), %xmm13 @@ -883,24 +884,28 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,3] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm3, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: por %xmm10, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, %xmm15 ; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] @@ -909,9 +914,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: andps %xmm6, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: andps %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] +; 
SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -923,78 +929,73 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: psllq $48, %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm2, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psllq $48, %xmm6 +; SSE-NEXT: movaps %xmm3, %xmm12 +; SSE-NEXT: andnps %xmm6, %xmm12 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm12 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] -; 
SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: pand %xmm14, %xmm15 ; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,0] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,0] @@ -1002,7 +1003,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] @@ -1010,80 +1011,78 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm13[3,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm13, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; SSE-NEXT: pshuflw 
{{.*#+}} xmm4 = xmm13[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm4[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm13 -; SSE-NEXT: por %xmm14, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm13, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm5[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm14, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm11[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm12[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm12, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[2,0] ; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm7, 
%xmm3 +; SSE-NEXT: por %xmm5, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movdqa %xmm6, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movaps %xmm12, (%rdx) ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps %xmm15, (%rcx) ; SSE-NEXT: movaps %xmm13, 16(%r8) ; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps %xmm3, 16(%r9) +; SSE-NEXT: movaps %xmm14, 16(%r9) ; SSE-NEXT: movaps %xmm8, (%r9) ; SSE-NEXT: retq ; @@ -1311,13 +1310,13 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] @@ -1328,44 +1327,44 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6> ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 
= ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,6,0,1,3,6,0] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3,4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4> ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] ; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] ; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] @@ -1577,61 +1576,61 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] -; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,3,5,0,0,3,5,0] +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm5, %ymm5 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,u,u,4,7,1,6> -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} 
ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512F-FAST-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] ; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm9 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4> ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; 
AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] @@ -1643,12 +1642,12 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,5,7,0,2,5,7] ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] ; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm5, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa %ymm5, (%rdx) ; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%r8) ; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%r9) @@ -1704,53 +1703,51 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $408, %rsp # imm = 0x198 +; SSE-NEXT: subq $376, %rsp # imm = 0x178 ; SSE-NEXT: movdqa 64(%rdi), %xmm4 ; SSE-NEXT: movdqa 
%xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 176(%rdi), %xmm12 +; SSE-NEXT: movdqa 176(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm3 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm15, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] ; SSE-NEXT: movdqa %xmm10, %xmm6 @@ -1773,14 +1770,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] +; 
SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] @@ -1796,17 +1792,17 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm14 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] @@ -1818,17 +1814,17 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: andps %xmm15, %xmm2 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1836,22 +1832,24 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; 
SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 ; SSE-NEXT: movaps %xmm15, %xmm3 ; SSE-NEXT: andnps %xmm2, %xmm3 ; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1859,23 +1857,22 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -1885,24 +1882,20 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 
-; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1912,287 +1905,287 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: pandn %xmm1, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] -; SSE-NEXT: 
pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0,1,3] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: andnps %xmm14, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[2,0],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[2,0] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[3,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm6, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm4, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[3,0] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: andnps %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} 
xmm13 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm10, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm14[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm8[0,2] -; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,2] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm7, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: 
# xmm1 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm3[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm12[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,0] -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = 
mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0] -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: movdqa %xmm15, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 
(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rdx) +; SSE-NEXT: movaps %xmm7, 16(%rcx) +; SSE-NEXT: movaps %xmm11, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rcx) ; SSE-NEXT: movaps %xmm10, 16(%r8) ; SSE-NEXT: movaps %xmm9, 48(%r8) -; SSE-NEXT: movaps %xmm13, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r8) -; SSE-NEXT: movaps %xmm11, 16(%r9) -; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%r8) +; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps %xmm1, 48(%r9) ; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: movaps %xmm3, 32(%r9) -; SSE-NEXT: addq $408, %rsp # imm = 0x198 +; SSE-NEXT: movaps %xmm14, 32(%r9) +; SSE-NEXT: addq $376, %rsp # imm = 0x178 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf32: @@ -2204,16 +2197,16 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] @@ -2221,45 +2214,44 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm3[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] @@ -2274,152 +2266,153 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm6, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm2, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm10[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm9 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm15[2,3],xmm2[4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: 
vpblendw {{.*#+}} xmm9 = xmm7[0,1],xmm13[2,3],xmm7[4,5],xmm13[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpsllq $48, %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm9 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm9, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm5[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm5[4,5],xmm6[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm14[2,3],xmm11[4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm8 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm14, %xmm3 +; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm15[4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,1,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,1,2,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vmovdqa %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm5[4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm13[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm7[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm13[4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7] @@ -2430,50 +2423,50 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm9[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm7[3],xmm3[4,5,6,7] +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm8[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 @@ -2496,14 +2489,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) @@ -2513,13 +2506,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) +; 
AVX1-ONLY-NEXT: vmovaps %ymm11, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 @@ -2528,416 +2521,411 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-SLOW-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6],ymm9[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3],xmm10[4,5],xmm11[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4],ymm3[5],ymm12[6,7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12],ymm3[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1,2,3],xmm10[4,5],xmm12[6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1,2,3],xmm10[4,5],xmm13[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm13, %ymm10, %ymm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm13 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm13[1],xmm14[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm11[1],xmm10[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm13[2],xmm14[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm13[2],xmm14[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; 
AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7],ymm15[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm11[2],xmm10[3] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13],ymm3[14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0],xmm14[1],xmm13[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = 
[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, 
%ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm7[2],mem[3],ymm7[4],mem[5,6],ymm7[7],mem[8,9],ymm7[10],mem[11],ymm7[12],mem[13,14],ymm7[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0],xmm10[1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4],ymm5[5],ymm15[6,7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12],ymm5[13],ymm15[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5],ymm8[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm10[2],xmm11[3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # 
ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm14[2],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm0[1,2],ymm7[3],ymm0[4],ymm7[5],ymm0[6,7],ymm7[8],ymm0[9,10],ymm7[11],ymm0[12],ymm7[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm10[2],xmm11[3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm6[2],ymm3[3],ymm6[4],ymm3[5,6],ymm6[7],ymm3[8,9],ymm6[10],ymm3[11],ymm6[12],ymm3[13,14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 
-; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $107, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: 
vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-NEXT: vpshufb 
%xmm12, %xmm9, %xmm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1,2,3],xmm8[4,5],xmm14[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm10, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm8, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = 
xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm10, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm14, %ymm8, %ymm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3],xmm8[4,5,6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,3,1,3,0,3,5,7] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm13 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0,1,2,3,4],ymm13[5,6,7],ymm9[8,9,10,11,12],ymm13[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2,3,4],ymm12[5,6,7],ymm10[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = 
[1,3,2,3,1,3,6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,2,3,1,3,6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7],ymm14[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm13[2],ymm5[3],ymm13[4],ymm5[5,6],ymm13[7],ymm5[8,9],ymm13[10],ymm5[11],ymm13[12],ymm5[13,14],ymm13[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2,3,4],ymm12[5,6,7],ymm0[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4],xmm12[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm14 
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-NEXT: vpshufb 
%xmm6, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1],ymm8[2],ymm15[3],ymm8[4],ymm15[5,6],ymm8[7],ymm15[8,9],ymm8[10],ymm15[11],ymm8[12],ymm15[13,14],ymm8[15] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) @@ -2951,186 +2939,173 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm15, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), 
%ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: subq $136, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5],ymm1[6],ymm10[7,8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13],ymm1[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1,2,3],xmm9[4,5],xmm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm13[5],ymm9[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1,2,3],xmm9[4,5],xmm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm11 ; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5],ymm8[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm10[1],ymm1[2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7,8],ymm10[9],ymm1[10],ymm10[11],ymm1[12,13],ymm10[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm13, %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm14, %ymm15, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm6[2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3],ymm13[4],ymm7[5,6],ymm13[7],ymm7[8,9],ymm13[10],ymm7[11],ymm13[12],ymm7[13,14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm15[1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7],ymm11[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm9[0,1],xmm8[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5,6,7],ymm13[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm0[2],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7],ymm12[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm13[2],ymm1[3],ymm13[4],ymm1[5,6],ymm13[7],ymm1[8,9],ymm13[10],ymm1[11],ymm13[12],ymm1[13,14],ymm13[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4],xmm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm8[0],xmm9[1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4],xmm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm1[1,2],ymm13[3],ymm1[4],ymm13[5],ymm1[6,7],ymm13[8],ymm1[9,10],ymm13[11],ymm1[12],ymm13[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm5 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4],ymm13[5],ymm7[6,7],ymm13[8],ymm7[9,10],ymm13[11],ymm7[12],ymm13[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm6[0,1],xmm15[2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm8[0,1],xmm9[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],xmm14[2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] @@ -3139,43 +3114,41 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte 
Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: addq $136, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3188,9 +3161,9 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] @@ -3198,20 +3171,20 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] @@ -3220,33 +3193,33 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm11 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0],xmm12[1],xmm11[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: 
vinserti128 $1, %xmm14, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm16 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm16 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm9, %xmm14 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,3,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm13[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3],xmm15[4,5,6],xmm14[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = 
ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3] @@ -3254,7 +3227,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm14 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm18 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] @@ -3269,7 +3242,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm8[0,1],ymm6[2],ymm8[3],ymm6[4],ymm8[5,6],ymm6[7],ymm8[8,9],ymm6[10],ymm8[11],ymm6[12],ymm8[13,14],ymm6[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] @@ -3283,7 +3256,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm14 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm17 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2],ymm4[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] @@ -3295,7 +3268,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4],ymm6[5],ymm8[6,7],ymm6[8],ymm8[9,10],ymm6[11],ymm8[12],ymm6[13],ymm8[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] @@ -3314,34 +3287,34 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm14[1,2,3,4,5,6,7],ymm4[8],ymm14[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm10[2],xmm9[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm10[2],xmm9[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm5[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] ; AVX512F-SLOW-NEXT: movb $7, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] @@ -3349,7 +3322,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx) @@ -3613,257 +3586,259 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8 +; SSE-NEXT: subq $1000, %rsp # imm = 0x3E8 ; SSE-NEXT: movdqa 464(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm8 -; SSE-NEXT: movdqa 416(%rdi), %xmm11 -; SSE-NEXT: movdqa 448(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: movdqa 400(%rdi), %xmm11 +; SSE-NEXT: movdqa 416(%rdi), %xmm2 +; SSE-NEXT: movdqa 448(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm15 +; SSE-NEXT: movdqa 432(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa 96(%rdi), %xmm10 -; SSE-NEXT: movdqa 128(%rdi), %xmm14 -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm15 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm9, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 368(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 336(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 320(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; SSE-NEXT: andps %xmm9, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,1,2,3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 +; SSE-NEXT: andps %xmm9, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; 
SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa 336(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 320(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 +; SSE-NEXT: andps %xmm9, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa 256(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 608(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 +; SSE-NEXT: andps %xmm9, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 608(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa 576(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 560(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 560(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,2,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] ; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 208(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 +; SSE-NEXT: andps %xmm9, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 160(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: andps %xmm13, %xmm5 -; SSE-NEXT: orps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: andnps %xmm3, %xmm1 +; SSE-NEXT: andps %xmm9, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 528(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 512(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa 512(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa 496(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 480(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 480(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] ; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: andps %xmm9, %xmm4 ; SSE-NEXT: orps %xmm4, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} 
xmm3 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: psllq $48, %xmm4 -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: andnps %xmm4, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: andnps %xmm4, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -3874,21 +3849,22 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pand %xmm9, 
%xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -3899,21 +3875,20 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -3924,47 +3899,47 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -3975,256 +3950,256 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: 
movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: psrlq $48, %xmm3 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,1,3] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: punpckhdq (%rsp), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,1,3] +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: 
shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[2,3] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,1,3] ; SSE-NEXT: pshufhw 
{{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[2,3] +; SSE-NEXT: shufps 
{{.*#+}} xmm3 = xmm3[0,0,1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[3,0] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm13, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm1[3,0] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: 
pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] @@ -4234,242 +4209,251 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm14, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm12, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded 
Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm2, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm11, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $232, (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm12, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[3,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: andnps %xmm10, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[3,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: andnps %xmm8, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[0,2] +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, %xmm14 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm9[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[0,2] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm7[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} 
xmm8 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm8 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshufhw $232, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: andnps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: andnps %xmm15, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0] @@ -4491,7 +4475,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,2,2,3] ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -4499,7 +4483,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,2,2,3] @@ -4536,13 +4520,14 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] -; SSE-NEXT: orps %xmm9, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,1,1,3] +; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4559,8 +4544,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movdqa %xmm9, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4592,7 +4576,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm10, 112(%r8) -; SSE-NEXT: movaps %xmm14, 96(%r8) +; SSE-NEXT: movaps %xmm12, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4605,22 +4589,22 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps %xmm6, 112(%r9) +; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps %xmm2, 96(%r9) ; SSE-NEXT: movaps %xmm3, 80(%r9) ; SSE-NEXT: movaps %xmm4, 64(%r9) ; SSE-NEXT: movaps %xmm5, 48(%r9) -; SSE-NEXT: movaps %xmm0, 
32(%r9) +; SSE-NEXT: movaps %xmm6, 32(%r9) ; SSE-NEXT: movaps %xmm7, 16(%r9) ; SSE-NEXT: movaps %xmm8, (%r9) -; SSE-NEXT: addq $1016, %rsp # imm = 0x3F8 +; SSE-NEXT: addq $1000, %rsp # imm = 0x3E8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] @@ -4632,7 +4616,6 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] @@ -4646,12 +4629,12 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] @@ -4668,9 +4651,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] @@ -4678,39 +4661,39 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4720,11 +4703,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 @@ -4751,11 +4734,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4781,19 +4764,20 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm5, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] @@ -4803,99 +4787,100 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm2[0,1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpblendw $48, (%rsp), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpsllq $48, %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm14[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1],xmm8[2,3],xmm10[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpsllq $48, %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm13[2,3],xmm9[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = 
xmm8[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1,2,3],xmm13[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm12 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = 
mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -4904,9 +4889,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -4915,111 +4900,111 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm15[0,1,2,3],mem[4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = 
xmm12[2],mem[2],xmm12[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm9[0,1,2,3],mem[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm7[2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0] -; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5031,106 +5016,105 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm0[2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm14[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1,2,3],xmm10[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm5, %ymm11, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm6[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm7[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5144,115 +5128,116 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd 
$238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] +; 
AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,1,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm7[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm12[0,1,2],mem[3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufd $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -5277,226 +5262,230 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm12, (%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm11, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf64: ; AVX2-SLOW: 
# %bb.0: -; AVX2-SLOW-NEXT: subq $1048, %rsp # imm = 0x418 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-SLOW-NEXT: subq $1080, %rsp # imm = 0x438 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, 
%ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm13[1,2],ymm10[3],ymm13[4],ymm10[5],ymm13[6,7],ymm10[8],ymm13[9,10],ymm10[11],ymm13[12],ymm10[13],ymm13[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-SLOW-NEXT: 
vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5],ymm0[6],ymm6[7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13],ymm0[14],ymm6[15] ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: 
vpblendw $173, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm12[0],mem[1],ymm12[2],mem[3],ymm12[4,5],mem[6],ymm12[7,8],mem[9],ymm12[10],mem[11],ymm12[12,13],mem[14],ymm12[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6],xmm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm8, %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13],ymm9[14],ymm12[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0],xmm9[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm10[1],mem[2],ymm10[3],mem[4,5],ymm10[6],mem[7,8],ymm10[9],mem[10],ymm10[11],mem[12,13],ymm10[14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3],xmm2[4,5,6],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5],ymm11[6],ymm15[7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13],ymm11[14],ymm15[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6],xmm11[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm10, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8],ymm0[9],ymm9[10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6],xmm11[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm9[0],xmm3[1],xmm9[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 624(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm5[1],xmm12[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm10 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 624(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm4[0],xmm15[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm5[0],xmm7[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 
%xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm8 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm13 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7],ymm4[8,9,10,11,12],ymm13[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5,6,7],ymm5[8,9,10,11,12],ymm13[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm5[0],xmm7[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm3[2],xmm9[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm11 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm9[2],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm15[2],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm15[2],xmm12[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm7[2],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; 
AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = @@ -5504,64 +5493,66 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0],xmm9[1],xmm11[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm12[1],mem[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; 
AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm12[1],ymm7[2,3],ymm12[4],ymm7[5],ymm12[6],ymm7[7,8],ymm12[9],ymm7[10,11],ymm12[12],ymm7[13],ymm12[14],ymm7[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7],ymm9[8,9],mem[10],ymm9[11],mem[12],ymm9[13,14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3,4],xmm4[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm10[1],mem[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw 
{{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm10[2],ymm5[3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8,9],ymm10[10],ymm5[11],ymm10[12],ymm5[13,14],ymm10[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3,4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4],xmm3[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -5572,12 +5563,12 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw $181, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] +; AVX2-SLOW-NEXT: vpblendw $181, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm8[1],mem[2],ymm8[3],mem[4,5],ymm8[6],mem[7,8],ymm8[9],mem[10],ymm8[11],mem[12,13],ymm8[14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = 
mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15] +; AVX2-SLOW-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm13[0],mem[1,2],ymm13[3],mem[4],ymm13[5],mem[6,7],ymm13[8],mem[9,10],ymm13[11],mem[12],ymm13[13],mem[14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = @@ -5585,614 +5576,612 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm15[2],mem[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm8[0,1],mem[2],xmm8[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, 
%xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm11[2],mem[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm9[0,1],xmm11[2],xmm9[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm11[1],ymm15[2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7,8],ymm11[9],ymm15[10],ymm11[11],ymm15[12,13],ymm11[14],ymm15[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4],ymm10[5],ymm5[6,7],ymm10[8],ymm5[9,10],ymm10[11],ymm5[12],ymm10[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm14[2],xmm8[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm10[0,1],mem[2],xmm10[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm12[1,2],ymm14[3],ymm12[4],ymm14[5],ymm12[6,7],ymm14[8],ymm12[9,10],ymm14[11],ymm12[12],ymm14[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm12[2],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm13[0,1],mem[2],xmm13[3] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $107, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm7[1],mem[2,3],ymm7[4],mem[5],ymm7[6],mem[7,8],ymm7[9],mem[10,11],ymm7[12],mem[13],ymm7[14],mem[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm11[2],mem[3],ymm11[4],mem[5,6],ymm11[7],mem[8,9],ymm11[10],mem[11],ymm11[12],mem[13,14],ymm11[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5],ymm5[6],mem[7,8],ymm5[9],mem[10,11],ymm5[12],mem[13],ymm5[14],mem[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15] +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm15[2],mem[3],ymm15[4],mem[5,6],ymm15[7],mem[8,9],ymm15[10],mem[11],ymm15[12],mem[13,14],ymm15[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 
96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: addq $1048, %rsp # imm = 0x418 +; AVX2-SLOW-NEXT: addq $1080, %rsp # imm = 0x438 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5],ymm6[6],ymm0[7,8],ymm6[9],ymm0[10,11],ymm6[12],ymm0[13],ymm6[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4],ymm3[5],ymm5[6,7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12],ymm3[13],ymm5[14,15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5],ymm4[6],ymm9[7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13],ymm4[14],ymm9[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13],ymm13[14],ymm12[15] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4],ymm8[5],ymm15[6,7],ymm8[8],ymm15[9,10],ymm8[11],ymm15[12],ymm8[13],ymm15[14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} 
xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, 
%ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm6[0],mem[1],ymm6[2],mem[3],ymm6[4,5],mem[6],ymm6[7,8],mem[9],ymm6[10],mem[11],ymm6[12,13],mem[14],ymm6[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = 
[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5],mem[6],ymm15[7,8],mem[9],ymm15[10,11],mem[12],ymm15[13],mem[14],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5,6],xmm11[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10],ymm12[11],ymm3[12,13],ymm12[14],ymm3[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5],ymm0[6],ymm5[7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13],ymm0[14],ymm5[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,1,3,0,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1,2,3,4],ymm11[5,6,7],ymm3[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1,2,3,4],ymm11[5,6,7],ymm3[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm10[5,6,7],ymm3[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,2,3,1,3,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10,11],ymm11[12],ymm8[13],ymm11[14],ymm8[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7],ymm9[8,9],mem[10],ymm9[11],mem[12],ymm9[13,14],mem[15] +; AVX2-FAST-NEXT: 
vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5],mem[6],ymm15[7,8],mem[9],ymm15[10,11],mem[12],ymm15[13],mem[14],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11],ymm10[12],ymm6[13],ymm10[14],ymm6[15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 
= xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5],ymm14[6],ymm2[7,8],ymm14[9],ymm2[10,11],ymm14[12],ymm2[13],ymm14[14],ymm2[15] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4],ymm3[5],ymm13[6,7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12],ymm3[13],ymm13[14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7,8],ymm8[9],ymm11[10],ymm8[11],ymm11[12,13],ymm8[14],ymm11[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] 
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm9[1,2],mem[3],ymm9[4],mem[5],ymm9[6,7],mem[8],ymm9[9,10],mem[11],ymm9[12],mem[13],ymm9[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm15[1],ymm8[2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7,8],ymm15[9],ymm8[10],ymm15[11],ymm8[12,13],ymm15[14],ymm8[15] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1,2],ymm9[3],ymm12[4],ymm9[5],ymm12[6,7],ymm9[8],ymm12[9,10],ymm9[11],ymm12[12],ymm9[13],ymm12[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2],xmm0[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10],ymm6[11],ymm10[12,13],ymm6[14],ymm10[15] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermd %ymm0, 
%ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,u,u,6,0,3,5> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, 
%ymm11, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = 
[16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] +; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,2,1,3,0,2,5,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm8[0,1],mem[2],ymm8[3],mem[4],ymm8[5,6],mem[7],ymm8[8,9],mem[10],ymm8[11],mem[12],ymm8[13,14],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15] +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5],ymm9[6],mem[7,8],ymm9[9],mem[10,11],ymm9[12],mem[13],ymm9[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1],mem[2],ymm8[3],mem[4],ymm8[5,6],mem[7],ymm8[8,9],mem[10],ymm8[11],mem[12],ymm8[13,14],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FAST-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -6200,14 +6189,14 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $1080, %rsp # imm = 0x438 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm4 @@ -6229,56 +6218,55 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10,11],ymm5[12],ymm12[13],ymm5[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm2[1,2],ymm6[3],ymm2[4],ymm6[5],ymm2[6,7],ymm6[8],ymm2[9,10],ymm6[11],ymm2[12],ymm6[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm5 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13],ymm5[14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm2 -; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -6289,280 +6277,276 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm12[1],mem[2],ymm12[3],mem[4,5],ymm12[6],mem[7,8],ymm12[9],mem[10],ymm12[11],mem[12,13],ymm12[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5,6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm0, %ymm1 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm5[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), 
%ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5,6,7],ymm6[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm7[1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5,6,7],ymm6[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm6[0],xmm11[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm12[2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7],ymm15[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm9[0,1],xmm1[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm5[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm7[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = 
mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10,11],ymm10[12],ymm2[13],ymm10[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = 
ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm9[2],ymm15[3],ymm9[4],ymm15[5,6],ymm9[7],ymm15[8,9],ymm9[10],ymm15[11],ymm9[12],ymm15[13,14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0],xmm12[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm8[2],ymm12[3],ymm8[4],ymm12[5,6],ymm8[7],ymm12[8,9],ymm8[10],ymm12[11],ymm8[12],ymm12[13,14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = 
mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $13, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7,8],ymm7[9],ymm13[10],ymm7[11],ymm13[12,13],ymm7[14],ymm13[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = 
mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm1[0,1],mem[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,1],xmm13[2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm4[0,1],mem[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4,5],ymm2[6],ymm10[7,8],ymm2[9],ymm10[10],ymm2[11],ymm10[12,13],ymm2[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm15[1,2],ymm9[3],ymm15[4],ymm9[5],ymm15[6,7],ymm9[8],ymm15[9,10],ymm9[11],ymm15[12],ymm9[13],ymm15[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm2[0,1],mem[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm14[0],mem[1],ymm14[2],mem[3],ymm14[4,5],mem[6],ymm14[7,8],mem[9],ymm14[10],mem[11],ymm14[12,13],mem[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = 
xmm6[0],xmm5[1],xmm6[2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm14[0,1],mem[2],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm7[1],ymm11[2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7,8],ymm7[9],ymm11[10],ymm7[11],ymm11[12,13],ymm7[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm5[2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = 
xmm2[0,1],xmm4[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6571,209 +6555,206 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5],mem[6],ymm9[7,8],mem[9],ymm9[10,11],mem[12],ymm9[13],mem[14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0,1],ymm7[2],ymm11[3],ymm7[4],ymm11[5,6],ymm7[7],ymm11[8,9],ymm7[10],ymm11[11],ymm7[12],ymm11[13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3,4],xmm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, 
%ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) -; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: addq $1080, %rsp # imm = 0x438 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512F-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm13 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4],ymm3[5],ymm10[6,7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12],ymm3[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm15 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = 
<0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm2, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2],ymm2[3],ymm6[4],ymm2[5],ymm6[6,7],ymm2[8],ymm6[9,10],ymm2[11],ymm6[12],ymm2[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm18 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13],ymm13[14],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm30 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm15, %ymm29 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm2, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm28 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm24 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm27 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6 +; AVX512F-SLOW-NEXT: 
vmovdqa 128(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm7[2],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] @@ -6782,10 +6763,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5],ymm11[6],ymm4[7,8],ymm11[9],ymm4[10,11],ymm11[12],ymm4[13],ymm11[14],ymm4[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 @@ -6798,758 +6778,760 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; 
AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm13[1],xmm15[2,3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm27[3,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm28 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm28[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm12 ; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm25, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm4[2],ymm11[3],ymm4[4],ymm11[5,6],ymm4[7],ymm11[8,9],ymm4[10],ymm11[11],ymm4[12],ymm11[13,14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512F-SLOW-NEXT: vmovdqa %ymm11, %ymm4 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3,4],xmm0[5,6,7] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1,2],ymm7[3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm2[1,2],ymm9[3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm24[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm26[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm27[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX512F-SLOW-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm8[1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm13[2],xmm10[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10,11],ymm11[12],ymm6[13],ymm11[14],ymm6[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2],ymm8[3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm28[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm27, %xmm13 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm13[2],xmm15[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm18 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm20, %xmm13 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm6[0],ymm3[1,2],ymm6[3],ymm3[4],ymm6[5],ymm3[6,7],ymm6[8],ymm3[9,10],ymm6[11],ymm3[12],ymm6[13],ymm3[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm9[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm6[1,2],ymm12[3],ymm6[4],ymm12[5],ymm6[6,7],ymm12[8],ymm6[9,10],ymm12[11],ymm6[12],ymm12[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm10[1],xmm14[2],xmm10[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm25, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2],ymm9[3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm24[0,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm26 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm27, %xmm10 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm22 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm16 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm25, %zmm0 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm26[0,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm20 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm7[2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5],ymm5[6],ymm11[7,8],ymm5[9],ymm11[10,11],ymm5[12],ymm11[13],ymm5[14],ymm11[15] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5],ymm0[6],ymm8[7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13],ymm0[14],ymm8[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3,4],xmm2[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, 
%ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm8[1],xmm12[2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %ymm21 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3],ymm4[4],ymm0[5,6],ymm4[7],ymm0[8,9],ymm4[10],ymm0[11],ymm4[12],ymm0[13,14],ymm4[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm11[0],xmm15[1],xmm11[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm27[2],xmm15[3],xmm27[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX512F-SLOW-NEXT: 
vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm18[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm20[2],xmm10[3],xmm20[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10,11],ymm12[12],ymm6[13],ymm12[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm12 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm31 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm20[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $173, (%rsp), %ymm3, %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6],ymm12[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm6[1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm26[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm27[2],xmm9[3],xmm27[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6],ymm9[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm13[3,4],xmm9[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm24 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm14 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7,8],ymm14[9],ymm1[10,11],ymm14[12],ymm1[13],ymm14[14],ymm1[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3,4],xmm9[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm11[1],ymm5[2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7,8],ymm11[9],ymm5[10],ymm11[11],ymm5[12,13],ymm11[14],ymm5[15] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm25 +; 
AVX512F-SLOW-NEXT: vmovdqa %ymm8, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2],xmm2[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm1[2],ymm4[3],ymm1[4],ymm4[5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11],ymm1[12],ymm4[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm15[2],ymm5[3],ymm15[4],ymm5[5,6],ymm15[7],ymm5[8,9],ymm15[10],ymm5[11],ymm15[12],ymm5[13,14],ymm15[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm15 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm1[1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm5[2],xmm7[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2],xmm11[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4,5],xmm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4,5],xmm4[6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm21 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 
{{.*#+}} xmm2 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm15[1,2],ymm6[3],ymm15[4],ymm6[5],ymm15[6,7],ymm6[8],ymm15[9,10],ymm6[11],ymm15[12],ymm6[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm13[2],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 ; AVX512F-SLOW-NEXT: movb $7, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = 
mem[0],ymm6[1,2],mem[3],ymm6[4],mem[5],ymm6[6,7],mem[8],ymm6[9,10],mem[11],ymm6[12],mem[13],ymm6[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4],ymm8[5],ymm4[6,7],ymm8[8],ymm4[9,10],ymm8[11],ymm4[12],ymm8[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2],xmm10[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $148, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm7 +; AVX512F-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm6 = mem[0],ymm7[1],mem[2,3],ymm7[4],mem[5],ymm7[6],mem[7,8],ymm7[9],mem[10,11],ymm7[12],mem[13],ymm7[14],mem[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-SLOW-NEXT: 
vmovdqa64 %ymm24, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2],ymm14[3],ymm2[4,5],ymm14[6],ymm2[7,8],ymm14[9],ymm2[10],ymm14[11],ymm2[12,13],ymm14[14],ymm2[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6],xmm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r8) +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-SLOW-NEXT: addq $584, %rsp # imm = 0x248 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512F-FAST-NEXT: subq $552, %rsp # imm = 0x228 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = 
[4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX512F-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm11 -; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm10 ; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm7 ; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm8 ; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm27 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 -; 
AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,4,7,1,4,6,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] -; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm29 -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm18, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm15 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,0,2,4,6,1,3] +; 
AVX512F-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpor %ymm6, %ymm4, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1,2],xmm11[3,4],xmm4[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10,11],ymm14[12],ymm10[13],ymm14[14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm28 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,4,7,1,4,6,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm17, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [8,9,3,2,4,5,7,6] +; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm18, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,3,1,3,0,3,5,7] +; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm25 +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm19, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa %ymm7, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm30, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm12 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13],ymm5[14],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4,5],xmm5[6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm17, %ymm3 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm18, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm14 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm17, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm2[2],ymm8[3],ymm2[4],ymm8[5,6],ymm2[7],ymm8[8,9],ymm2[10],ymm8[11],ymm2[12],ymm8[13,14],ymm2[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm9[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %ymm21, %ymm18, %ymm0 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512F-FAST-NEXT: vpermd %ymm22, %ymm19, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm30, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm30 -; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 -; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm6[0],ymm10[1],ymm6[2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm21 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <2,u,u,u,4,7,1,6> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm22 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,u,u,u,4,7,1,6> +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm31 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm16, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm23, %xmm18 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm20 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm18[0],xmm10[1],xmm18[1] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,2,5,7,4,7,u,u> +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm14[1],ymm4[2,3],ymm14[4],ymm4[5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10,11],ymm14[12],ymm4[13],ymm14[14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm18 +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm19, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm13[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,4,6,3,1,4,6,3] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0],ymm8[1,2],ymm2[3],ymm8[4],ymm2[5],ymm8[6,7],ymm2[8],ymm8[9,10],ymm2[11],ymm8[12],ymm2[13],ymm8[14,15] +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm21, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vporq %ymm1, %ymm0, %ymm18 -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm31, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13] -; AVX512F-FAST-NEXT: 
vpshufb %xmm4, %xmm15, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm11 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <0,2,5,7,4,7,u,u> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm20, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] -; AVX512F-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm25, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm26, %zmm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm13, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm14[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [1,3,2,3,1,3,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm22, %ymm24, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vpblendw $181, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm14 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm14[1],ymm2[2],ymm14[3],ymm2[4,5],ymm14[6],ymm2[7,8],ymm14[9],ymm2[10],ymm14[11],ymm2[12,13],ymm14[14],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 ; 
AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6],xmm10[7] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm28 -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm21, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm20, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm27, %xmm4 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] +; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm16, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm5 +; AVX512F-FAST-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm19, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm16, %xmm5 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4],ymm6[5],ymm8[6,7],ymm6[8],ymm8[9,10],ymm6[11],ymm8[12],ymm6[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm25, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm6[1,2],ymm2[3],ymm6[4],ymm2[5],ymm6[6,7],ymm2[8],ymm6[9,10],ymm2[11],ymm6[12],ymm2[13],ymm6[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm24, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,3,5,2,5,7,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <0,2,u,u,5,7,2,4> -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm25, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] 
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm23[2],xmm1[3],xmm23[3] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <0,3,5,2,5,7,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm15[1],ymm8[2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7,8],ymm15[9],ymm8[10],ymm15[11],ymm8[12,13],ymm15[14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm18 +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm26, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,2,u,u,5,7,2,4> +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm12 +; AVX512F-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15] +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm24, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm5, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm21, %zmm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3,4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,4,6,0,1,4,6,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm22, %ymm13, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm10 +; 
AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm30, %zmm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm24, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm14 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm15 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13],ymm2[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm26, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm9 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm16[2],xmm4[3],xmm16[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm4[2],ymm7[3],ymm4[4],ymm7[5,6],ymm4[7],ymm7[8,9],ymm4[10],ymm7[11],ymm4[12],ymm7[13,14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm22 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5],ymm4[6],ymm13[7,8],ymm4[9],ymm13[10,11],ymm4[12],ymm13[13],ymm4[14],ymm13[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm25, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm1, 
%ymm3, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm5, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm21, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm14[2],ymm2[3],ymm14[4],ymm2[5,6],ymm14[7],ymm2[8,9],ymm14[10],ymm2[11],ymm14[12],ymm2[13,14],ymm14[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm26 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5],ymm7[6],ymm11[7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13],ymm7[14],ymm11[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm28 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm27 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm13, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm30, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,6,0,5,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7],ymm12[8,9],ymm10[10],ymm12[11],ymm10[12],ymm12[13,14],ymm10[15] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm11[1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <1,3,6,0,5,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm14[2],ymm8[3],ymm14[4],ymm8[5,6],ymm14[7],ymm8[8,9],ymm14[10],ymm8[11],ymm14[12],ymm8[13,14],ymm14[15] +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm20, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5,6,7] +; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm31 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm17 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm8[2],ymm6[3],ymm8[4],ymm6[5,6],ymm8[7],ymm6[8,9],ymm8[10],ymm6[11],ymm8[12],ymm6[13,14],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm20 -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm22, %ymm4, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0],xmm10[1],xmm9[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm18 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm29 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm15[2],ymm6[3],ymm15[4],ymm6[5,6],ymm15[7],ymm6[8,9],ymm15[10],ymm6[11],ymm15[12],ymm6[13,14],ymm15[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm30 +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm20, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm9[2],xmm7[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <1,4,6,3,6,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm26, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm4[0],ymm15[1,2],ymm4[3],ymm15[4],ymm4[5],ymm15[6,7],ymm4[8],ymm15[9,10],ymm4[11],ymm15[12],ymm4[13],ymm15[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,3,u,u,5,0,2,7> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm25 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm11 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm11[1,2],ymm9[3],ymm11[4],ymm9[5],ymm11[6,7],ymm9[8],ymm11[9,10],ymm9[11],ymm11[12],ymm9[13],ymm11[14,15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4],ymm14[5],ymm13[6,7],ymm14[8],ymm13[9,10],ymm14[11],ymm13[12],ymm14[13],ymm13[14,15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,6,3,6,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,2,1,3,0,2,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm22, %ymm24, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,3,u,u,5,0,2,7> +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm16 +; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm14[1,2,3],xmm6[4,5],xmm14[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm6[1,2,3,4,5,6,7],ymm15[8],ymm6[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm2[1,2],ymm15[3],ymm2[4],ymm15[5],ymm2[6,7],ymm15[8],ymm2[9,10],ymm15[11],ymm2[12],ymm15[13],ymm2[14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm8 -; AVX512F-FAST-NEXT: 
vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15] -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm27, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm26, %ymm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm9, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm2[2],xmm6[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm24, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm9 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm4, %ymm3 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm12[2],ymm10[3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8,9],ymm12[10],ymm10[11],ymm12[12],ymm10[13,14],ymm12[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8,9],ymm5[10],ymm3[11],ymm5[12],ymm3[13,14],ymm5[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5> ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5],ymm6[6],ymm15[7,8],ymm6[9],ymm15[10,11],ymm6[12],ymm15[13],ymm6[14],ymm15[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] @@ -7559,8 +7541,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm6 = mem[0],ymm14[1],mem[2],ymm14[3],mem[4,5],ymm14[6],mem[7,8],ymm14[9],mem[10],ymm14[11],mem[12,13],ymm14[14],mem[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] @@ -7569,19 +7551,17 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3],ymm15[4],ymm3[5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10,11],ymm15[12],ymm3[13],ymm15[14],ymm3[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm6[2],ymm13[3],ymm6[4],ymm13[5,6],ymm6[7],ymm13[8,9],ymm6[10],ymm13[11],ymm6[12],ymm13[13,14],ymm6[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm11[2],ymm13[3],ymm11[4],ymm13[5,6],ymm11[7],ymm13[8,9],ymm11[10],ymm13[11],ymm11[12],ymm13[13,14],ymm11[15] ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 @@ -7603,105 +7583,105 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, (%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512F-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-FAST-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride5_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = 
<0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm8, %zmm7 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm13, %zmm7 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm13 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm14 -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm15 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm8 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm13, %zmm15 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm12 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] -; AVX512BW-NEXT: # zmm16 = 
mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm13, %zmm17 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm16 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm16 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm19 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm16 -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u> -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm13, %zmm19 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm18 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm18 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u> +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm10, %zmm1 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] -; AVX512BW-NEXT: vpermt2w 
%zmm7, %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 3decaa366af785..280231eb2e07f4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -248,57 +248,57 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i16_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: psrld $16, %xmm7 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,3] +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm0, 
%xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm9 ; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm4, (%rsi) +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movq %xmm5, (%rsi) ; SSE-NEXT: movq %xmm3, (%rdx) ; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm2, (%r8) +; SSE-NEXT: movq %xmm1, (%r8) ; SSE-NEXT: movq %xmm6, (%r9) ; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq @@ -543,118 +543,118 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i16_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm8 -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 ; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq 
{{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[3,0] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3] -; SSE-NEXT: pslld $16, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm13[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,0,3] -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3] +; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm13[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,0] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pshuflw 
{{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,1,0,3] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm5[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm14 -; SSE-NEXT: por %xmm8, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm12, %xmm8 -; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm12, %xmm9 +; SSE-NEXT: por %xmm14, %xmm9 ; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: psrlq $48, %xmm12 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm12[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm12[0] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm10, %xmm9 -; SSE-NEXT: por %xmm11, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm11[0],xmm10[1,2,3] -; SSE-NEXT: andps %xmm6, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 
+; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm4[1] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm10[0],xmm6[1,2,3] +; SSE-NEXT: andps %xmm8, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: por %xmm6, %xmm12 ; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: psrld $16, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] -; SSE-NEXT: andps %xmm6, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm8, (%rcx) -; SSE-NEXT: movdqa %xmm9, (%r8) +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3] +; SSE-NEXT: andps %xmm8, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movdqa %xmm9, (%rcx) +; SSE-NEXT: movdqa %xmm11, (%r8) ; SSE-NEXT: movdqa %xmm12, (%r9) -; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: movdqa %xmm8, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf8: @@ -1081,519 +1081,530 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $136, %rsp -; SSE-NEXT: movdqa 112(%rdi), %xmm9 -; SSE-NEXT: movdqa 128(%rdi), %xmm7 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: subq $104, %rsp +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa 64(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm9 ; SSE-NEXT: movdqa 32(%rdi), 
%xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm11 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa 160(%rdi), %xmm14 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm3[2,3] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm7[3,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[2,3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,0,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa 176(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[2,3] +; SSE-NEXT: pslld $16, %xmm12 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm9[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm12[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pandn %xmm15, %xmm13 -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; 
SSE-NEXT: psrld $16, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm13[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; 
SSE-NEXT: pandn %xmm10, %xmm5 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm10[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: andps %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: psrlq $48, %xmm15 +; SSE-NEXT: andps %xmm14, %xmm11 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: psrlq $48, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm4, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps %xmm9, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm7, 16(%r8) -; SSE-NEXT: movdqa %xmm11, (%r8) -; SSE-NEXT: movdqa %xmm4, 16(%r9) +; SSE-NEXT: movdqa %xmm3, 16(%r8) +; SSE-NEXT: movdqa %xmm10, (%r8) +; SSE-NEXT: movdqa %xmm11, 16(%r9) ; SSE-NEXT: movdqa %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm12, 16(%rax) -; SSE-NEXT: movdqa %xmm10, (%rax) -; SSE-NEXT: addq $136, %rsp +; SSE-NEXT: movdqa %xmm14, 16(%rax) +; SSE-NEXT: movdqa %xmm5, (%rax) +; SSE-NEXT: addq $104, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $88, %rsp +; AVX1-ONLY-NEXT: subq $152, %rsp ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm7, %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpslld $16, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpslld 
$16, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm14, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm13[0],xmm7[1],xmm13[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1,2,3],xmm13[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: 
vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm9[4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm1[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm13[0],xmm1[0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3],xmm4[4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm9[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm9[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, 
%ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm9[0,1],mem[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] ; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] 
-; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $88, %rsp +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: addq $152, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 
= ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm5[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm1[0,1],ymm5[0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3,4,5,6,7],ymm1[8,9,10],ymm10[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] @@ -1601,11 +1612,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1615,7 +1626,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-SLOW-NEXT: vinserti128 $1, 
%xmm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1642,73 +1653,73 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = 
xmm6[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rdx) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-FAST-NEXT: 
vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] @@ -1716,11 +1727,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1729,7 +1740,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1753,70 
+1764,70 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rsi) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] @@ -1824,11 +1835,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1837,7 +1848,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1861,40 +1872,40 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2119,56 +2130,56 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i16_stride6_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm3, 
%zmm2, %zmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 +; AVX512BW-NEXT: vpermi2w %ymm4, %ymm3, %ymm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 +; AVX512BW-NEXT: vpermi2w %ymm4, %ymm3, %ymm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 +; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] -; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512BW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm9[5,6,7] +; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) ; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <96 x i16>, ptr %in.vec, align 64 @@ -2191,38 +2202,36 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i16_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), 
%xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm14 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] @@ -2230,557 +2239,553 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm8[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: movdqa 368(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 336(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 208(%rdi), %xmm5 -; SSE-NEXT: movdqa 
%xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa 256(%rdi), %xmm8 +; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[3,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: movdqa 272(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm7[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm7 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa 240(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa 176(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm9[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm8 +; SSE-NEXT: movdqa 176(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm9 -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm13[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,0] +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[2,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm11 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE-NEXT: movdqa 144(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm8[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 +; 
SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: psrld $16, %xmm8 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm8[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm8[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm8 ; SSE-NEXT: psrld $16, %xmm8 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = 
mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[2,0] +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm14[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm14[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $132, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 
= [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte 
Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm6[0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm11, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[2,2,3,3] +; 
SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = 
xmm4[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: andps %xmm8, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm14[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm14, %xmm4 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: andps %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: psrlq $48, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm12 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; 
SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte 
Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm8, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm12, %xmm7 +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm12, %xmm9 +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm13[0],xmm1[1,2,3] +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm10, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm15, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm11, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movdqa %xmm3, 16(%r9) ; SSE-NEXT: movdqa %xmm4, 32(%r9) ; SSE-NEXT: movdqa %xmm5, 48(%r9) -; SSE-NEXT: movdqa %xmm11, (%r9) +; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm15, 16(%rax) +; SSE-NEXT: movdqa %xmm8, 16(%rax) ; SSE-NEXT: movdqa %xmm9, 32(%rax) -; SSE-NEXT: movdqa %xmm1, 48(%rax) -; SSE-NEXT: movdqa %xmm2, (%rax) +; SSE-NEXT: movdqa %xmm7, 48(%rax) +; SSE-NEXT: movdqa %xmm0, (%rax) ; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $552, %rsp # imm = 0x228 +; AVX1-ONLY-NEXT: subq $568, %rsp # imm = 0x238 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,0,3] @@ -2791,15 +2796,15 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 
= xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 @@ -2811,7 +2816,6 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 @@ -2823,392 +2827,393 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = 
xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,5,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm9 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = 
xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm10[0,1],mem[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm12[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm13[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm8[4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), 
%xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm3[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm11 +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm12[0,1,2,3],mem[4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm3[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm4[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm15[3,4],xmm2[5,6,7] 
-; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm11[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm2[3,4],xmm9[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm9[0],xmm4[0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm12 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] 
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm10[1] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm2, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, (%rsp), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm8, %ymm10, %ymm11 +; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm13[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm6[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm14[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; 
AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -3225,967 +3230,955 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX1-ONLY-NEXT: addq $568, %rsp # imm = 0x238 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[2,3],ymm2[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm3[0,1],ymm2[0,1] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = 
[255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm9 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm8 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3],xmm13[4],xmm4[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm11[1],ymm14[2,3,4,5],ymm11[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, 
%ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,2,0,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3],xmm11[4,5],xmm9[6],xmm11[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2],ymm9[3,4,5,6,7],ymm8[8,9,10],ymm9[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm14[1],ymm11[2,3,4,5],ymm14[6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3],xmm14[4,5],xmm12[6],xmm14[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm15 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3],xmm15[4,5],xmm10[6],xmm15[7] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm13[2],xmm6[3],xmm13[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2],ymm10[3,4,5,6,7],ymm6[8,9,10],ymm10[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm10[3],xmm6[4,5],xmm10[6],xmm6[7] +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5,6],mem[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm9 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2],xmm0[3],xmm12[4,5],xmm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[0],ymm11[1],mem[2,3,4,5],ymm11[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; 
AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3],xmm6[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm13, %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4,5],xmm3[6],xmm7[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} 
xmm15 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] -; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3],xmm8[4],xmm7[5,6],xmm8[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm7[2],mem[3],ymm7[4],mem[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1,2],xmm12[3],xmm11[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[3,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, 
%xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3],ymm7[4],mem[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; 
AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX2-FAST-NEXT: subq $520, %rsp # imm = 0x208 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm4[2],ymm14[3,4],ymm4[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; 
AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3],xmm13[4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm0 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm12[1],ymm7[2,3],ymm12[4],ymm7[5,6],ymm12[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm11 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm6[1],ymm10[2,3,4,5],ymm6[6],ymm10[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3],xmm15[4,5],xmm13[6],xmm15[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm15 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 
= xmm13[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5,6],ymm14[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3],xmm13[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0],ymm15[1],mem[2,3,4,5],ymm15[6],mem[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = 
[10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3],xmm9[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; 
AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm7 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm7 +; 
AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1,2],xmm9[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm10[4],xmm6[5,6],xmm10[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm15[2],mem[3],ymm15[4],mem[5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 
= xmm13[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1,2],xmm4[3],xmm8[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm8[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3],xmm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] 
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd $31, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FAST-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX2-FAST-PERLANE-NEXT: subq $520, %rsp # imm = 0x208 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm4[2],ymm14[3,4],ymm4[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; 
AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3],xmm13[4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm0 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm12[1],ymm7[2,3],ymm12[4],ymm7[5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: 
vextracti128 $1, %ymm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm6[1],ymm10[2,3,4,5],ymm6[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3],xmm15[4,5],xmm13[6],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm9, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded 
Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3],xmm13[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0],ymm15[1],mem[2,3,4,5],ymm15[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3],xmm9[4,5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd 
{{.*#+}} xmm14 = xmm14[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1,2],xmm9[3],xmm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm10 -; 
AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm10[4],xmm6[5,6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm15[2],mem[3],ymm15[4],mem[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1,2],xmm4[3],xmm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm8[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm7, %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, 
%ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $136, %rsp -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: pushq %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5],xmm9[6],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm7, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm8, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm6[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm13[1],ymm9[2,3,4,5],ymm13[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%ymm9, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm12[1],ymm9[2,3,4,5],ymm12[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm28 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm8[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm11[1],ymm13[2,3,4,5],ymm11[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%ymm9, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3],xmm7[4,5],xmm10[6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm10[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7],ymm10[8,9,10],ymm7[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm31 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] @@ -4193,53 +4186,51 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 +; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm1[2],ymm15[3],ymm1[4],ymm15[5,6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5,6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm13[2],ymm14[3],ymm13[4],ymm14[5,6],ymm13[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm2 @@ -4249,15 +4240,15 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 
= xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5,6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm19 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] @@ -4274,925 +4265,921 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm9[4],xmm6[5],xmm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4,5],ymm15[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm11, %ymm10 ; AVX512F-ONLY-SLOW-NEXT: movw $31, %ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm14[4],xmm10[5],xmm14[6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1,2,3],xmm8[4],xmm15[5],xmm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm15[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm3, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm3, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm18, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm17, 
%zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $136, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-SLOW-NEXT: popq %rax ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $136, %rsp +; AVX512F-ONLY-FAST-NEXT: subq $72, %rsp ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm9 ; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm6[2],xmm8[3],xmm6[4,5],xmm8[6,7] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3,4,5],ymm0[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3,4,5],ymm0[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3],xmm12[4,5],xmm8[6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm9[1],ymm13[2,3,4,5],ymm9[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2],xmm3[3],xmm11[4,5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm14[2],xmm8[3],xmm14[4,5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3],xmm11[4,5],xmm2[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm2, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm26 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm30 +; 
AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm16[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, 
%ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm1[2],ymm15[3],ymm1[4],ymm15[5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm14[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm16[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5,6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm9[2],ymm13[3],ymm9[4],ymm13[5,6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; 
AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = 
xmm10[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm23, %ymm13, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm17, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1,2],xmm8[3],xmm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm6 +; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4],xmm11[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4,5],ymm15[6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm12[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2,3],xmm14[4],xmm3[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm22, %ymm13, %ymm3 ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm15[4],xmm9[5],xmm15[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5],ymm9[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7] +; 
AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm7, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm22, %ymm11, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm0 -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm20, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm21, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm29, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $136, %rsp +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $72, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: pushq %rax -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm9 +; 
AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm16 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5],xmm11[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vperm2i128 
{{.*#+}} ymm0 = ymm12[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm12, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm8 ; AVX512DQ-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm14[2],xmm6[3],xmm14[4,5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3],xmm9[4,5],xmm15[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm9, %zmm9 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; 
AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm9, %zmm17, %zmm6 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm20 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm30 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, 
%ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; 
AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm10[2],ymm13[3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5,6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm12[2],ymm7[3],ymm12[4],ymm7[5,6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm7, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm17, %zmm18 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 -; 
AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2],xmm8[3],xmm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,4,5] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, 
%zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,0,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm8[4],xmm4[5],xmm8[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: 
vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm9, %ymm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm12, %ymm7 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm14[4],xmm9[5],xmm14[6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm7, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3],ymm12[4],ymm7[5,6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3],xmm9[4],xmm14[5],xmm9[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4,5],ymm11[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4],ymm9[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] -; 
AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm1, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, (%rdx) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm9 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm9 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%r8) ; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: popq %rax ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6],ymm12[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6],ymm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm6[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm1[1],ymm13[2,3,4,5],ymm1[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3],xmm8[4,5],xmm0[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm10[1],ymm0[2,3,4,5],ymm10[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3],xmm11[4,5],xmm10[6],xmm11[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],mem[2,3] +; 
AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm0[1],ymm10[2,3,4,5],ymm0[6],ymm10[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm26 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm11 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm8 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3],xmm15[4,5],xmm6[6],xmm15[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3],xmm9[4,5],xmm15[6],xmm9[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; 
AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm17, %zmm13 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm17, %zmm11 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm30 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm25 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: 
vpblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3],xmm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm9, %xmm16 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm16[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm10[4],xmm5[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm10[2],ymm0[3],ymm10[4],ymm0[5,6],ymm10[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5,6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 
-; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm17, %zmm18 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3],xmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4],xmm2[5,6],xmm7[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,4,5] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm16 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3],xmm12[4],xmm6[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5],xmm7[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm13[1],xmm3[2,3],xmm13[4],xmm3[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm12, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm14, %ymm3 ; AVX512DQ-FAST-NEXT: movw $31, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 
-; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6],ymm12[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5],ymm11[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3],ymm12[4],ymm3[5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3],xmm6[4],xmm8[5],xmm6[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2,3,4,5],ymm8[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3,4],ymm6[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm0 {%k1} 
-; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm9, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm7, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rdx) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm6 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -5200,88 +5187,88 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i16_stride6_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-NEXT: 
vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm9 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = 
[0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} ; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm10 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: movw $31, %di ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = 
[37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm3 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm0 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <192 x i16>, ptr %in.vec, align 64 @@ -5303,39 +5290,37 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movdqa 496(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: subq $1144, %rsp # imm = 0x478 +; SSE-NEXT: movdqa 496(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 512(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = 
[65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] @@ -5343,13 +5328,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 480(%rdi), %xmm0 @@ -5357,33 +5342,32 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa 544(%rdi), %xmm3 -; SSE-NEXT: movdqa 560(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa 560(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = 
xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 528(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa 528(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5393,20 +5377,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5444,8 +5426,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa 448(%rdi), %xmm3 ; SSE-NEXT: movdqa 464(%rdi), 
%xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5483,8 +5465,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa 352(%rdi), %xmm3 ; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5521,27 +5503,27 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa 736(%rdi), %xmm3 -; SSE-NEXT: movdqa 752(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa 752(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 720(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa 720(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 
= xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5559,82 +5541,81 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 256(%rdi), %xmm4 +; SSE-NEXT: movdqa 256(%rdi), %xmm3 ; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 608(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: movdqa 592(%rdi), %xmm13 ; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 640(%rdi), %xmm5 -; SSE-NEXT: movdqa 656(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] +; SSE-NEXT: movdqa 640(%rdi), %xmm14 +; SSE-NEXT: movdqa 656(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm7[2,3] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm7 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa 624(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[1,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa 624(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm10[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 @@ -5647,25 +5628,24 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm0 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 @@ -5678,24 +5658,24 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 @@ -5708,160 +5688,104 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm0 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[2,0] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm5, 
%xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm15[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE-NEXT: 
movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm15[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5870,26 +5794,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: por %xmm6, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5898,9 +5822,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: pand %xmm13, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ 
-5908,471 +5832,528 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm6, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm12, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm9 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pshufhw $231, 
(%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm6, %xmm12 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: 
movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm10[0] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm12 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm11 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm12, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm11, %xmm7 +; SSE-NEXT: pshufhw $231, (%rsp), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd 
{{.*#+}} xmm11 = xmm11[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm7 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = 
xmm7[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm14[0] +; SSE-NEXT: 
movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm9[1] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; 
SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: andps %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm15[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[1],mem[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[1],mem[1] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = 
mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm8[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: andps %xmm13, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm12[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm12 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 
# 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm11 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[1],mem[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[1],mem[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: andps %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} 
xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm4 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -6386,9 +6367,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: andps %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 @@ -6405,9 +6386,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm0 @@ -6421,13 +6402,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm1 
+; SSE-NEXT: andps %xmm13, %xmm1 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6444,8 +6425,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm9, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6493,8 +6473,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movdqa %xmm7, 112(%r9) -; SSE-NEXT: movdqa %xmm8, 96(%r9) -; SSE-NEXT: movdqa %xmm15, 80(%r9) +; SSE-NEXT: movdqa %xmm10, 96(%r9) +; SSE-NEXT: movdqa %xmm11, 80(%r9) ; SSE-NEXT: movdqa %xmm12, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) @@ -6505,25 +6485,24 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm14, 112(%rax) +; SSE-NEXT: movdqa %xmm13, 112(%rax) ; SSE-NEXT: movdqa %xmm3, 96(%rax) ; SSE-NEXT: movdqa %xmm4, 80(%rax) -; SSE-NEXT: movdqa %xmm10, 64(%rax) -; SSE-NEXT: movdqa %xmm11, 48(%rax) -; SSE-NEXT: movdqa %xmm9, 32(%rax) -; SSE-NEXT: movdqa %xmm5, 16(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: addq $1176, %rsp # imm = 0x498 +; SSE-NEXT: movdqa %xmm15, 64(%rax) +; SSE-NEXT: movdqa %xmm14, 48(%rax) +; SSE-NEXT: movdqa %xmm5, 32(%rax) +; SSE-NEXT: movdqa %xmm6, 16(%rax) +; SSE-NEXT: movdqa %xmm8, (%rax) +; SSE-NEXT: addq $1144, %rsp # imm = 0x478 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1368, %rsp # imm = 0x558 +; AVX1-ONLY-NEXT: subq $1352, %rsp # imm = 0x548 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -6539,11 +6518,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] @@ -6571,14 +6550,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6626,16 +6605,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 @@ -6671,18 +6650,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; 
AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm0 @@ -6706,41 +6685,40 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 720(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -6751,12 +6729,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm15 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] @@ -6770,9 +6749,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6782,7 +6761,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] -; 
AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] @@ -6790,8 +6769,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm13 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm13 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] @@ -6804,10 +6783,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6815,7 +6793,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $250, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] @@ -6825,56 +6803,56 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm9 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm9 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6901,9 +6879,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -6916,83 +6894,84 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, (%rsp), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm3[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = 
xmm5[0,1],mem[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm11[4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm4[4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -7001,37 +6980,35 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpblendw $207, (%rsp), %xmm6, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3],xmm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7049,8 +7026,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 @@ -7060,62 +7037,59 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vandnps 
%ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm0 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3,4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; 
AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7131,8 +7105,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 @@ -7142,9 +7116,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7153,9 +7127,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm1[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 @@ -7168,20 +7142,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm9[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm2[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -7204,9 +7179,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[1],mem[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 @@ -7231,7 +7207,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 @@ -7243,7 +7219,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} 
xmm10 = xmm10[1],xmm8[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] @@ -7266,8 +7242,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload @@ -7275,8 +7251,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] @@ -7312,9 +7288,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm8 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm9[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 @@ -7333,7 +7309,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -7375,7 +7351,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufhw $212, (%rsp), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3,4,5,6,7] @@ -7394,13 +7370,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -7464,17 +7440,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: addq $1368, %rsp # imm = 0x558 +; AVX1-ONLY-NEXT: addq $1352, %rsp # imm = 0x548 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-SLOW-NEXT: subq $1240, %rsp # imm = 0x4D8 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm8 @@ -7483,102 +7459,101 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm7[2,3],ymm6[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = 
ymm5[2,3],ymm4[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[0,1],ymm4[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm7[1],ymm12[2,3,4,5],ymm7[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm15[1],ymm11[2,3,4,5],ymm15[6],ymm11[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm13, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm11 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = 
xmm6[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0],xmm14[1],xmm11[2,3],xmm14[4],xmm11[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3,4,5],ymm11[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3],xmm14[4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb 
%ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 @@ -7592,13 +7567,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7617,19 +7591,19 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = 
mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7646,7 +7620,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 @@ -7656,8 +7630,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -7687,8 +7661,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm5 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,2,0,3] @@ -7782,24 +7756,24 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm7[0,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -7813,11 +7787,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7826,15 +7800,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] @@ -7846,70 +7820,70 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1,2],xmm10[3],xmm5[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm9, %ymm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7],ymm10[8,9,10],ymm4[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm10[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,1,0,3] +; AVX2-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm4 = xmm15[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm14[4],xmm4[5,6],xmm14[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1,2],xmm11[3],xmm9[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm14, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm10, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw 
{{.*#+}} xmm7 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4],xmm9[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1,2],xmm8[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] @@ -7919,174 +7893,172 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] ; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpshuflw $103, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = 
mem[3,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 
$1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; 
AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm15 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: 
# ymm8 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5],xmm9[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3],xmm3[4],xmm10[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 96(%r8) -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm13, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-SLOW-NEXT: addq $1272, %rsp # imm = 0x4F8 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-SLOW-NEXT: addq $1240, %rsp # imm = 0x4D8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-NEXT: subq $1240, %rsp # imm = 0x4D8 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm8 @@ -8095,30 +8067,32 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-NEXT: vperm2i128 
{{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm7[1],ymm12[2,3,4,5],ymm7[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8126,68 +8100,66 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm13 -; 
AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3],xmm14[4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm4 -; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8202,13 +8174,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8227,8 +8198,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8236,11 +8207,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; 
AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8256,7 +8227,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 @@ -8266,8 +8237,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -8295,8 +8266,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm6 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3] @@ -8383,300 +8354,300 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte 
Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7],ymm7[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = 
<6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm12[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshuflw $103, (%rsp), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,3,2,1] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; 
AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4],xmm13[5],xmm1[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb 
%xmm6, %xmm15, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm15, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps 
%ymm7, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, (%rax) -; AVX2-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-FAST-NEXT: addq $1240, %rsp # imm = 0x4D8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-PERLANE-NEXT: subq $1240, %rsp # imm = 0x4D8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm8 @@ -8685,30 +8656,32 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm7[1],ymm12[2,3,4,5],ymm7[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8716,68 +8689,66 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm12, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3],xmm14[4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm9, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, 
%ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8792,13 +8763,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8817,8 +8787,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8826,11 +8796,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8846,7 +8816,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1 @@ -8856,8 +8826,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -8885,8 +8855,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3] @@ -8973,319 +8943,319 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = 
<0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm13 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm8, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm13, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7],ymm7[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, (%rsp), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded 
Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4],xmm13[5],xmm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm15, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1240, %rsp # imm = 0x4D8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: subq $1416, %rsp # imm = 0x588 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm1, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -9296,9 +9266,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] @@ -9308,8 +9278,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9317,12 +9287,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, 
%xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -9333,8 +9303,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -9347,18 +9317,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm13, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9366,159 +9336,159 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 
{{.*#+}} ymm1 = ymm12[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm12, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm14, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm12, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: 
vinserti32x4 $2, %xmm0, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21 
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 @@ -9526,33 +9496,33 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm26 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512F-ONLY-SLOW-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4] @@ -9564,8 +9534,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] @@ -9579,91 +9549,92 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm4[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = 
mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1,2,3],xmm2[4],xmm15[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0],xmm15[1,2],xmm0[3],xmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, 
%xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5,6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7],ymm13[8,9,10],ymm12[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm28, %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 +; 
AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[1,1,1,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[1,1,1,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -9672,254 +9643,255 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm12, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm30 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb 
%xmm3, %xmm10, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: movw $31, %ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = 
mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1],xmm0[2,3],xmm7[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm26[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm18[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm1 {%k1} +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3],xmm7[4],xmm13[5],xmm7[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, 
%xmm7, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4],xmm11[5],xmm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, 
%zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm5, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 64(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX512F-ONLY-SLOW-NEXT: addq $1416, %rsp # imm = 0x588 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: subq $1544, %rsp # imm = 0x608 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512F-ONLY-FAST-NEXT: 
vextracti128 $1, %ymm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9928,44 +9900,45 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -9974,188 +9947,189 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3],xmm15[4,5],xmm0[6],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: 
vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3],xmm13[4,5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm13, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm4[1],ymm0[2,3,4,5],ymm4[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm15[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5],xmm9[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = 
[10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3],xmm9[4,5],xmm1[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0],ymm0[1],ymm9[2,3,4,5],ymm0[6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,6] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2],xmm1[3],xmm6[4,5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3],xmm5[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, 
%ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1],ymm7[2],mem[3],ymm7[4],mem[5,6],ymm7[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] @@ -10164,352 +10138,351 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm22[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm15[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm11, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa 
{{.*#+}} xmm13 = ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm1[1,2],xmm13[3],xmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm24 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2],xmm13[3],xmm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5,6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm24, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded 
Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2],xmm8[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2],xmm5[3],xmm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,7,5,6,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm22, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm25, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %xmm3, %xmm26 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: 
vpshufb %ymm0, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm5 +; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2,3],xmm5[4],xmm15[5],xmm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4],xmm1[5],xmm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm15[4],xmm10[5],xmm15[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm10[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm7[1],xmm13[2,3],xmm7[4],xmm13[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm7, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3],xmm1[4],xmm9[5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4],xmm13[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, 
%xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm15, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3],xmm0[4],xmm12[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3],xmm10[4],xmm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: 
vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded 
Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm30, %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -10534,11 +10507,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -10550,9 +10523,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm21 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] @@ -10572,7 +10544,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm2 @@ -10584,7 +10556,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm26 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] @@ -10608,60 +10580,60 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; 
AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm12[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm12, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] @@ -10670,46 +10642,46 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: 
vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm5 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm3 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,5,5,5,5] @@ -10722,20 +10694,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] @@ -10746,63 +10718,62 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm3[2],ymm0[3],ymm3[4],ymm0[5,6],ymm3[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 
32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,6,4] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm5 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm21, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = 
ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -10817,8 +10788,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] @@ -10827,8 +10798,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -10840,60 +10811,60 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = 
mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm20, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm28 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm26 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1,2,3],xmm2[4],xmm15[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1,2],xmm2[3],xmm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512DQ-SLOW-NEXT: vmovdqa64 
%ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3],xmm2[4],xmm12[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm13 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm27 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm19 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] @@ -10906,10 +10877,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm5 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] @@ -10918,105 +10889,105 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm22 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm18 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm10 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm2, %ymm1 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: 
vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm31 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm1, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte 
Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm25 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm24 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm24 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] @@ -11026,65 +10997,64 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm12 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm12[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0],xmm14[1],xmm3[2,3],xmm14[4],xmm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm30[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm14 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm10 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm9[4],xmm15[5],xmm9[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm9, %zmm9 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm17[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] +; 
AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm15 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm15, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2,3],xmm10[4],xmm15[5],xmm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm12[1],xmm3[2,3],xmm12[4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm10, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm29[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3],xmm14[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm10, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm7, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb 
%xmm3, %xmm6, %xmm3 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7] @@ -11099,18 +11069,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm24 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm21 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm27, %zmm23 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm27, %zmm12 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 64(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) @@ -11128,11 +11097,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 @@ -11142,231 +11111,233 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: 
vpshufb %xmm10, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm3[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4,5],ymm5[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 -; 
AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3],xmm14[4,5],xmm1[6],xmm14[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2],xmm0[3],xmm11[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm11, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm4[1],ymm6[2,3,4,5],ymm4[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm29 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm13[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3],xmm2[4,5],xmm10[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3],xmm12[4,5],xmm10[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0],ymm0[1],ymm12[2,3,4,5],ymm0[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm12 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = 
[10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3],xmm6[4,5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm16, %zmm4 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm10, %zmm0, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3],xmm4[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm5[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm16, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm4 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 
%xmm1, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm22 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm24 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -11374,26 +11345,24 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm25 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512DQ-FAST-NEXT: vpshufb 
%xmm15, %xmm7, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,6,4] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm27 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm21, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 @@ -11402,288 +11371,291 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $36, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,6,4] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm2[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm18[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm15[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm29 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm21 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm18 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm1[1,2],xmm13[3],xmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} 
xmm15 = xmm15[0],xmm9[1,2],xmm15[3],xmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6],xmm14[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2],xmm13[3],xmm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5,6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; 
AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,4,5] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1,2],xmm9[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm6 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm9 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm21, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: movw $31, %ax +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm28 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm29 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm12 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: movw $31, %ax -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd 
{{.*#+}} xmm6 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm30 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm21, %ymm2, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm17 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FAST-NEXT: 
vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm24 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm24 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm1[4],xmm11[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3],xmm9[4],xmm13[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm21, %ymm13, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3],xmm13[4],xmm14[5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1,2,3],xmm8[4],xmm15[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm9, %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm12, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2,3],xmm4[4],xmm14[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm26 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 
%xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2,3],xmm4[4],xmm11[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3],xmm0[4],xmm12[5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm21, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm4[4],xmm10[5],xmm4[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 @@ -11697,19 +11669,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-FAST-NEXT: 
vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-FAST-NEXT: addq $904, %rsp # imm = 0x388 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -11717,143 +11690,143 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i16_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm14, %zmm15 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; 
AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm11, %zmm10 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm10 {%k1} ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k2} ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm14 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm11 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm16, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k2} ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm16 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm14 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm15 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm18, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = <34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u> +; 
AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm16 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm16 {%k2} ; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm18 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm20 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm16 -; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm18, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm19 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm20 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 -; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm23 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm18 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm18, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 -; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 ; AVX512BW-NEXT: movw $31, %di ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm21, %zmm24 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm23 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm21 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm19 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm18, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm1 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm1 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm7, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rdx) -; 
AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 0771fcea0714cd..66193ec8555c7d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -271,77 +271,77 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = 
xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: psrlq $16, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] -; SSE-NEXT: pslld $16, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: psrlq $16, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[1,1,1,1] +; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm0, (%rdx) -; SSE-NEXT: movq %xmm7, (%rcx) +; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: movq %xmm6, (%rcx) ; SSE-NEXT: movq %xmm8, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) -; SSE-NEXT: movq %xmm10, (%rdi) -; SSE-NEXT: movq %xmm1, (%rax) +; SSE-NEXT: movq %xmm7, (%r9) +; SSE-NEXT: movq %xmm9, (%rdi) +; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf4: @@ -406,49 +406,49 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vmovq %xmm1, (%rsi) +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovq %xmm5, (%rsi) ; AVX2-SLOW-NEXT: vmovq %xmm6, (%rdx) -; AVX2-SLOW-NEXT: vmovq %xmm3, (%rcx) -; AVX2-SLOW-NEXT: vmovq %xmm4, (%r8) -; AVX2-SLOW-NEXT: vmovq %xmm5, (%r9) +; AVX2-SLOW-NEXT: vmovq %xmm2, (%rcx) +; AVX2-SLOW-NEXT: vmovq %xmm3, (%r8) +; AVX2-SLOW-NEXT: vmovq %xmm4, (%r9) ; AVX2-SLOW-NEXT: vmovq %xmm7, (%r10) ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax) ; AVX2-SLOW-NEXT: vzeroupper @@ -556,52 +556,52 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 
-; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,6,4,6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-SLOW-NEXT: vmovq %xmm0, 
(%rsi) -; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx) -; AVX512F-SLOW-NEXT: vmovq %xmm2, (%rcx) +; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rdx) +; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rcx) ; AVX512F-SLOW-NEXT: vmovq %xmm5, (%r8) ; AVX512F-SLOW-NEXT: vmovq %xmm6, (%r9) ; AVX512F-SLOW-NEXT: vmovq %xmm7, (%r10) -; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rax) +; AVX512F-SLOW-NEXT: vmovq %xmm2, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -609,48 +609,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm8 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd 
{{.*#+}} ymm8 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovq %xmm3, (%rsi) ; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX512F-FAST-NEXT: vmovq %xmm1, (%rcx) +; AVX512F-FAST-NEXT: vmovq %xmm0, (%rcx) ; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8) ; AVX512F-FAST-NEXT: vmovq %xmm6, (%r9) ; AVX512F-FAST-NEXT: vmovq %xmm7, (%r10) -; AVX512F-FAST-NEXT: vmovq %xmm2, (%rax) +; AVX512F-FAST-NEXT: vmovq %xmm1, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -707,23 +707,23 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm9 ; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 -; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa 96(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm7 +; SSE-NEXT: movdqa 96(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm11 ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm8[2,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm6[2,2] ; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movaps %xmm4, %xmm12 ; SSE-NEXT: andnps %xmm5, %xmm12 @@ -734,137 +734,142 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm12, %xmm5 ; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pshufd 
{{.*#+}} xmm14 = xmm6[0,1,0,1] -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] -; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: pandn %xmm15, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm12 -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm15, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; SSE-NEXT: por %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm14, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: por %xmm13, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = 
xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm12[0],xmm10[1,2,3] +; SSE-NEXT: andps %xmm3, %xmm10 +; SSE-NEXT: orps %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm15 ; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm11 -; SSE-NEXT: por %xmm15, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pandn %xmm11, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm15[0],xmm11[1,2,3] -; SSE-NEXT: andps %xmm3, %xmm11 -; SSE-NEXT: orps %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: por %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: psrld $16, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm12[0],xmm15[1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm13[0],xmm1[1,2,3] +; SSE-NEXT: pshufd {{.*#+}} 
xmm13 = xmm0[2,3,2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: andps %xmm3, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm13, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: psrlq $16, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; SSE-NEXT: por %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm15, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: andps %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,4,7] +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: psrlq $16, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; SSE-NEXT: psrlq $48, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd 
{{.*#+}} xmm13 = xmm15[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm10[0,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm11[0,2] ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] ; SSE-NEXT: movdqa %xmm5, (%rsi) ; SSE-NEXT: movdqa %xmm4, (%rdx) -; SSE-NEXT: movaps %xmm11, (%rcx) +; SSE-NEXT: movaps %xmm10, (%rcx) ; SSE-NEXT: movdqa %xmm3, (%r8) -; SSE-NEXT: movapd %xmm13, (%r9) -; SSE-NEXT: movaps %xmm14, (%rdi) -; SSE-NEXT: movapd %xmm1, (%rax) +; SSE-NEXT: movapd %xmm1, (%r9) +; SSE-NEXT: movaps %xmm12, (%rdi) +; SSE-NEXT: movapd %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf8: @@ -1092,23 +1097,23 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1],xmm0[2],xmm1[3] +; 
AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm2[6],xmm10[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13 @@ -1130,8 +1135,8 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsi) ; AVX2-FAST-NEXT: vmovdqa %xmm6, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %xmm9, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %xmm10, (%r8) -; AVX2-FAST-NEXT: vmovdqa %xmm11, (%r9) +; AVX2-FAST-NEXT: vmovdqa %xmm11, (%r8) +; AVX2-FAST-NEXT: vmovdqa %xmm10, (%r9) ; AVX2-FAST-NEXT: vmovdqa %xmm7, (%r10) ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -1171,23 +1176,23 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm2[6],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm13 @@ -1209,8 +1214,8 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, (%r10) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1332,23 +1337,23 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = 
xmm10[0,1,2,3,4],xmm9[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm2[6],xmm8[7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm11 @@ -1370,8 +1375,8 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %xmm8, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %xmm9, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %xmm9, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %xmm8, (%r9) ; AVX512F-FAST-NEXT: vmovdqa %xmm10, (%r10) ; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-FAST-NEXT: vzeroupper @@ -1427,694 +1432,700 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $232, %rsp -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm12 -; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: subq $184, %rsp +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: movdqa 128(%rdi), %xmm9 ; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm13 -; SSE-NEXT: movdqa 176(%rdi), %xmm15 -; SSE-NEXT: movdqa 208(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: movdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: andnps %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: andnps %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,1,0,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: andnps %xmm0, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: andnps %xmm0, %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm2 -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pand %xmm7, %xmm13 +; SSE-NEXT: por %xmm5, %xmm13 +; SSE-NEXT: pand %xmm14, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm12, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm10, 
%xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,1] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1] -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 ; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm11 -; SSE-NEXT: orps %xmm4, %xmm11 -; 
SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: orps %xmm0, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: andnps %xmm5, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: orps %xmm10, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[0,1,0,1] +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm15[0],xmm13[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm13 +; SSE-NEXT: orps %xmm10, %xmm13 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm15[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] 
+; SSE-NEXT: movss {{.*#+}} xmm10 = xmm8[0],xmm10[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm15, %xmm8 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm1[0],xmm10[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; SSE-NEXT: andps %xmm14, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: andnps %xmm1, %xmm15 -; SSE-NEXT: orps %xmm0, %xmm15 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: por %xmm10, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: psrld $16, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: psrlq $16, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: 
movdqa %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: psrlq $16, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: psrld $16, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm11 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm11 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa 
%xmm4, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshuflw $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,1,2,3] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = 
xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movdqa %xmm7, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm13, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movdqa %xmm14, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movapd %xmm1, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r9) +; SSE-NEXT: movapd %xmm11, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm15, (%rax) -; SSE-NEXT: movaps %xmm14, 16(%rax) +; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm11, (%rax) +; SSE-NEXT: movapd %xmm10, (%rax) ; SSE-NEXT: movapd %xmm0, 16(%rax) -; SSE-NEXT: addq $232, %rsp +; SSE-NEXT: addq $184, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $264, %rsp # imm = 0x108 +; AVX1-ONLY-NEXT: subq $232, %rsp ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: 
vpsrld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,6],xmm10[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm2[2],zero 
-; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm13[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm13, %xmm14 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm15, %ymm10 ; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3,4,5],xmm8[6],mem[7] +; AVX1-ONLY-NEXT: vorps %ymm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2,3,4,5],xmm6[6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm13 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm5, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpsrld 
$16, %xmm6, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm1[1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0],xmm4[1],xmm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vorps %ymm9, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: 
vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm8, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 
= xmm1[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm5, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm4, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm5[6],xmm15[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5],xmm3[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2],xmm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm14[0],xmm7[1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm2, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1,2],xmm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 
{{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5],xmm2[6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5],xmm2[6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm11[1],xmm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,2] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} 
xmm10 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1],mem[0],zero -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = zero,xmm0[1],mem[0],zero +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} 
xmm2 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108 +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $232, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2256,17 +2267,17 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] @@ -2277,7 +2288,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm5[1,2,3,4,5,6,7],ymm1[8],ymm5[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2287,7 +2298,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-SLOW-NEXT: vzeroupper @@ -2300,16 +2311,16 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5] ; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] @@ -2319,8 +2330,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] @@ -2331,7 +2342,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] @@ -2339,8 +2350,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] @@ -2350,7 +2361,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: 
vpermd %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] @@ -2361,81 +2372,81 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm7[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3],xmm15[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,3,2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm13, %ymm15, %ymm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1,2,3,4,5,6,7],ymm11[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm15 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] 
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2],xmm15[3],xmm9[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1,2,3,4,5,6,7],ymm5[8],ymm9[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm15 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2],xmm12[3],xmm6[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1,2,3,4,5,6,7],ymm9[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4] ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm6 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,3,7,0,0,3,7,0] ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5,6,7],ymm5[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6,7],ymm6[8,9,10,11,12],ymm14[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,4,7,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1,2,3,4,5,6,7],ymm12[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0],ymm6[1,2,3,4,5,6,7],ymm13[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm13[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,0,3,7,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] @@ -2452,7 +2463,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -2465,59 +2476,58 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = 
<255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3,4,5],xmm9[6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7,8,9,10],ymm12[11],ymm11[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6,7],ymm9[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4],ymm13[5,6,7,8,9,10,11],ymm12[12],ymm13[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] @@ -2525,78 +2535,78 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,1,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm12[1,2,3,4,5,6,7],ymm8[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm12[1,2,3,4,5,6,7],ymm9[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3],xmm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm13, %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm14, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7,8,9,10,11,12,13],ymm15[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1,2,3,4,5,6,7],ymm14[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} xmm4 = xmm4[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1],xmm8[2],xmm15[3],xmm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2,3,4,5,6,7],ymm14[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2],xmm13[3],xmm12[4],xmm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,u,u,u,0,1,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5,6,7],ymm12[8,9,10,11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1,2,3,4,5,6],ymm5[7,8],ymm15[9,10,11,12,13,14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} 
ymm4 = ymm11[0],ymm4[1,2,3,4,5,6,7],ymm11[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0],ymm12[1,2,3,4,5,6,7],ymm5[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,2,3,0,1,14,15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] @@ -2613,11 +2623,11 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -2758,17 +2768,17 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm11[1,2,3,4,5,6,7],ymm4[8],ymm11[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] @@ -2780,7 +2790,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rdx) @@ -2788,7 +2798,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %ymm9, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa %ymm10, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm11, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper @@ -2800,9 +2810,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] ; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15] ; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] @@ -2812,8 +2822,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2851,14 +2861,14 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm14, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] @@ -2866,65 +2876,65 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} 
ymm14 = ymm2[0,1,1,3] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1],xmm8[2],xmm15[3],xmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm15, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14 -; 
AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u> ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,4,8,11,15,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] @@ -2935,11 +2945,11 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%r8) ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -2951,9 +2961,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] ; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa 
{{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] @@ -2963,8 +2973,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] @@ -3002,14 +3012,14 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm14, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] @@ -3017,65 +3027,65 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1],xmm8[2],xmm15[3],xmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm15, %zmm13 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} 
ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm14 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u> ; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,4,8,11,15,u,u,u> ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] ; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; 
AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] @@ -3086,11 +3096,11 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%r8) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -3100,65 +3110,65 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm2, 
%zmm1, %zmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm10 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm10[1,2,3,4,5,6,7],ymm3[8],ymm10[9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) ; 
AVX512BW-NEXT: vmovdqa %ymm8, (%r9) ; AVX512BW-NEXT: vmovdqa %ymm9, (%r10) -; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <112 x i16>, ptr %in.vec, align 64 @@ -3182,82 +3192,80 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $600, %rsp # imm = 0x258 +; SSE-NEXT: subq $568, %rsp # imm = 0x238 ; SSE-NEXT: movdqa 304(%rdi), %xmm5 ; SSE-NEXT: movdqa 288(%rdi), %xmm6 -; SSE-NEXT: movdqa 112(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm11 ; SSE-NEXT: movdqa 128(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm12 -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 176(%rdi), %xmm15 +; SSE-NEXT: movdqa 208(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2] -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm9[2,2] +; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm13, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,1,0,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm10 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps 272(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 224(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 @@ -3265,33 +3273,33 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 416(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps 384(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; 
SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 352(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 @@ -3306,14 +3314,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] -; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3323,536 +3331,530 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = 
xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm11, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 
# 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] ; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: 
pandn %xmm4, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm8 -; SSE-NEXT: orps %xmm1, %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm4, %xmm13 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm5[0],xmm14[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm14 -; SSE-NEXT: orps %xmm4, %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded 
Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: orps %xmm7, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,1,0,1] -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm4 +; SSE-NEXT: orps %xmm3, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm8 +; SSE-NEXT: orps %xmm3, %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,0,1] +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] ; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; 
SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: psrlq $16, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: psrlq $16, %xmm10 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd 
{{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: psrld $16, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, 
%xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: psrld $16, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movdqa 
%xmm9, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] 
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -3873,43 +3875,46 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
pt ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm12 = 
xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] @@ -3918,7 +3923,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3931,8 +3936,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movdqa %xmm13, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3947,7 +3951,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movdqa %xmm12, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3965,8 +3969,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm6, (%rax) ; SSE-NEXT: movaps %xmm7, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movaps %xmm8, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -3974,157 +3977,156 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm3, 48(%rax) ; SSE-NEXT: movapd %xmm4, 32(%rax) ; SSE-NEXT: movapd %xmm5, 16(%rax) -; SSE-NEXT: addq $600, %rsp # imm = 0x258 +; SSE-NEXT: addq $568, %rsp # imm = 0x238 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: 
vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[2],xmm7[2],zero +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm7[2],xmm8[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm11[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm0[2],xmm6[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa 
336(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm10[6],xmm15[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5],xmm15[6],xmm13[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: 
vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm13[0],mem[1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm9[1],xmm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] @@ -4132,479 +4134,475 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5],xmm7[6],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded 
Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm7[6],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm11[0],mem[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm1[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm1[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm15[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsllq $16, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm3, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm3[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3,4,5,6],xmm13[7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm13, %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm3[6],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = 
[65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm6[0,1,2,3,4,5],mem[6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3,4,5],xmm7[6],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm0[1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm4[1,2],xmm13[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm5[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] -; 
AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm6[6],mem[7] +; AVX1-ONLY-NEXT: vpshuflw 
{{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm1[1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} 
ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded 
Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5],xmm11[6],xmm13[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0],xmm14[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 
16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm14[0,1,2,3,4,5],mem[6],xmm14[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5],xmm0[6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5],xmm5[6],xmm7[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm15[1],xmm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm14[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm13[1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps 
%ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = zero,xmm3[1],mem[0],zero +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX1-ONLY-NEXT: vzeroupper @@ -4612,136 +4610,129 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-SLOW-LABEL: load_i16_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-SLOW-NEXT: subq $552, %rsp # imm = 0x228 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4,5],ymm13[6],ymm15[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, 
%ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1],xmm8[2,3,4,5],xmm5[6],xmm8[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5,6,7,8,9,10],ymm9[11],ymm0[12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2,3,4,5],xmm2[6],xmm9[7] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu 
%ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5,6,7,8,9,10,11],ymm10[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, 
%ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm15[1],ymm13[2,3,4],ymm15[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -4750,51 +4741,49 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = 
ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5],xmm14[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4],xmm1[5],xmm13[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -4806,35 +4795,38 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6,7],ymm10[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] @@ -4843,126 +4835,125 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm14[2],ymm15[3,4,5],ymm14[6],ymm15[7] +; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm8 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6],ymm8[7,8,9,10,11,12,13],ymm5[14],ymm8[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7,8,9,10,11,12,13],ymm4[14],ymm8[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm7[2],ymm2[3,4,5],ymm7[6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7,8,9,10,11,12,13],ymm3[14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm7[6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7],ymm5[8,9,10,11,12],ymm12[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm9[2],ymm13[3,4],ymm9[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1,2,3,4,5,6],ymm14[7,8],ymm4[9,10,11,12,13,14],ymm14[15] +; AVX2-SLOW-NEXT: vpshufb 
%ymm8, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; 
AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] @@ -4971,62 +4962,62 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r9) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm9[2,3],ymm13[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd $237, (%rsp), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-SLOW-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -5034,366 +5025,374 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,1,u,4,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm5[2],ymm12[3,4],ymm5[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,5,1,u,4,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2,3,4,5],xmm2[6],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5],xmm1[6],xmm11[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2],xmm1[3],xmm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2],xmm1[3],xmm11[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,2,5,3,6,2,5] ; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), 
%ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,2] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm9[4],xmm2[5],xmm9[6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3,4,5],xmm0[6],xmm5[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm7[0,1,1,3] +; 
AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm3[1],ymm12[2,3],ymm3[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm6 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm10[2],ymm7[3,4,5],ymm10[6],ymm7[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1],xmm12[2],xmm5[3],xmm12[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,5,1,4,2,5,1,4] +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5,6,7],ymm9[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6,7],ymm15[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, 
%ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm3[2],ymm11[3,4],ymm3[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm6[3],ymm15[4,5],ymm6[6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd (%rsp), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: 
vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5],mem[6],ymm8[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,4,0,3,7,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpshufb 
%xmm10, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -5407,7 +5406,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) @@ -5418,8 +5417,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5432,381 +5430,388 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $552, %rsp # imm = 0x228 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 -; 
AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm8[2],ymm13[3,4,5],ymm8[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4],xmm2[5],xmm8[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1],xmm8[2,3,4,5],xmm5[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = 
[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3],ymm1[4,5,6,7,8,9,10],ymm5[11],ymm1[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm7[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm4[1],ymm10[2,3],ymm4[4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, 
%ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5,6,7,8,9,10,11],ymm10[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, 
%ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm4[2],ymm9[3,4,5],ymm4[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6,7],ymm12[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm5[2],ymm15[3,4,5],ymm5[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5],xmm14[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4,5],ymm9[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3,4,5,6,7],ymm12[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm8 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7] ; 
AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3,4,5,6,7],ymm8[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6],ymm12[7,8,9,10,11,12,13],ymm4[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm4[1],xmm13[2],xmm4[3],xmm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm14 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm3[2],ymm12[3,4,5],ymm3[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6],ymm4[7,8,9,10,11,12,13],ymm2[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] 
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5,6,7],ymm4[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7],ymm10[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6],ymm13[7,8],ymm10[9,10,11,12,13,14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4],xmm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm10[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7],ymm12[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6,7,8],ymm10[9],ymm5[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm4[1,2,3,4,5,6,7],ymm12[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6,7,8],ymm6[9],ymm4[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10 
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm12 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1],xmm10[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7,8],ymm4[9],ymm2[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) @@ -5827,372 +5832,364 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: pushq %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm5, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,4,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,1,0,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3],xmm7[4],xmm2[5],xmm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6,7,8,9,10],ymm2[11],ymm8[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm8, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4],xmm2[5],xmm8[6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm14, %xmm17 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm15, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm13[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm14, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6],ymm2[7,8,9,10,11,12,13],ymm8[14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2],xmm7[3,4,5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3],xmm10[4],xmm0[5],xmm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6,7,8,9,10],ymm10[11],ymm11[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3,4,5],xmm11[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm11[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm15[2],ymm14[3,4,5],ymm15[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm5[1],xmm12[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7,8,9,10,11],ymm10[12],ymm9[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm9, %ymm10, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm9, %ymm10, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm8[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm22[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm12, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3],xmm8[4],xmm9[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,5],xmm0[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,7,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5,6,7],ymm9[8,9,10,11,12],ymm11[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm5, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6],ymm13[7,8,9,10,11,12,13],ymm0[14],ymm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm28[0,2,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 
= ymm0[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2],xmm13[3,4,5,6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm13[2],ymm2[3,4],ymm13[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2,3,4,5,6],ymm11[7,8],ymm2[9,10,11,12,13,14],ymm11[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq 
{{.*#+}} ymm10 = ymm1[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7,8],ymm1[9,10,11,12,13,14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm13[2,3],ymm2[4,5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm11[1],ymm9[2,3,4],ymm11[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm11[4],xmm2[5],xmm11[6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = 
xmm10[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4],xmm2[5],xmm14[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm2[2],ymm13[3,4,5],ymm2[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = 
ymm14[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm14, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3,4,5,6,7],ymm14[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm4[2],ymm12[3,4,5],ymm4[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 
= ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm15[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5,6,7,8,9,10],ymm13[11],ymm10[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm10[3],ymm1[4,5,6,7,8,9,10],ymm10[11],ymm1[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm13, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3,4,5],xmm10[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4],ymm10[5,6,7,8,9,10,11],ymm14[12],ymm10[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2],xmm14[3],xmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm10, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7,8,9,10,11],ymm10[12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm10[1],xmm14[2],xmm10[3],xmm14[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm9, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6],ymm10[7,8],ymm9[9,10,11,12,13,14],ymm10[15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 
= ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm10, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7,8],ymm1[9,10,11,12,13,14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm14 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm9[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6,7,8],ymm9[9],ymm4[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm9[2],ymm11[3,4,5],ymm9[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1,2],ymm5[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm24 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm2, %zmm24 ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm3, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm2, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm27, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm12, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm2, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm26, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-SLOW-NEXT: popq %rax +; 
AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm25, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -6200,669 +6197,664 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,u,u,u,5,8,12,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [10,3,6,15,12,13,6,15] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <1,u,u,u,5,8,12,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,u,u,u,4,8,11,15> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm17, %zmm19, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 +; 
AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm25, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm27[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm11, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm11, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm22 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3],xmm15[4],xmm11[5],xmm15[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; 
AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm11[3],xmm9[4],xmm11[5],xmm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3,4,5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm8, %ymm9, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm8[0,1,2,3],xmm0[4],xmm8[5],xmm0[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm27, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm2[2],ymm8[3,4,5],ymm2[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3,4,5],xmm9[6],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm13, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7],ymm13[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb 
{{.*#+}} xmm9 = xmm9[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm15, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm16, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0],xmm12[1],xmm8[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm9[4],xmm15[5],xmm9[6],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3,4,5,6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3,4,5],xmm13[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm9, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [2,5,2,5,2,5,2,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm27, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm13[4],xmm7[5],xmm13[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm18, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm13, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm17, %zmm25, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2],xmm2[3],xmm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm24, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm2, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3,4,5],xmm7[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm27[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm2[2],ymm9[3,4,5],ymm2[6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm15[4],xmm7[5],xmm15[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,11,2,11,12,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; 
AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm15, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm8, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm19, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,3,u,0,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7],ymm10[8,9,10,11,12],ymm13[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm12, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm10, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,3,u,0,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm27, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7],ymm11[8,9,10,11,12],ymm10[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm12, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, 
%xmm0, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5],xmm6[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3,4,5],xmm13[6],xmm11[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm11, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: 
vinserti64x4 $1, %ymm20, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm6[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3,4,5],xmm10[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7],ymm10[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3,4,5],xmm13[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm27, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm14[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7],ymm14[8,9,10,11,12],ymm13[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,6,10,13,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm12, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,4,8,11,15,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm21, %zmm25, %zmm31 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm27, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm21, %zmm30, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm26, %zmm30, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm30, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm20, %zmm25, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm16, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, 
%zmm24, %zmm25, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm12, %zmm30, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm8, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf32: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: pushq %rax -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm5, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,4,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; 
AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1,2],xmm2[3],xmm3[4],xmm2[5],xmm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm5[1],xmm15[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm24 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm19 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm0[0,1,1,3] 
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3],xmm12[4],xmm0[5],xmm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5,6,7,8,9,10],ymm12[11],ymm14[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %ymm12, %ymm14, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm11[1],xmm13[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,0,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm8, %zmm19 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4],ymm8[5,6,7,8,9,10,11],ymm12[12],ymm8[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vporq %ymm8, %ymm12, %ymm22 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2,3,4,5],xmm8[6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm5[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,0,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm11, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm12, %zmm25 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm11 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2],xmm15[3],xmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vporq %ymm14, %ymm0, %ymm23 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm5[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm15, %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm21 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm24[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm13, %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4],xmm5[5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm5, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm27 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5],xmm1[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm8, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm13, %xmm28 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6],ymm0[7,8,9,10,11,12,13],ymm5[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,1] +; 
AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3,4,5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5],xmm0[6],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm8[1,2,3,4,5,6],ymm12[7,8],ymm8[9,10,11,12,13,14],ymm12[15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4,5,6],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4,5,6,7,8],ymm8[9],ymm0[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm12[4],xmm8[5],xmm12[6],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; 
AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm24 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm20, %zmm25 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm18 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm5[2],ymm14[3,4,5],ymm5[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm7 +; AVX512DQ-SLOW-NEXT: 
vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3,4,5],xmm0[6],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm4[2],ymm5[3,4,5],ymm4[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm12[4],xmm8[5],xmm12[6],xmm8[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm20, %zmm22 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm3[2],ymm1[3,4,5],ymm3[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22 -; AVX512DQ-SLOW-NEXT: 
vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6],ymm0[7,8,9,10,11,12,13],ymm8[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7,8,9,10],ymm13[11],ymm12[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3,4,5],xmm11[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm12, %ymm11, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4],ymm11[5,6,7,8,9,10,11],ymm12[12],ymm11[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm13[5,6,7],ymm9[8,9,10,11,12],ymm13[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm9, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm9, %xmm21 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm21[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7,8,9,10],ymm12[11],ymm3[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm3[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4],ymm3[5,6,7,8,9,10,11],ymm10[12],ymm3[13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm2[1,2,3,4,5,6],ymm10[7,8],ymm2[9,10,11,12,13,14],ymm10[15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5,6,7],ymm10[8,9,10,11,12],ymm4[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm13[2,3],ymm4[4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw 
{{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6,7,8],ymm7[9],ymm3[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2],xmm7[3],xmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,1] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm25, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4,5],ymm14[6],ymm15[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,3,1] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 
{%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm19 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-SLOW-NEXT: popq %rax ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq @@ -6870,45 +6862,42 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <1,u,u,u,5,8,12,15> ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm18, %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,u,u,u,4,8,11,15> ; 
AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm30, %zmm9 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,u,u,4,7,11,14> ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm4, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm21, %zmm5 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm25, %zmm28, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm26[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm26[0,1,0,2] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vporq %ymm7, %ymm10, %ymm22 ; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 @@ -6918,230 +6907,224 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm6 ; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3],xmm11[4],xmm10[5],xmm11[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3],xmm12[4],xmm10[5],xmm12[6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] ; 
AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4,5,6],xmm8[7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm23 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm9, %ymm20 +; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm9, %ymm21 ; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm3[2],ymm9[3,4,5],ymm3[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0],xmm0[1],xmm10[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm25 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5],xmm11[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm3[2],ymm10[3,4,5],ymm3[6],ymm10[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4],xmm9[5],xmm12[6],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm0[1],xmm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm24 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6],xmm11[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3,4,5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3,4,5],xmm13[6],xmm9[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1],xmm13[2,3,4,5],xmm2[6],xmm13[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; 
AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [2,5,2,5,2,5,2,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm14 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5],xmm8[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm19, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4],xmm13[5],xmm8[6],xmm13[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm19, %zmm13 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm12, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm21, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm28, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm11, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2],xmm9[3],xmm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm9, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm10[1],ymm3[2,3],ymm10[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm26[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm26[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm8[2],ymm2[3,4,5],ymm8[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX512DQ-FAST-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm0[4],xmm11[5],xmm0[6],xmm11[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2,3,4,5],xmm11[6],xmm14[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = <0,3,7,10,14,u,u,u> ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm17, %zmm19 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm28, %zmm14 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm19 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm28 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm10, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm29, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm28 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4],ymm10[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm12, %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm3, %zmm3 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2,3,4,5],xmm14[6],xmm9[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm10, %zmm10 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpermd %zmm25, %zmm18, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm9 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,3,3,u,0,3,7,u> ; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm31, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13 -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7],ymm11[8,9,10,11,12],ymm13[13,14,15] -; 
AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm31, %xmm9 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm11 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm12 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3],xmm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3],xmm14[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm13, %zmm13 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm17, %zmm9 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7],ymm13[8,9,10],ymm9[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,4,8,11,15,u,u,u> +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm17, %zmm10 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] @@ -7149,25 +7132,25 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm11, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm25 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm25, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm24, %zmm1, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -7175,121 +7158,121 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: 
movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm3 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm3 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm9, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = 
<2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm11 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm12 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm8 +; 
AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm13 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm14 +; 
AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm14 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <224 x i16>, ptr %in.vec, align 64 @@ -7314,798 +7297,849 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i16_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1352, %rsp # imm = 0x548 -; SSE-NEXT: movdqa 640(%rdi), %xmm9 -; SSE-NEXT: movdqa 624(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 640(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 624(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm11 ; SSE-NEXT: movdqa 128(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm13 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm14 ; SSE-NEXT: movdqa 176(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: 
movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,2] +; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,0,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 656(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,0,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa 656(%rdi), %xmm15 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movaps 608(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 592(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 560(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 576(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 592(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movdqa 560(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 576(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movaps 32(%rdi), 
%xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 528(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 512(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movaps 496(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 448(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 464(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 480(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movdqa 448(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 464(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 416(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movaps 384(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 336(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 864(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 848(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 368(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pand %xmm13, %xmm2 
+; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 880(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 864(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 848(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movaps 832(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 816(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 784(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 800(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 816(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movdqa 784(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 800(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movaps 272(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 224(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 256(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 768(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 752(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 736(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movaps 720(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 672(%rdi), %xmm0 -; SSE-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 688(%rdi), %xmm0 +; SSE-NEXT: movaps 704(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movdqa 672(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 688(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: psrld $16, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = 
[65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm15, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: 
punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: 
pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand 
%xmm7, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; 
SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm4 -; SSE-NEXT: orps %xmm2, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm4, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = 
xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd $196, (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm6 -; SSE-NEXT: orps %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: orps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} 
xmm3 = xmm7[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: orps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm6, %xmm0 +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm7, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm4, 
%xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] ; SSE-NEXT: movss 
{{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm6, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm7, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm10, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,1,0,1] -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 
16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] +; SSE-NEXT: pand %xmm6, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,1,0,1] +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: orps %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm13, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm11[0],xmm6[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm6 +; SSE-NEXT: orps %xmm10, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm14, %xmm6 -; 
SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm10 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = 
xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -8113,541 +8147,489 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = 
xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn 
%xmm4, %xmm3 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; 
SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: psrlq $16, %xmm3 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: 
punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm3 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm3 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd 
{{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm3 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm3 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: psrld $16, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: psrlq $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = 
xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: psrld $16, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: psrlq $16, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; 
SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded 
Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: 
pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -8655,13 +8637,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8674,14 +8656,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8698,13 +8680,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = 
mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -8714,11 +8696,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] @@ -8734,11 +8716,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] @@ -8754,7 +8736,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] @@ -8782,7 +8764,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -8817,8 +8800,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movdqa %xmm13, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8849,7 +8831,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movdqa %xmm14, 112(%r8) +; SSE-NEXT: movdqa %xmm12, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8881,9 +8863,10 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm11, 112(%rax) -; SSE-NEXT: movaps %xmm12, 96(%rax) -; SSE-NEXT: movaps %xmm13, 80(%rax) +; SSE-NEXT: movaps %xmm14, 112(%rax) +; SSE-NEXT: movaps %xmm15, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8892,17 +8875,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 112(%rax) ; SSE-NEXT: movapd %xmm4, 96(%rax) ; SSE-NEXT: movapd %xmm5, 80(%rax) ; SSE-NEXT: movapd %xmm6, 64(%rax) ; SSE-NEXT: movapd %xmm7, 48(%rax) -; SSE-NEXT: movapd %xmm8, 32(%rax) -; SSE-NEXT: movapd %xmm9, 16(%rax) -; SSE-NEXT: movapd %xmm10, (%rax) +; SSE-NEXT: movapd %xmm9, 32(%rax) +; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: movapd %xmm11, (%rax) ; SSE-NEXT: addq $1352, %rsp # imm = 0x548 ; SSE-NEXT: retq ; @@ -8931,11 +8913,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] @@ -8953,11 +8935,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm6[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm7[2],xmm3[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 @@ -8970,7 +8952,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 @@ -8980,11 +8962,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] @@ -8996,10 +8978,10 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm15[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm5[2],zero ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4 @@ -9036,40 +9018,40 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm9[2],xmm3[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} 
xmm3 = zero,xmm10[2],xmm12[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 @@ -9085,18 +9067,18 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] @@ -9108,11 +9090,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm14[2],xmm3[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm15[2],xmm3[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9150,150 +9132,148 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm0[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm1[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1],xmm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm1[0],mem[1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps 
%ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5],xmm0[6],xmm10[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5],xmm9[6],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm15[1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm14[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld 
$16, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] @@ -9303,13 +9283,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -9334,8 +9313,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] @@ -9343,32 +9321,32 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; 
AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] @@ -9378,9 +9356,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] @@ -9394,37 +9372,38 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: 
vorps %ymm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsllq $16, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,1] @@ -9432,30 +9411,30 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm15, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] @@ -9463,28 +9442,28 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd $68, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7] ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm10, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -9493,8 +9472,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3,4,5],xmm7[6],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3,4,5,6,7] @@ -9525,22 +9504,22 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 -; 
AVX1-ONLY-NEXT: vandnps %ymm12, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm10, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm9, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm8, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm11[0,1,2,3,4,5],mem[6],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1,2],xmm12[3,4,5,6,7] @@ -9552,22 +9531,22 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0],xmm8[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = 
xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 @@ -9579,39 +9558,41 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm13[0,1,2,3,4,5],mem[6],xmm13[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm12[6],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm4[1,2],xmm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm1[0],mem[1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; AVX1-ONLY-NEXT: vpunpckhwd (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm14, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] @@ -9625,8 +9606,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm0[6],mem[7] +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm3[1,2],xmm12[3,4,5,6,7] @@ -9664,12 +9645,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -9680,12 +9663,10 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = 
mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -9693,8 +9674,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -9702,46 +9682,46 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -9751,19 +9731,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] @@ -9772,7 +9752,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhwd (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload @@ -9780,16 +9760,17 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] 
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] @@ -9820,247 +9801,286 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm9[0],xmm14[1],xmm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm2 ; 
AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm0[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7] +; 
AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = 
xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5],xmm7[6],xmm8[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm1[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm12[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} 
xmm5 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = zero,xmm3[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5],xmm2[6],xmm8[7] +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm2[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm13[1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = zero,xmm14[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm3[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = zero,xmm15[1],mem[0],zero -; 
AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = zero,xmm0[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm8[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm6, %ymm6 +; 
AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload @@ -10090,7 +10110,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $196, (%rsp), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7] @@ -10099,65 +10119,25 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = zero,xmm0[1],mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm0[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10193,529 +10173,525 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) ; AVX1-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1448, %rsp # imm = 0x5A8 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12 -; 
AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-SLOW-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, 
%xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm0[2],ymm8[3,4,5],ymm0[6],ymm8[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = 
ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7,8,9,10],ymm4[11],ymm2[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm10[2],ymm1[3,4],ymm10[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3],ymm2[4,5,6,7,8,9,10],ymm7[11],ymm2[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm7, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3],ymm2[4,5,6,7,8,9,10],ymm7[11],ymm2[12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm7, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3],ymm2[4,5,6,7,8,9,10],ymm7[11],ymm2[12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm11[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4],ymm5[5,6,7,8,9,10,11],ymm2[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm13[2,3],ymm3[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4],ymm5[5,6,7,8,9,10,11],ymm8[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm14[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4],ymm5[5,6,7,8,9,10,11],ymm8[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6,7,8,9,10,11],ymm7[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2],xmm2[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm12[0,1,2],mem[3],ymm12[4,5],mem[6],ymm12[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm2 
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm4[1],ymm11[2,3,4],ymm4[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2],xmm2[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2],xmm2[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2],xmm2[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # 
ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm13 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd $31, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9 -; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5],xmm10[6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm7[2],ymm3[3,4,5],ymm7[6],ymm3[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; 
AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm8[2],ymm2[3,4,5],ymm8[6],ymm2[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] -; 
AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] -; AVX2-SLOW-NEXT: 
vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3,4,5],xmm0[6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3,4,5],xmm0[6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte 
Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3,4,5],xmm0[6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, 
%ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm13[2],ymm12[3,4,5],ymm13[6],ymm12[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2],xmm4[3],xmm9[4,5,6,7] +; AVX2-SLOW-NEXT: 
vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] @@ -10725,61 +10701,62 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4,5],ymm6[6],mem[7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $36, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; 
AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7,8,9,10,11,12,13],ymm4[14],ymm2[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $221, (%rsp), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm7[0,1],mem[2],ymm7[3,4,5],mem[6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] @@ -10787,27 +10764,28 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), 
%xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7] +; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5],xmm2[6],xmm5[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] @@ -10820,7 +10798,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] @@ -10828,590 +10806,585 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm8[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, 
%ymm0, %ymm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5,6,7],ymm9[8,9,10,11,12],ymm15[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, (%rsp), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm9[1,2,3,4,5,6,7],ymm4[8],ymm9[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1],xmm4[2],xmm9[3],xmm4[4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,5],xmm4[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: 
vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7],ymm9[8,9,10,11,12],ymm14[13,14,15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7,8],ymm6[9,10,11,12,13,14],ymm7[15] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,2],ymm6[3],mem[4,5],ymm6[6],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm9[1,2,3,4,5,6,7],ymm2[8],ymm9[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 
= xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4,5,6,7,8],ymm9[9],ymm2[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7] +; AVX2-SLOW-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm5[1],mem[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3,4,5,6,7],ymm6[8],ymm7[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5,6,7,8],ymm10[9],ymm7[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2],ymm6[3],mem[4,5],ymm6[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm6[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm9[1,2,3,4,5,6,7],ymm2[8],ymm9[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6,7,8],ymm12[9],ymm9[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3],xmm11[4],xmm8[5],xmm11[6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpblendd $72, (%rsp), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4,5,6,7,8],ymm7[9],ymm1[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7,8],ymm8[9],ymm1[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm7[2,3],mem[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2],ymm8[3],mem[4,5],ymm8[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm7[1,2,3,4,5,6,7],ymm1[8],ymm7[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] 
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) +; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-SLOW-NEXT: addq $1448, %rsp # imm = 0x5A8 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-SLOW-NEXT: addq $1432, %rsp # imm = 0x598 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride7_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), 
%ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm8[2],ymm2[3,4,5],ymm8[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm14[1],ymm4[2,3,4],ymm14[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqu 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,5,1,u,4,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,1,u,5,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,5,1,u,4,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2,3,4,5],xmm5[6],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermd 
%ymm9, %ymm15, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2,3,4,5],xmm5[6],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm15, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2,3,4,5],xmm5[6],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; 
AVX2-FAST-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm13[0,1,2],mem[3],ymm13[4,5],mem[6],ymm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,2,5,3,6,2,5] ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm7 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm9 ; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm9 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5],xmm14[6],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm12[2],ymm14[3,4,5],ymm12[6],ymm14[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; 
AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm11[2],ymm8[3,4,5],ymm11[6],ymm8[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] @@ -11419,42 +11392,43 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -11467,436 +11441,440 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 
= ymm11[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm9[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,1,3] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] -; AVX2-FAST-NEXT: 
vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; 
AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4,5],mem[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm7[2],ymm13[3,4],ymm7[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm11 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; 
AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm9[2],ymm11[3,4,5],ymm9[6],ymm11[7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded 
Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,5,1,4,2,5,1,4] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm6[3],ymm15[4,5],ymm6[6],ymm15[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpshufb 
%ymm3, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7],ymm15[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6,7],ymm15[8],ymm14[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5,6,7],ymm15[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte 
Reload +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15] ; AVX2-FAST-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} 
ymm15 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm7[1],mem[2,3],ymm7[4],mem[5,6,7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,0,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # 
ymm8 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) -; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; 
AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-FAST-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FAST-NEXT: vzeroupper @@ -11904,571 +11882,573 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; 
AVX2-FAST-PERLANE-NEXT: subq $1448, %rsp # imm = 0x5A8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4,5],ymm5[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, 
%ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4,5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4,5],ymm0[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = 
[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2,3,4,5],xmm2[6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm15[2],ymm6[3,4],ymm15[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2,3,4,5],xmm2[6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4],ymm2[5,6,7,8,9,10,11],ymm7[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm15[2,3],ymm10[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4],ymm2[5,6,7,8,9,10,11],ymm7[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] 
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6,7,8,9,10,11],ymm6[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2],ymm2[3],mem[4,5],ymm2[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2],ymm2[3],mem[4,5],ymm2[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm14[1],ymm4[2,3,4],ymm14[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm10[0,1,2],mem[3],ymm10[4,5],mem[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm0[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = 
[20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm10, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm14, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm6 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm15[4],xmm6[5],xmm15[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm6[1,2,3,4,5,6,7],ymm1[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm5[2],ymm3[3,4,5],ymm5[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm15[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm14[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm9[3],ymm2[4,5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm12[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm5 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm14[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6],ymm7[7,8,9,10,11,12,13],ymm5[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm1[1],ymm8[2,3,4],ymm1[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, (%rsp), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm13[2],ymm8[3,4,5],ymm13[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4,5],mem[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: 
vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm11[3],mem[4,5],ymm11[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -12476,116 +12456,117 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd 
$204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3],xmm1[4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm4[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7],ymm1[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = 
xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7],ymm15[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm6[0],ymm14[1,2,3,4,5,6,7],ymm6[8],ymm14[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2],xmm11[3],xmm8[4],xmm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5],xmm14[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2],xmm13[3],xmm6[4],xmm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm6[0,1,2,3,4,5],xmm7[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7],ymm8[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] -; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6],ymm13[7,8],ymm8[9,10,11,12,13,14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5,6,7],ymm13[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, (%rsp), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7,8],ymm13[9,10,11,12,13,14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm13, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm15[2,3],mem[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -12595,111 +12576,112 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6,7,8],ymm9[9],ymm8[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[3],xmm5[4],xmm3[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb 
%ymm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm11[1,2,3,4,5,6,7],ymm2[8],ymm11[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3,4,5,6,7,8],ymm13[9],ymm11[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm9[2,3],mem[4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7,8],ymm8[9],ymm1[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6,7,8],ymm9[9],ymm1[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3],xmm11[4],xmm9[5],xmm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm8[1,2,3,4,5,6,7],ymm1[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1],xmm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5,6,7],ymm9[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm9[1,2,3,4,5,6,7],ymm1[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3],xmm11[4],xmm9[5],xmm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0],xmm6[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7,8],ymm6[9],ymm5[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6,7,8],ymm6[9],ymm4[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2],ymm15[3],mem[4,5],ymm15[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 
= xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -12753,1350 +12735,1345 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1448, %rsp # imm = 0x5A8 +; AVX2-FAST-PERLANE-NEXT: addq $1464, %rsp # imm = 0x5B8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1864, %rsp # imm = 0x748 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: subq $1832, %rsp # imm = 0x728 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 480(%rdi), %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm13[2],ymm2[3,4,5],ymm13[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), 
%xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm2, %ymm0, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm2[2],ymm4[3,4,5],ymm2[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm5[2],ymm3[3,4,5],ymm5[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm19 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm19[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm25[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm15[1],xmm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: 
vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7,8,9,10],ymm5[11],ymm6[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,1,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2,3,4,5],xmm6[6],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; 
AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,1,1,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm25[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0],xmm1[1],xmm9[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, 
%xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3],ymm10[4,5,6,7,8,9,10],ymm5[11],ymm10[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2,3,4,5],xmm7[6],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm20[0,1,0,2] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm23, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4,5,6],ymm13[7] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm5[1],xmm7[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm15[2,3],ymm2[4,5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm15, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4],ymm13[5,6,7,8,9,10,11],ymm14[12],ymm13[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm13, %ymm14, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm19[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,0,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm9, %xmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,3,2,3,4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm0[2,3],ymm10[4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7,8,9,10,11],ymm13[12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm20[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 
= <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm19[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm31, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3],xmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm7, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm28, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm16, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3],xmm5[4],xmm8[5,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm21, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5],xmm8[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm23, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm10 +; 
AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] 
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm22 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm11[1],xmm8[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: 
vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6],ymm0[7,8,9,10,11,12,13],ymm14[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm18[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = 
xmm14[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3,4,5,6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3,4,5,6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = 
xmm11[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7,8],ymm13[9,10,11,12,13,14],ymm14[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm24, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4,5,6,7,8],ymm0[9],ymm13[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm12[2],ymm3[3,4,5],ymm12[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm22, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm18 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm18[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3,4,5,6,7,8],ymm0[9],ymm6[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm24, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm8[2],ymm2[3,4,5],ymm8[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm22, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4,5],ymm4[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 
= xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4,5,6,7],ymm11[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm19 
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3,4,5,6,7],ymm6[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6],ymm11[7,8,9,10,11,12,13],ymm0[14],ymm11[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7,8,9,10,11,12,13],ymm0[14],ymm9[15] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm27[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm13[3],ymm9[4,5,6,7,8,9,10],ymm13[11],ymm9[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} 
ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm11, %ymm13, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm9, %ymm13, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4,5],ymm0[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7,8,9,10,11,12,13],ymm0[14],ymm6[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm22[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5,6,7,8,9,10],ymm11[11],ymm6[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1],xmm6[2,3,4,5],xmm11[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7,8,9,10],ymm9[11],ymm1[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0],ymm2[1],ymm15[2,3],ymm2[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3,4,5],xmm13[6],xmm9[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = 
xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm9, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2],xmm6[3],xmm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0],ymm11[1],ymm8[2,3,4],ymm11[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2],xmm9[3],xmm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb 
{{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm16 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm2[3],ymm7[4,5],ymm2[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm11[1],xmm15[2],xmm11[3],xmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; 
AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3,4],ymm1[5],ymm5[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; 
AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4,5],ymm2[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,3,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm8[0],ymm15[1],ymm8[2,3,4,5,6,7,8],ymm15[9],ymm8[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7,8],ymm15[9],ymm14[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm14, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,4,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm20, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm4[0,1],mem[2],ymm4[3,4,5],mem[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm20, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1],mem[2],ymm8[3,4,5],mem[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm12, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} 
ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm11[1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm23 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm10 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm30 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm25 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm12, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm19, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm11 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm23, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) 
-; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1864, %rsp # imm = 0x748 +; AVX512F-ONLY-SLOW-NEXT: addq $1832, %rsp # imm = 0x728 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $1768, %rsp # imm = 0x6E8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,12,5,12,5,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm6, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 672(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: 
vpshufb %xmm4, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm22 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm23[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm7[2],ymm2[3,4,5],ymm7[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm4, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = 
xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm4, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm11[2],ymm3[3,4,5],ymm11[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm15, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 ; 
AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm30[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm1[1],xmm13[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm29[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm2[1],xmm4[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm3[1],xmm9[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,6,9,u,13,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm0 ; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm2[1],ymm10[2,3],ymm2[4],ymm10[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm11, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1],xmm8[2,3,4,5],xmm5[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm11, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm18 = [2,5,2,5,2,5,2,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm23, %ymm18, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm29, %ymm18, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,3,6,15,12,13,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm18, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3,4,5],xmm10[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm23[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1],xmm15[2],xmm8[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm4[1],ymm11[2,3,4],ymm4[5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2],xmm0[3],xmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; 
AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm18, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm30[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm29[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm20, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm23, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm25, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm18, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm17, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm17, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm31, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%xmm6, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,3,3,0,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm23, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,5,9,12,2,5,9,12] ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm6, %zmm15, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7],ymm5[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm31, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm12[0,1,2],xmm3[3],xmm12[4],xmm3[5],xmm12[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4,5,6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3,4,5,6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm15, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm3[4],xmm12[5],xmm3[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4,5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm5[0,1,2],xmm1[3],xmm5[4],xmm1[5],xmm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm6, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm29, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm22, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm23, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm6, %zmm22, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm15, %zmm3 +; 
AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3],xmm12[4],xmm1[5],xmm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4,5,6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm2[3],ymm10[4,5],ymm2[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5],xmm1[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,u,u,u,4,8,11,15> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3,4,5,6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm22, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4],xmm14[5],xmm1[6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4,5,6],xmm8[7] +; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm22, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,u,u,u,5,8,12,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm23, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <1,u,u,u,5,8,12,15> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm18, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm29, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb 
%ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm5, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm5, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; 
AVX512F-ONLY-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4,5],ymm1[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm22 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -14105,603 +14082,602 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm26, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm25 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3,4,5],xmm11[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm4[1],ymm10[2,3],ymm4[4],ymm10[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm12[1],xmm5[2,3,4,5],xmm12[6],xmm5[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm14, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 
32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm6[3],ymm13[4,5],ymm6[6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm22 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm15, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm26, %zmm17, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3,4,5],xmm12[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1],xmm15[2],xmm5[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <3,u,u,u,6,10,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm18, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7],ymm15[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm24, %zmm17, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0],ymm14[1],ymm4[2,3],ymm14[4],ymm4[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm17 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1],xmm11[2],xmm4[3],xmm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm18, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3,4,5,6,7],ymm2[8,9,10],ymm8[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,4,8,11,15,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm12[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb 
%xmm4, %xmm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm24, %zmm16, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm9, 
%xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm17, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm13 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm17 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm10, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm9[1,2],ymm7[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0],ymm7[1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3,4,5,6,7],ymm2[8,9,10],ymm7[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: 
vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm10 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm18, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm12, (%rax) ; AVX512F-ONLY-FAST-NEXT: addq $1768, %rsp # imm = 0x6E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $1560, %rsp # imm = 0x618 +; AVX512DQ-SLOW-NEXT: subq $1512, %rsp # imm = 0x5E8 ; AVX512DQ-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm24 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vporq %ymm3, %ymm2, %ymm27 -; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vporq %ymm2, %ymm0, %ymm22 +; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, 
%ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm18[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm8 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm19[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7,8,9,10],ymm0[11],ymm2[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3,4,5],xmm3[6],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; 
AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7,8,9,10],ymm5[11],ymm6[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm1[1],xmm15[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0],xmm4[1],xmm11[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,0,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 -; 
AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7,8,9,10],ymm6[11],ymm7[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2],ymm12[3,4,5],ymm14[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm20[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm16 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3],ymm13[4,5,6,7,8,9,10],ymm5[11],ymm13[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1],xmm13[2,3,4,5],xmm7[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm17, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm4[2],ymm0[3,4,5],ymm4[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm13[4],xmm5[5],xmm13[6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm23 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm23[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm25[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0],xmm10[1],xmm4[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5,6,7,8,9,10,11],ymm15[12],ymm14[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm1[1],xmm15[2,3,4,5],xmm1[6],xmm15[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm14, %ymm14 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm14, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm18 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm19[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,0,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm11, %xmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm9 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3,4,5],xmm15[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm9 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm20[0,1,1,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm23[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%xmm4, %xmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm10, %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vporq %ymm8, %ymm7, %ymm30 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm4[1],ymm11[2,3,4],ymm4[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm11[1],ymm9[2,3,4],ymm11[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: 
vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm18[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm8[1],ymm15[2,3,4],ymm8[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm19[0,1,1,3] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm17, %xmm7 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm26, %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm11[1],ymm9[2,3,4],ymm11[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,1,3] +; AVX512DQ-SLOW-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm7 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm16, %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],xmm5[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm22, %xmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3],xmm2[4],xmm7[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm23, %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7],ymm7[8,9,10,11,12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5],xmm3[6],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7],ymm0[8,9,10,11,12],ymm12[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm9 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm29, %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2],xmm12[3],xmm0[4],xmm12[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm17, %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm3[0],xmm14[1],xmm3[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm18 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 {%k1} # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 @@ -14710,475 +14686,483 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm10 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; 
AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm8[3],ymm15[4,5],ymm8[6],ymm15[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm20 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6],ymm9[7,8,9,10,11,12,13],ymm13[14],ymm9[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4,5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; 
AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 {%k1} # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4,5,6],ymm15[7,8],ymm13[9,10,11,12,13,14],ymm15[15] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, 
%xmm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm1[3],ymm11[4,5],ymm1[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3,4,5,6],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm13, %xmm29 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm29[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1,2,3,4,5,6],ymm13[7,8],ymm11[9,10,11,12,13,14],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3,4,5,6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = 
xmm9[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3,4,5,6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm16 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm16[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2,3],xmm4[4],xmm9[5],xmm4[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1,2,3,4,5,6],ymm12[7,8],ymm9[9,10,11,12,13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3,4,5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3,4],ymm8[5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm9 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm9[0],ymm0[1],ymm9[2,3,4,5,6,7,8],ymm0[9],ymm9[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm4[2],ymm1[3,4,5],ymm4[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4],xmm9[5],xmm13[6],xmm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm29 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4],xmm9[5],xmm12[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm21, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm16 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm29[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm16[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm15 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm4 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm25 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4,5,6,7,8],ymm0[9],ymm4[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4],xmm4[5],xmm9[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm21, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm2[3],ymm7[4,5],ymm2[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq 
$184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm24 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm4 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4],xmm4[5],xmm9[6],xmm4[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, 
%ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm28, %zmm24 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm28 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] -; 
AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm4 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm7[2],ymm2[3,4,5],ymm7[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm30 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm23 # 64-byte 
Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm2[2],mem[3,4,5],ymm2[6],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm4 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6],ymm13[7,8,9,10,11,12,13],ymm0[14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7,8,9,10,11,12,13],ymm0[14],ymm4[15] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm9[1],xmm14[2,3,4,5],xmm9[6],xmm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpblendd 
{{.*#+}} ymm8 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm6[2,3],ymm14[4,5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm14[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7,8,9,10],ymm11[11],ymm4[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm13, %ymm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4,5],ymm14[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm11, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4,5],ymm8[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm4 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7,8,9,10,11,12,13],ymm0[14],ymm9[15] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5,6,7,8,9,10],ymm11[11],ymm9[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7,8,9,10,11,12,13],ymm0[14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm11, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm17[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3],ymm4[4,5,6,7,8,9,10],ymm9[11],ymm4[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm9, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7,8,9,10,11],ymm4[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%ymm13, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm20 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4,5,6],ymm9[7,8],ymm4[9,10,11,12,13,14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%ymm24, %ymm17 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm13 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm4[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7,8,9,10,11],ymm4[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 
= ymm3[0],ymm12[1],ymm3[2,3],ymm12[4],ymm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3],xmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4,5,6],ymm9[7,8],ymm4[9,10,11,12,13,14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm9 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm22, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6,7,8],ymm4[9],ymm0[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm1[3],ymm10[4,5],ymm1[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; 
AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,3,1] +; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,3,1] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm11 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm7[1],ymm12[2,3],ymm7[4],ymm12[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm14[3],ymm6[4,5],ymm14[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] @@ 
-15186,26 +15170,25 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -15213,746 +15196,749 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX512DQ-SLOW-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1288, %rsp # imm = 0x508 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd 
%zmm31, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm22 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm6, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa 672(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,0,2] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm4, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa 672(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm20 +; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm30 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm30[0,1,0,2] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm28 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm8 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm4, %ymm27 ; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm9 +; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm18 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm4, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm8[2],ymm1[3,4,5],ymm8[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; 
AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm3[1],xmm13[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm31 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm6[2],ymm1[3,4,5],ymm6[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0],xmm9[1],xmm14[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm9, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2,3,4,5],xmm4[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: 
vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm21 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm21[0,1,0,2] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, (%rsp) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm3[1],xmm12[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm10, %ymm13 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm14 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm10, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm22 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm23[0,1,0,2] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm1[1],xmm9[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <2,6,9,u,13,u,u,u> +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm19[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1],xmm12[2],xmm5[3],xmm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm31, %zmm22, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2],ymm6[3],ymm13[4,5],ymm6[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm18 = [2,5,2,5,2,5,2,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm30, %ymm18, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm21 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm22, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm18, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm31, %zmm7, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm21[0,1,1,3] -; 
AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm23 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm8 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm31, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3,4,5],xmm10[6],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm30[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm23[0,1,1,3] +; 
AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm21, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [3,6,10,13,3,6,10,13] ; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm26, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7] ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm2, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm22, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,3,3,0,3,7,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = 
[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7],ymm7[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm11, %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3],xmm9[4],xmm2[5],xmm9[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 {%k1} # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm9 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm18, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm30, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm12, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512DQ-FAST-NEXT: vpsrld $16, 
%xmm20, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, %xmm11 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm24 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,4,7,11,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5,6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm26, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 {%k1} # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3,4,5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm12, %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 {%k1} # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm15 +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm22, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm18 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm30, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm12, %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm22 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; 
AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3],xmm15[4],xmm8[5],xmm15[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5,6],xmm9[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,u,u,u,4,8,11,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm14, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm4, %zmm10 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3,4,5,6],xmm8[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4,5],ymm13[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = 
<0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,u,u,u,5,8,12,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm26, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm27 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3],xmm11[4],xmm4[5],xmm11[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5,6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vpermd %zmm5, %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm4[4],xmm10[5],xmm4[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,u,u,u,4,8,11,15> +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3,4,5,6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm5, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] 
+; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3,4,5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm9[2],ymm6[3,4,5],ymm9[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5],xmm10[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,u,u,u,5,8,12,15> +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm5, %zmm26, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm5 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm26, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm14 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm14 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, 
%xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm9, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,11,2,11,12,5,8,9] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm20 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm25 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm1 # 64-byte Folded Reload +; 
AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm13[2],ymm2[3,4],ymm13[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm31, %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm30, %zmm21 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3,4,5],xmm1[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm22, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7],ymm11[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 
32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm27, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm11 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm30, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1],xmm10[2,3,4,5],xmm1[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm22, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm20 +; 
AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm13[2,3],ymm2[4,5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm27 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm25 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm29 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm22, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3,4,5],xmm15[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1],xmm8[2],xmm15[3],xmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm31, %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, 
%ymm0, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm22, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3,4,5],xmm8[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: 
vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3,4],ymm13[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <3,u,u,u,6,10,13,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm23, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1,2],ymm11[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm11, %ymm14, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm25, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm23, %zmm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,4,8,11,15,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm31, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} 
ymm7 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1,2],ymm9[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rsi) -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rdx) +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm14 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512DQ-FAST-NEXT: addq $1288, %rsp # imm = 0x508 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -15962,189 +15948,189 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 
576(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm10, %zmm9 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm17, %zmm18 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm18 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm17 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm17 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm10 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm18, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm19 ; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm22 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm17, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm17, %zmm16 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm18 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm18 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm20, %zmm18 -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm17 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm17 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm21 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm20, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm23, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm19, %zmm18 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm18 {%k1} ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2} -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm20 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm20 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm22, %zmm20 -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm23 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm23 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm22, %zmm23 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm24, %zmm23 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2} -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm22 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm24, %zmm22 -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm25 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm25 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm21 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm22, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm27, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24 -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm27 
-; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm24, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm27 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm27, %zmm28 +; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm28 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm24, %zmm22 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm25 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm27 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm22, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm24, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm25, %zmm26 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 -; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm30 {%k2} -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm30 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm24, %zmm22 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm25 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512BW-NEXT: # zmm24 = 
mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm29 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm22, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm14, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm1 {%k2} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm22, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm30, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index 47f5ac1c5fc406..5ef8032fe2cc51 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -326,77 +326,77 @@ define void 
@load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm9 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa 112(%rdi), %xmm11 ; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = 
xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm7[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3] ; 
SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps %xmm6, (%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) ; SSE-NEXT: movaps %xmm4, (%rcx) -; SSE-NEXT: movaps %xmm8, (%r8) -; SSE-NEXT: movaps %xmm7, (%r9) +; SSE-NEXT: movaps %xmm10, (%r8) +; SSE-NEXT: movaps %xmm6, (%r9) ; SSE-NEXT: movaps %xmm11, (%r11) ; SSE-NEXT: movaps %xmm0, (%r10) -; SSE-NEXT: movaps %xmm9, (%rax) +; SSE-NEXT: movaps %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf8: @@ -535,26 +535,26 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm0, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpermt2d %xmm2, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm12 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,1,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] @@ -563,100 +563,100 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] ; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpermi2d %xmm1, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpermi2d %xmm0, %xmm1, %xmm6 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512F-SLOW-NEXT: vpermt2d %xmm3, %xmm15, %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%rsi) +; AVX512F-SLOW-NEXT: vpermt2d %xmm3, %xmm15, %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %xmm7, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa %xmm8, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %xmm9, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa %xmm10, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride8_vf8: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), 
%xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm13 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm7 +; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm6, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm2 -; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm15 +; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm14, %xmm15 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm15 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm0, %xmm15 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm15, %xmm13 +; AVX512F-FAST-NEXT: 
vpblendd {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm12 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm3, %xmm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX512F-FAST-NEXT: vpermi2d %xmm4, %xmm8, %xmm14 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512F-FAST-NEXT: vpermi2d %xmm4, %xmm5, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm7, %xmm9 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512F-FAST-NEXT: vpermi2d %xmm4, %xmm5, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm10, %xmm7 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm15, %xmm8 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa %xmm2, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %xmm11, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %xmm3, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %xmm2, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %xmm5, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %xmm6, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa %xmm8, (%rax) +; AVX512F-FAST-NEXT: vmovdqa %xmm9, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rax) +; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-FAST-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa %xmm4, (%rax) +; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rax) ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride8_vf8: @@ -715,51 +715,51 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i16_stride8_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $152, %rsp +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm3 ; SSE-NEXT: movdqa 240(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm12 -; SSE-NEXT: movdqa 144(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm10 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm5 ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 @@ -768,144 +768,140 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = 
xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[2,3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd (%rsp), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm11[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm13[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps %xmm8, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; 
SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movaps %xmm1, (%r9) -; SSE-NEXT: movapd %xmm12, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm15, 16(%r8) +; SSE-NEXT: movaps %xmm7, (%r9) +; SSE-NEXT: movapd %xmm4, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, (%rax) -; SSE-NEXT: movaps %xmm4, 16(%rax) +; SSE-NEXT: movaps %xmm9, (%rax) +; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm6, (%rax) -; SSE-NEXT: movapd %xmm9, 16(%rax) +; SSE-NEXT: movapd %xmm11, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm14, (%rax) -; SSE-NEXT: addq $168, %rsp +; SSE-NEXT: movaps %xmm14, 16(%rax) +; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf16: @@ -920,8 +916,8 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -933,8 +929,8 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] @@ -942,140 +938,140 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), 
%xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,1] -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3],xmm11[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm6[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: addq $152, %rsp @@ -1084,11 +1080,11 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $264, %rsp # imm = 0x108 +; AVX2-ONLY-NEXT: subq $232, %rsp ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 @@ -1097,51 +1093,52 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm0 ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm9, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 ; 
AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm5[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm3[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm2[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm15 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,0,2] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 
= ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4],ymm2[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd $15, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] @@ -1149,32 +1146,32 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -1182,82 +1179,81 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm5, %xmm2 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm2 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded 
Reload ; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5,6],ymm8[7] -; AVX2-ONLY-NEXT: vpermq $212, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,1,3] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 
= ymm10[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = 
ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) @@ -1269,174 +1265,174 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-ONLY-NEXT: addq $232, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf16: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm30 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm7, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm18 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 
+; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm25 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm29 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm15 +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm15[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm17 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm17[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm16[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm18[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm23 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm16 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm20[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm21[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm23[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5],ymm10[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: 
vmovdqa64 %ymm2, %ymm28 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm23[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm16, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm2 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpermi2d %xmm1, %xmm2, %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpermi2d %xmm2, %xmm4, %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3] +; 
AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm17[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm18[0,1,1,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm20[0,1,1,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm21[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm15[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} 
ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm17[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpermt2d %xmm7, %xmm16, %xmm6 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, (%rdx) +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; 
AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa %ymm10, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1448,7 +1444,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm27 ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 @@ -1457,161 +1453,161 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm13 -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm7, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3] +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm8, %xmm7, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm16 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm18[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4] -; 
AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] -; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm19 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm21 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm21[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm19[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm23[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = 
xmm12[2],xmm8[2],xmm12[3],xmm8[3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm12, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm8, %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm18, %xmm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: 
vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm4, %xmm7 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm4, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm17[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[0,1,2,0,4,5,6,4] +; 
AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm21[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm19[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm5, %xmm15 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm4, %xmm12 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm4, %xmm12 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[3,1,2,3,7,5,6,7] ; 
AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm18, %xmm5 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%r9) -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1620,66 +1616,66 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = 
ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) ; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) ; AVX512BW-NEXT: vmovdqa %ymm9, (%r11) ; AVX512BW-NEXT: vmovdqa %ymm10, (%r10) -; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <128 x i16>, ptr %in.vec, align 64 @@ -1705,468 +1701,464 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i16_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $696, %rsp # imm = 0x2B8 +; SSE-NEXT: subq $664, %rsp # imm = 0x298 ; SSE-NEXT: movdqa 496(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: movdqa %xmm13, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa 448(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa 384(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm10 +; SSE-NEXT: movdqa 288(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 256(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa 96(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; 
SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} 
xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movapd %xmm3, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} 
xmm2 = xmm2[0,1],xmm14[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = 
xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: movaps %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movapd %xmm7, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movapd %xmm6, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%r9) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte 
Reload +; SSE-NEXT: movaps %xmm5, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 48(%r8) +; SSE-NEXT: movaps (%rsp), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rax) -; SSE-NEXT: movaps %xmm10, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, 48(%rax) -; SSE-NEXT: movapd %xmm12, 32(%rax) +; SSE-NEXT: movapd %xmm12, 48(%rax) +; SSE-NEXT: movapd %xmm8, 32(%rax) ; SSE-NEXT: movapd %xmm11, 16(%rax) -; SSE-NEXT: movaps %xmm14, (%rax) +; SSE-NEXT: movaps %xmm15, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 48(%rax) ; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps %xmm15, 16(%rax) +; SSE-NEXT: movaps %xmm3, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $696, %rsp # imm = 0x2B8 +; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $872, %rsp # imm = 0x368 +; AVX1-ONLY-NEXT: subq $904, %rsp # imm = 0x388 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 ; 
AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 @@ -2176,17 +2168,17 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] @@ -2194,13 +2186,12 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 @@ -2212,13 +2203,15 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 
384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -2254,7 +2247,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 @@ -2269,116 +2262,118 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm15[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: 
vpshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm6 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3,4,5],xmm6[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm14[2],mem[2],xmm14[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, 
%ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2387,73 +2382,73 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; 
AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -2462,88 +2457,93 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = 
xmm11[0],xmm12[0],xmm11[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm14[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = 
xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm7[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; 
AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload @@ -2553,16 +2553,16 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -2598,7 +2598,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $872, %rsp # imm = 0x368 +; AVX1-ONLY-NEXT: addq $904, %rsp # imm = 0x388 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2618,15 +2618,14 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm9 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 @@ -2644,24 +2643,23 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: 
vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 @@ -2687,148 +2685,144 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: 
vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm7[7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %xmm9, %xmm14 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm15[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; 
AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vmovdqa %xmm9, %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm12 -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa %xmm13, %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm15, %xmm8 +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, (%rsp), %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # 
xmm13 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2836,11 +2830,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -2852,112 +2844,117 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[0,1,2,0,4,5,6,4] 
+; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm11, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm10 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm11 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm10[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] @@ -2968,96 +2965,97 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = 
mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm13, %xmm0 +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm8, %xmm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3] +; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: 
vpshuflw {{.*#+}} ymm5 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-ONLY-NEXT: vzeroupper @@ -3065,81 +3063,83 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $616, %rsp # imm = 0x268 -; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: subq $536, %rsp # imm = 0x218 +; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm26 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vpermt2d %xmm3, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm18[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm1[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm20[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm1[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5],ymm9[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-SLOW-NEXT: movb $-64, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm3, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 @@ -3148,118 +3148,115 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm26 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm26[0,1,0,2] +; AVX512F-SLOW-NEXT: 
vmovdqa64 192(%rdi), %ymm25 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm25[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7] +; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm27 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm27[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm28 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1} -; 
AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm21[2],xmm5[3],xmm21[3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm23[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = 
ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm17[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpermt2d %xmm21, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm5[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm8, %xmm6, %xmm3 -; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpermt2d %xmm7, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm3[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -3273,43 +3270,43 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm30 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload ; 
AVX512F-SLOW-NEXT: # ymm19 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm19[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm21 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm29 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm22 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3317,145 +3314,142 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512F-SLOW-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; 
AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm24 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm26[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm17[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm25[0,1,1,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm25 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm27[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm28[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm26[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm9 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm18[0],xmm16[0],xmm18[1],xmm16[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = 
ymm24[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm26[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm8, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm25[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm16[2,2,2,2] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vpermt2d %xmm28, %xmm10, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm8 +; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm18[2],xmm16[2],xmm18[3],xmm16[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rcx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r8) +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: addq $616, %rsp # imm = 0x268 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-SLOW-NEXT: addq $536, %rsp # imm = 0x218 ; AVX512F-SLOW-NEXT: vzeroupper ; 
AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride8_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512F-FAST-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm1 @@ -3465,37 +3459,38 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm3 ; AVX512F-FAST-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm11 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 ; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm1[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3503,197 +3498,200 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: movb $-64, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1} ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm8, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm26 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm26[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm27 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm27[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = 
ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm11 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1] +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm6 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm22[0],xmm11[1],xmm22[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm9 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm14[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm11, %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX512F-FAST-NEXT: vpblendd 
{{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm22[2],xmm11[3],xmm22[3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm22, %xmm0, %xmm2 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm7 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm25, %xmm0, %xmm7 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = 
ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm22, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm19[2],xmm12[3],xmm19[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm4 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm5 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm8, %xmm0, %xmm5 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm16[2],xmm3[3],xmm16[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm31[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm27 = [3,7,3,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm30 = [3,7,3,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm30, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm9[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: 
vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm30, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm27, %xmm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # 
xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm1 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] -; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm6 +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm23 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm20 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm29 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm26 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm30 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm25 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7] -; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm21 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm6 {%k1} ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -3701,138 +3699,138 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm20 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm19 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm21 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = 
ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,5,1,5] -; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm18 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm18[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm17 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm26[0,1,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm24[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [1,5,1,5] +; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm6, %xmm0 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} 
-; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm3 -; AVX512F-FAST-NEXT: vpermi2d %xmm16, %xmm7, %xmm3 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vpermi2d %xmm22, %xmm8, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm16[0],xmm31[0],xmm16[1],xmm31[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm0, %xmm8 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; 
AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm3 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm26 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm3 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm19[2],xmm4[3],xmm19[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpermi2d %xmm22, %xmm17, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpermi2d %xmm31, %xmm16, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm8[2],xmm22[2],xmm8[3],xmm22[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm17[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm24[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm27, %xmm5 -; AVX512F-FAST-NEXT: vpblendd 
{{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm27[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm30, %xmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %xmm22, %xmm30, %xmm8 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm16[2],xmm31[2],xmm16[3],xmm31[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm27, %xmm7 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rsi) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rdx) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-FAST-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512F-FAST-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -3843,24 +3841,24 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm10 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 @@ -3868,12 +3866,12 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm10 +; 
AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm10 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 @@ -3881,12 +3879,12 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm11 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 @@ -3894,12 +3892,12 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm12 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 @@ -3907,12 +3905,12 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm13 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm13 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 @@ -3920,12 +3918,12 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm14 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 @@ -3933,26 +3931,26 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm15 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm15 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm15, %zmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) @@ -3986,7 +3984,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i16_stride8_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1800, %rsp # imm = 0x708 +; SSE-NEXT: subq $1768, %rsp # imm = 0x6E8 ; SSE-NEXT: movdqa 752(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 736(%rdi), %xmm3 @@ -3995,92 +3993,91 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm6 +; SSE-NEXT: movdqa 144(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa 176(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm9 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 720(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 704(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 704(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 688(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm0 +; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm2 +; SSE-NEXT: movdqa 672(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 624(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 608(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 608(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE-NEXT: movdqa 592(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 576(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 560(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa 560(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 544(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa 528(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} 
xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 496(%rdi), %xmm0 @@ -4091,27 +4088,27 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 432(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm0 +; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 416(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa 384(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1008(%rdi), %xmm0 @@ -4122,26 +4119,27 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa 960(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 960(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 944(%rdi), %xmm2 +; SSE-NEXT: movdqa 944(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 928(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 928(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 912(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 912(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 896(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4160,20 +4158,20 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4185,112 +4183,104 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 848(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 832(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 816(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm0 +; SSE-NEXT: movdqa 816(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 800(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 784(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa 768(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm13 
; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; 
SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4308,88 +4298,87 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movapd %xmm4, %xmm0 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 
16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movapd %xmm7, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -4402,19 +4391,18 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 
= xmm1[0,1],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -4426,22 +4414,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -4453,19 +4440,42 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4475,41 +4485,19 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4519,15 +4507,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; 
SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -4536,23 +4525,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4562,45 +4553,42 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4612,8 +4600,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4627,65 +4616,65 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4694,115 +4683,119 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 
+; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd $170, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movapd %xmm13, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movapd %xmm14, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movapd %xmm11, %xmm0 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 
= xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] @@ -4810,13 +4803,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -4828,36 +4820,31 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = 
xmm13[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -4939,45 +4926,47 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: 
movaps %xmm3, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm9, 112(%rax) -; SSE-NEXT: movapd %xmm10, 96(%rax) -; SSE-NEXT: movapd %xmm11, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, 16(%rax) +; SSE-NEXT: movapd %xmm8, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rax) ; SSE-NEXT: movaps %xmm7, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm1, 112(%rax) +; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps %xmm2, 96(%rax) -; SSE-NEXT: movaps %xmm3, 80(%rax) +; SSE-NEXT: movaps %xmm13, 80(%rax) ; SSE-NEXT: movaps %xmm4, 64(%rax) ; SSE-NEXT: movaps %xmm5, 48(%rax) ; SSE-NEXT: movaps %xmm6, 32(%rax) -; SSE-NEXT: movaps %xmm8, 16(%rax) -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $1800, %rsp # imm = 0x708 +; SSE-NEXT: movaps %xmm12, 16(%rax) +; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: addq $1768, %rsp # imm = 0x6E8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf64: @@ -4987,20 +4976,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 
= xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 @@ -5013,30 +5003,29 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -5067,15 +5056,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm3 @@ -5088,15 +5078,15 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 928(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 896(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -5175,16 +5165,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -5200,7 +5190,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5212,54 +5202,56 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm9[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm11[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 @@ -5286,70 +5278,71 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] @@ -5358,17 +5351,18 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5380,7 +5374,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm14[2],mem[2],xmm14[3],mem[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload @@ -5391,32 +5386,31 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -5427,8 +5421,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] @@ -5436,67 +5430,71 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # 
xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5504,12 +5502,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload @@ -5522,7 +5520,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -5539,7 +5537,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5556,55 +5553,57 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5613,238 +5612,233 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm3[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: 
vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm12[2],mem[2],xmm12[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = 
xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm13[2],mem[2],xmm13[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -5906,7 +5900,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX1-ONLY-NEXT: vzeroupper @@ -5914,7 +5908,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2408, %rsp # imm = 0x968 +; AVX2-ONLY-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 @@ -5928,7 +5922,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 @@ -6025,8 +6019,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7] @@ -6061,7 +6055,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 @@ -6084,13 +6078,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 624(%rdi), %xmm4 @@ -6102,23 +6096,23 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vmovdqa 592(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 560(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3] @@ -6130,18 +6124,18 @@ 
define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm7[7] ; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6157,8 +6151,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm7[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] @@ -6187,20 +6181,20 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -6210,10 +6204,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] @@ -6233,7 +6227,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] @@ -6271,13 +6265,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: 
vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -6287,28 +6280,28 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] @@ -6317,12 +6310,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] @@ -6330,7 +6323,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] @@ -6363,8 +6356,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] @@ -6376,55 +6368,52 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw 
{{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, 
%xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm5, %xmm0 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, %xmm8 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] @@ -6432,9 +6421,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] @@ -6462,7 +6450,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -6482,10 +6470,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd 
{{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] @@ -6493,10 +6483,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6515,7 +6506,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -6528,10 +6519,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = 
ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] @@ -6539,18 +6532,18 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -6560,118 +6553,126 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd 
%xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; 
AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm8, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw 
{{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte 
Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] @@ -6685,20 +6686,18 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; 
AVX2-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] @@ -6707,90 +6706,126 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm5[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; 
AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-ONLY-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; 
AVX2-ONLY-NEXT: # xmm2 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 
= xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -6802,111 +6837,76 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm15, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-ONLY-NEXT: addq $2408, %rsp # imm = 0x968 +; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%rax) +; AVX2-ONLY-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $2408, %rsp # imm = 0x968 +; AVX512F-SLOW-NEXT: subq 
$2392, %rsp # imm = 0x958 ; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm0 @@ -6917,7 +6917,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] @@ -6925,7 +6925,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 @@ -6938,7 +6939,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 @@ -7003,37 +7004,37 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm3[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm3[0,1,0,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm3[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm3[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -7052,12 +7053,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm26 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm6, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa 816(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 800(%rdi), %xmm2 @@ -7069,294 +7069,298 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: 
vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa 992(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa 992(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 960(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa 928(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 896(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa 928(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 896(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 
624(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 592(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 560(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 624(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 592(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm29[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-SLOW-NEXT: vmovdqa 672(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 640(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 560(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %xmm4 +; 
AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm0[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa 672(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm4[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm4[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm17[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm15[1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = xmm6[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] +; 
AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm16[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm11 = xmm11[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm27[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm25[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] 
+; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm17[2],xmm15[2],xmm17[3],xmm15[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm22 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = 
ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm22 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm16[2],xmm13[2],xmm16[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm26 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm24 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm27[2],xmm24[2],xmm27[3],xmm24[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm29 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = 
ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512F-SLOW-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm25[2],xmm5[2],xmm25[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm3 = xmm0[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm31[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm22, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm15 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm1[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm3 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm22, %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm26, %xmm15, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-SLOW-NEXT: vpermt2d %xmm24, %xmm16, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vpermt2d %xmm29, %xmm15, %xmm0 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm18, %xmm16, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm15[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm23, %xmm15, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] @@ -7367,30 +7371,30 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm10, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX512F-SLOW-NEXT: 
vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm8, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3] @@ -7433,7 +7437,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm8, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -7455,17 +7459,18 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm30 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3] ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw 
{{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -7480,10 +7485,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm8, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -7491,8 +7496,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12 ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -7506,36 +7511,35 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm26 = mem[0,1,1,3] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm29 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = 
ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm10, %xmm1 -; AVX512F-SLOW-NEXT: vpermi2d %xmm2, %xmm8, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm31 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vpermi2d %xmm2, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm19 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3] @@ -7544,28 +7548,27 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm28 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload @@ -7583,27 +7586,27 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1} ; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm13 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] -; 
AVX512F-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm10 = xmm10[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm17[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] @@ -7613,239 +7616,236 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm6 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm16[0],xmm30[0],xmm16[1],xmm30[1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm15[2],xmm31[3],xmm15[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm27 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm19 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; 
AVX512F-SLOW-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm23 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm21 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm30 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm17[2],xmm18[2],xmm17[3],xmm18[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm25 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm18 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = 
ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm30[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm29 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm28 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] +; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-SLOW-NEXT: vpermt2d %xmm27, %xmm18, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-SLOW-NEXT: vpermt2d %xmm21, %xmm18, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512F-SLOW-NEXT: vpermt2d %xmm23, %xmm16, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = xmm3[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512F-SLOW-NEXT: vpermt2d %xmm30, %xmm16, 
%xmm3 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = xmm3[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm25, %xmm18, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm29, %xmm16, %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %xmm29, %xmm18, %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm26[2],xmm30[2],xmm26[3],xmm30[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps 
%zmm3, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rcx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%r8) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%r8) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%r9) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%r9) +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r8) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rax) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rax) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-SLOW-NEXT: 
vmovdqa64 %zmm1, (%rax) -; AVX512F-SLOW-NEXT: addq $2408, %rsp # imm = 0x968 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-SLOW-NEXT: addq $2392, %rsp # imm = 0x958 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512F-FAST-NEXT: subq $2520, %rsp # imm = 0x9D8 ; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm5 ; AVX512F-FAST-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 @@ -7893,10 +7893,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm5, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 @@ -7906,10 +7905,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX512F-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7919,24 +7918,27 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} 
ymm3 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -7946,276 +7948,278 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 864(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 848(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm2 -; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512F-FAST-NEXT: vmovdqa 816(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 816(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 800(%rdi), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa 784(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqa 784(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 768(%rdi), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512F-FAST-NEXT: vpblendd 
{{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm10 ; AVX512F-FAST-NEXT: vmovdqa 992(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 960(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa 928(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 896(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm2[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm10 {%k1} ; AVX512F-FAST-NEXT: vmovdqa 624(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa 592(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa 592(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512F-FAST-NEXT: vpermt2d %xmm11, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm16 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 ; AVX512F-FAST-NEXT: vmovdqa 560(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa 528(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm21 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm26 +; 
AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm4[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa 672(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-FAST-NEXT: vmovdqa 672(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm30 -; AVX512F-FAST-NEXT: vmovdqa 
%xmm9, %xmm13 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] -; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm1, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-FAST-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7] ; AVX512F-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm13, %xmm9 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] 
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm11 -; AVX512F-FAST-NEXT: vpermt2d %xmm25, %xmm13, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm8 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm19 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-FAST-NEXT: vpermt2d %xmm21, %xmm13, %xmm3 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512F-FAST-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %xmm0, %xmm1, %xmm10 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm21[0],xmm19[0],xmm21[1],xmm19[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7] +; AVX512F-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = 
mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm10 +; AVX512F-FAST-NEXT: vpermt2d %xmm31, %xmm13, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm29[0],xmm14[0],xmm29[1],xmm14[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm13, %xmm3 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm20[0],xmm23[0],xmm20[1],xmm23[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm26 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm24, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm28 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa %xmm8, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm9, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm6 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm16[2],xmm17[3],xmm16[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm5, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm23 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: 
vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm21[2],xmm19[2],xmm21[3],xmm19[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm6, %xmm2 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm22[2],xmm0[2],xmm22[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm8, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm29[2],xmm14[2],xmm29[3],xmm14[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm29 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm24[2],xmm10[2],xmm24[3],xmm10[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm19 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm20[2],xmm26[2],xmm20[3],xmm26[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm26, %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm14[2],xmm28[2],xmm14[3],xmm28[3] +; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] @@ -8235,55 +8239,56 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm0 = xmm1[0,1],mem[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm15 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm23, %xmm16, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm21, %xmm16, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm16, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm16, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm21, %xmm16, %xmm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm16, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] @@ -8295,24 +8300,23 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm15 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4] -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,4,0,4] +; AVX512F-FAST-NEXT: vpermt2d %xmm15, %xmm11, %xmm0 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -8337,31 +8341,30 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm9 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm5, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm18 +; 
AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm11, %xmm1 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3] @@ -8372,9 +8375,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3] @@ -8382,47 +8385,49 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm31 -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512F-FAST-NEXT: vpermt2d %xmm1, %xmm11, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm17 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm8 ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] @@ -8430,13 +8435,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} 
ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1} ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -8444,157 +8449,163 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm28 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm30 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4] -; 
AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] -; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm14, %xmm0 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] +; AVX512F-FAST-NEXT: 
vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] +; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm12 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm8 +; AVX512F-FAST-NEXT: vpermt2d %xmm21, %xmm14, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm9[0],xmm16[0],xmm9[1],xmm16[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-FAST-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] ; AVX512F-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm9 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm24, %xmm15, %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] -; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm13 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = 
ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm25, %xmm15, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm21 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm20 +; AVX512F-FAST-NEXT: vpermt2d %xmm29, %xmm14, %xmm0 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm19[0],xmm17[0],xmm19[1],xmm17[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm8 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm6 -; AVX512F-FAST-NEXT: vpermi2d %xmm28, %xmm30, %xmm6 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm27 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm29 +; AVX512F-FAST-NEXT: vpermi2d %xmm23, %xmm25, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm26 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm31[0],xmm14[0],xmm31[1],xmm14[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm29, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm15, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm18[2],xmm24[3],xmm18[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm31 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: 
# ymm4 = mem[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm5, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm18 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm22[2],xmm16[2],xmm22[3],xmm16[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm28[2],xmm21[2],xmm28[3],xmm21[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm23 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] @@ -8603,110 +8614,107 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm13, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload -; 
AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm19[2],xmm17[2],xmm19[3],xmm17[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm21 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm27 -; AVX512F-FAST-NEXT: vpermi2d %xmm7, %xmm17, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpermi2d %xmm14, %xmm26, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm19 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm27[2],xmm29[3],xmm27[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] -; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm1 = xmm0[0,1],mem[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = 
ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm17, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm23, %xmm17, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm22, %xmm17, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm17, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] -; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm4 -; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm17, %xmm4 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512F-FAST-NEXT: vpermt2d %xmm27, %xmm17, %xmm6 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm26[2],xmm19[2],xmm26[3],xmm19[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%rsi) @@ -8740,1131 +8748,1095 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-FAST-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512F-FAST-NEXT: addq $2520, %rsp # imm = 0x9D8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i16_stride8_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 640(%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm27, %zmm9, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm8, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm18, %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm26, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w 
%zmm20, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm27, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm11, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm9, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm16, %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm16, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm16, %zmm10, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm16, %zmm9, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm16, %zmm14, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm16, %zmm1, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm1, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm11, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm12, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm10, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm9, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm18, %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm26, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm27, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; 
AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm26, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm27, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: 
vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm26, %zmm15, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm27, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm27, %zmm8, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm27, %zmm12, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm27, %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm26, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm26, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm26, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm26, %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} 
ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd 
$15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i16_stride8_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm27, %zmm9, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm8, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm18, 
%zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm26, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm27, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm11, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm9, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = 
[6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w 
%zmm5, %zmm10, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm16, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm16, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm16, %zmm10, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm16, %zmm9, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm16, %zmm14, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm16, %zmm1, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm1, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm11, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm12, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm10, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm9, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm14, %zmm2 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm18, %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm26, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm27, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm26, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm27, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm26, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm27, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: 
vpermt2w %zmm27, %zmm8, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm27, %zmm12, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm27, %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm26, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm26, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm26, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm26, %zmm16, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, 
%ymm10, %zmm30, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, (%rsi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm4, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i16_stride8_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), 
%zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm8, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 ; AVX512DQBW-SLOW-NEXT: movb $-64, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm27, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm8, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm18, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm26, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm8, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm27, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; 
AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm11, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; 
AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm1, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm16, %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm16, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm16, %zmm10, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm16, %zmm9, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm16, %zmm14, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm16, %zmm1, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm1, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm11, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm12, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm10, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm9, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm18, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm18, %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm26, 
%zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm27, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm26, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm27, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm26, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm27, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm27, %zmm8, %zmm11 ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm27, %zmm12, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm27, %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm26, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm26, %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm26, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm26, %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: load_i16_stride8_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 128(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm8, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 ; AVX512DQBW-FAST-NEXT: movb $-64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm27, %zmm9, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm8, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm8, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: 
vpermi2w %zmm18, %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm26, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm8, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, 
%zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm27, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm11, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm9, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm1 ; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: 
vpermi2w %zmm28, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm1, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm16, %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm16, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm16, %zmm10, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm16, %zmm9, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm16, %zmm14, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm16, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm1, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm11, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm12, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm10, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm9, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQBW-FAST-NEXT: 
vpermt2w %zmm15, %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm18, %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm18, %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm26, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm27, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm26, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm27, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm26, %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm27, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm27, %zmm8, %zmm11 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = 
mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm27, %zmm12, %zmm15 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm27, %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm26, %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm26, %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm26, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm26, %zmm16, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; 
AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-FAST-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm12, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-FAST-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <512 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index 85b318556a5b90..85a8c5e50354a3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -249,54 +249,54 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i32_stride2_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps 208(%rdi), %xmm9 ; SSE-NEXT: movaps 192(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm13 +; SSE-NEXT: movaps 80(%rdi), %xmm10 ; SSE-NEXT: movaps 64(%rdi), %xmm2 ; SSE-NEXT: movaps 240(%rdi), %xmm11 ; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm14 +; SSE-NEXT: movaps 112(%rdi), %xmm12 ; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm13 ; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm15 +; SSE-NEXT: movaps 176(%rdi), %xmm14 ; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm12[1,3] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm10[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm10[1,3] +; SSE-NEXT: movaps %xmm7, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm14[1,3] -; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm14[1,3] +; SSE-NEXT: movaps %xmm6, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm13[1,3] -; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm15[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm15[1,3] -; SSE-NEXT: movaps %xmm6, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3] -; SSE-NEXT: movaps %xmm5, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm11[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm13[1,3] +; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm11[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm11[1,3] ; SSE-NEXT: movaps %xmm3, %xmm11 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm9[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm9[1,3] -; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movaps %xmm0, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm8[1,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 
16(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3] ; SSE-NEXT: movaps %xmm11, 96(%rsi) -; SSE-NEXT: movaps %xmm14, 32(%rsi) -; SSE-NEXT: movaps %xmm12, 112(%rsi) -; SSE-NEXT: movaps %xmm10, 48(%rsi) -; SSE-NEXT: movaps %xmm15, 64(%rsi) -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm13, 80(%rsi) +; SSE-NEXT: movaps %xmm12, 32(%rsi) +; SSE-NEXT: movaps %xmm13, 112(%rsi) +; SSE-NEXT: movaps %xmm15, 48(%rsi) +; SSE-NEXT: movaps %xmm14, 64(%rsi) +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm10, 80(%rsi) ; SSE-NEXT: movaps %xmm9, 16(%rsi) ; SSE-NEXT: movaps %xmm3, 96(%rdx) ; SSE-NEXT: movaps %xmm5, 112(%rdx) @@ -304,7 +304,7 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm7, 80(%rdx) ; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: retq @@ -410,77 +410,77 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i32_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm11 -; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 208(%rdi), %xmm0 +; SSE-NEXT: movaps 192(%rdi), %xmm9 ; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 240(%rdi), %xmm14 -; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 64(%rdi), %xmm8 +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps 224(%rdi), %xmm11 ; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm12 -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdi), %xmm9 -; SSE-NEXT: movaps 304(%rdi), %xmm0 -; SSE-NEXT: movaps 288(%rdi), %xmm13 -; SSE-NEXT: movaps 176(%rdi), %xmm4 -; SSE-NEXT: movaps 160(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: movaps 96(%rdi), %xmm10 +; SSE-NEXT: movaps 272(%rdi), %xmm4 +; SSE-NEXT: movaps 144(%rdi), %xmm5 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; SSE-NEXT: movaps 304(%rdi), %xmm6 +; SSE-NEXT: movaps 288(%rdi), %xmm14 +; SSE-NEXT: movaps 176(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm10, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm3[0,2] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm3[1,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm1[1,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,3],xmm3[1,3] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps 
{{.*#+}} xmm8 = xmm8[1,3],xmm1[1,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm2[1,3] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm7[1,3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm14[1,3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,3],xmm5[1,3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm11[1,3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,3],xmm2[1,3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm0[1,3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,3],xmm6[1,3] +; SSE-NEXT: movaps %xmm14, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm12[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 352(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,3],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,3],xmm0[1,3] ; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3] +; SSE-NEXT: movaps 320(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,3],xmm0[1,3] ; SSE-NEXT: movaps 432(%rdi), %xmm0 ; SSE-NEXT: movaps 416(%rdi), %xmm12 -; 
SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,3],xmm0[1,3] ; SSE-NEXT: movaps 400(%rdi), %xmm0 ; SSE-NEXT: movaps 384(%rdi), %xmm9 @@ -489,31 +489,31 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm0[1,3] ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 480(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm0[1,3] -; SSE-NEXT: movaps 464(%rdi), %xmm1 -; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm1[1,3] +; SSE-NEXT: movaps 464(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[1,3] ; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm1[1,3] -; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm0[1,3] +; SSE-NEXT: movaps (%rdi), %xmm6 ; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm0[1,3] -; SSE-NEXT: movaps %xmm2, 224(%rsi) -; SSE-NEXT: movaps %xmm11, 160(%rsi) +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm0[1,3] +; SSE-NEXT: movaps %xmm1, 224(%rsi) +; SSE-NEXT: movaps %xmm15, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm6, 240(%rsi) +; SSE-NEXT: movaps %xmm3, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -525,22 +525,22 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 128(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm14, 208(%rsi) +; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: movaps %xmm13, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movaps %xmm3, 224(%rdx) +; SSE-NEXT: movaps %xmm2, 224(%rdx) ; SSE-NEXT: movaps %xmm7, 240(%rdx) ; SSE-NEXT: movaps %xmm9, 192(%rdx) ; SSE-NEXT: movaps %xmm12, 208(%rdx) -; SSE-NEXT: movaps %xmm13, 160(%rdx) -; SSE-NEXT: movaps %xmm15, 176(%rdx) +; SSE-NEXT: movaps %xmm11, 160(%rdx) +; 
SSE-NEXT: movaps %xmm14, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rdx) @@ -554,7 +554,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm6, (%rdx) ; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq @@ -562,131 +562,131 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX1-ONLY-LABEL: load_i32_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2],ymm8[0,2],ymm10[4,6],ymm8[4,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2],ymm8[0,2],ymm9[4,6],ymm8[4,6] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm7[0,2],ymm11[4,6],ymm7[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,3],ymm7[1,3],ymm11[5,7],ymm7[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,3],ymm12[1,3],ymm13[5,7],ymm12[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm13 = ymm7[0,2],ymm12[0,2],ymm7[4,6],ymm12[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,3],ymm10[1,3],ymm11[5,7],ymm10[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3],ymm12[1,3],ymm7[5,7],ymm12[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm5[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,3],ymm8[1,3],ymm10[5,7],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[0,2],ymm15[0,2],ymm4[4,6],ymm15[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm15[1,3],ymm4[5,7],ymm15[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,2],ymm13[0,2],ymm6[4,6],ymm13[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm13[1,3],ymm6[5,7],ymm13[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm11[0,2],ymm9[4,6],ymm11[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3],ymm11[1,3],ymm9[5,7],ymm11[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm11[0,2],ymm1[4,6],ymm11[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm11[1,3],ymm1[5,7],ymm11[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,3],ymm8[1,3],ymm9[5,7],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2],ymm14[0,2],ymm4[4,6],ymm14[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm14[1,3],ymm4[5,7],ymm14[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm5[0,2],ymm11[0,2],ymm5[4,6],ymm11[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3],ymm11[1,3],ymm5[5,7],ymm11[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm6[0,2],ymm10[0,2],ymm6[4,6],ymm10[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm10[1,3],ymm6[5,7],ymm10[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2],ymm10[0,2],ymm2[4,6],ymm10[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm10[1,3],ymm2[5,7],ymm10[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) 
+; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride2_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,2],ymm2[0,2],ymm13[4,6],ymm2[4,6] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,2],ymm0[0,2],ymm14[4,6],ymm0[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,3],ymm0[1,3],ymm14[5,7],ymm0[5,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,3],ymm2[1,3],ymm13[5,7],ymm2[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm5[0,2],ymm15[4,6],ymm5[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,3],ymm5[1,3],ymm15[5,7],ymm5[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,2],ymm6[0,2],ymm14[4,6],ymm6[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,3],ymm6[1,3],ymm14[5,7],ymm6[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm10[0,2],ymm12[4,6],ymm10[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,3],ymm10[1,3],ymm12[5,7],ymm10[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,2],ymm8[0,2],ymm11[4,6],ymm8[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,3],ymm8[1,3],ymm11[5,7],ymm8[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2],ymm7[0,2],ymm9[4,6],ymm7[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,3],ymm7[1,3],ymm9[5,7],ymm7[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm4[0,2],ymm1[4,6],ymm4[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm4[1,3],ymm1[5,7],ymm4[5,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3] -; AVX2-ONLY-NEXT: 
vmovaps %ymm4, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[0,2],ymm1[0,2],ymm13[4,6],ymm1[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,3],ymm1[1,3],ymm13[5,7],ymm1[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2],ymm2[0,2],ymm12[4,6],ymm2[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3],ymm2[1,3],ymm12[5,7],ymm2[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm7[0,2],ymm6[0,2],ymm7[4,6],ymm6[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,3],ymm6[1,3],ymm7[5,7],ymm6[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,2],ymm8[0,2],ymm9[4,6],ymm8[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,3],ymm8[1,3],ymm9[5,7],ymm8[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,3],ymm10[1,3],ymm11[5,7],ymm10[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm3[0,2],ymm5[0,2],ymm3[4,6],ymm5[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm5[1,3],ymm3[5,7],ymm5[5,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2],ymm5[0,2],ymm4[4,6],ymm5[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm5[1,3],ymm4[5,7],ymm5[5,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded 
Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index e812fb903a808e..5e2a5dff4578ef 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -185,40 +185,40 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: movaps 64(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm3 ; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm5[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3] +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm3[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm10[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[0,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm7, 16(%rsi) -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,3] +; SSE-NEXT: 
movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps %xmm6, (%rsi) +; SSE-NEXT: movaps %xmm5, 16(%rdx) +; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm6, (%rcx) +; SSE-NEXT: movaps %xmm8, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf8: @@ -371,99 +371,98 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps 144(%rdi), %xmm11 -; SSE-NEXT: movaps 176(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm15 -; SSE-NEXT: movaps 80(%rdi), %xmm14 -; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps 112(%rdi), %xmm9 +; SSE-NEXT: movaps 144(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm4 +; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm15, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[0,0] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm9[0,0] -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm8[0,0] -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm12[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movaps 48(%rdi), %xmm14 +; SSE-NEXT: movaps 80(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm13[1,0] +; SSE-NEXT: movaps %xmm14, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm5[0,2] +; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm15[0,0] +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm5[0,2] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm6[0,0] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm9[0,0] +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = 
xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] -; SSE-NEXT: movaps %xmm5, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm6, 48(%rdx) -; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rdx) -; SSE-NEXT: movaps %xmm4, 32(%rcx) -; SSE-NEXT: movaps %xmm8, 48(%rcx) -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,3] +; SSE-NEXT: movaps %xmm8, 32(%rsi) +; SSE-NEXT: movaps %xmm11, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps %xmm7, 32(%rdx) +; SSE-NEXT: movaps %xmm5, 48(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm12, 16(%rdx) +; SSE-NEXT: movaps %xmm6, 32(%rcx) +; SSE-NEXT: movaps %xmm4, 48(%rcx) +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps %xmm14, 16(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf16: @@ -704,163 +703,160 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $392, %rsp # imm = 0x188 -; SSE-NEXT: movaps 192(%rdi), %xmm4 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 208(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm6 -; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm13 +; SSE-NEXT: subq $376, %rsp # imm = 0x178 +; SSE-NEXT: movaps 192(%rdi), %xmm3 +; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm11 +; SSE-NEXT: movaps 240(%rdi), %xmm6 +; SSE-NEXT: movaps 272(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: movaps (%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps 64(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm11 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 144(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 96(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[1,0] +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 288(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] +; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] +; SSE-NEXT: movaps 96(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm15 +; SSE-NEXT: movaps 304(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[1,0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} 
xmm14 = xmm14[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm9[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm0[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm15[0,0] -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm7[0,0] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] +; SSE-NEXT: movaps (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm13[0,0] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] +; 
SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm12[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[0,0] +; SSE-NEXT: movaps %xmm5, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm12[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm15[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # 
xmm15 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload @@ -868,195 +864,197 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps %xmm11, 32(%rdx) -; SSE-NEXT: movaps %xmm7, 112(%rdx) +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps 
%xmm11, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps %xmm2, 96(%rdx) +; SSE-NEXT: movaps %xmm10, 32(%rdx) +; SSE-NEXT: movaps %xmm4, 112(%rdx) ; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm10, 64(%rdx) +; SSE-NEXT: movaps %xmm6, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps %xmm14, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm0, 96(%rcx) -; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps %xmm15, 64(%rcx) +; SSE-NEXT: movaps %xmm15, 96(%rcx) +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rcx) ; SSE-NEXT: movaps %xmm13, 80(%rcx) -; SSE-NEXT: movaps %xmm4, 32(%rcx) -; SSE-NEXT: movaps %xmm6, 48(%rcx) -; SSE-NEXT: movaps %xmm5, (%rcx) -; SSE-NEXT: movaps %xmm8, 16(%rcx) -; SSE-NEXT: addq $392, %rsp # imm = 0x188 +; SSE-NEXT: movaps %xmm5, 32(%rcx) +; SSE-NEXT: movaps %xmm7, 48(%rcx) +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps %xmm12, 16(%rcx) +; SSE-NEXT: addq $376, %rsp # imm = 0x178 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = 
ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm8[2,0],ymm5[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm12[1],ymm7[2,3],ymm12[4],ymm7[5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm12[1,3],ymm1[6,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm4[2,0],ymm2[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2,3],ymm15[4],ymm3[5,6],ymm15[7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps 
64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm11, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm11[3,0],ymm10[6,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,0],ymm8[2,0],ymm11[4,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,0],ymm8[3,0],ymm5[6,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0],ymm2[2,0],ymm8[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[0,3],ymm6[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm14[3,0],ymm4[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0],ymm6[2,0],ymm14[4,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm13[0,3],ymm7[5,6],ymm13[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm14[3,0],ymm6[6,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,0],ymm2[2,0],ymm14[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm7[0,3],ymm10[5,6],ymm7[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm0[3,0],ymm7[6,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[3,0],ymm1[6,4],ymm0[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm11[0,3],ymm4[5,6],ymm11[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,0],ymm4[2,0],ymm12[4,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm11[0,3],ymm6[5,6],ymm11[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[3,0],ymm2[6,4],ymm4[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,0],ymm2[2,0],ymm4[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm13[1,0],ymm6[2,0],ymm13[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm9[0,1],mem[0,3],ymm9[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7] -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm10[0,3],ymm4[5,6],ymm10[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm4[2,0],ymm13[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm9[0,3],ymm4[6,4],ymm9[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1],mem[0,3],ymm8[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm5[2],ymm12[3,4],ymm5[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0],ymm8[2,0],ymm7[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,0],ymm12[0,3],ymm7[6,4],ymm12[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm3[2,0],ymm11[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm7[0,3],ymm0[4,5],ymm7[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,3],ymm0[4,5],ymm1[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,3],ymm1[6,4],ymm2[4,7] -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[0,3],ymm2[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1066,115 +1064,112 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $136, %rsp -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $104, %rsp +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm12 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm14, %ymm8 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpermps %ymm15, 
%ymm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm8 -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm1, %ymm12 
+; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm2[2],ymm13[3,4],ymm2[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm14[2],ymm10[3,4],ymm14[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm13[2],ymm2[3,4],ymm13[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 
96(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: addq $136, %rsp +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: addq $104, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1183,81 +1178,81 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: subq $104, %rsp ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; 
AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm9 -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = [0,1,0,3,0,1,4,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm13, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm13, %ymm4 +; 
AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm6[2],ymm13[3,4],ymm6[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [0,1,0,3,0,1,4,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload @@ -1267,123 +1262,120 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm8, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm10, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-NEXT: addq $104, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $136, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm12 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm14, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, 
%ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = 
ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm2[2],ymm13[3,4],ymm2[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm14[2],ymm10[3,4],ymm14[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm13[2],ymm2[3,4],ymm13[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), 
%ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: addq $136, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1436,46 +1428,46 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1112, %rsp # imm = 0x458 +; SSE-NEXT: subq $1080, %rsp # imm = 0x438 ; SSE-NEXT: movaps 624(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 656(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 432(%rdi), %xmm6 +; SSE-NEXT: movaps 640(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 432(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 464(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm5 +; SSE-NEXT: movaps 448(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm9 +; SSE-NEXT: movaps 256(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm3 ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 64(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] ; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] ; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] @@ -1494,8 +1486,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 208(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 192(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 416(%rdi), %xmm1 @@ -1516,11 +1509,11 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps 160(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1528,12 +1521,11 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 352(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 560(%rdi), %xmm1 @@ -1548,122 +1540,108 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 752(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] ; SSE-NEXT: movaps 720(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 112(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[1,0] -; SSE-NEXT: movaps 96(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm7 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps 304(%rdi), %xmm11 +; SSE-NEXT: movaps 128(%rdi), %xmm0 +; SSE-NEXT: movaps 112(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[1,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 512(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] -; SSE-NEXT: movaps 480(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm9 +; SSE-NEXT: movaps 304(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[1,0] +; SSE-NEXT: movaps 288(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 512(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[1,0] +; SSE-NEXT: movaps 480(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 
704(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 688(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[1,0] -; SSE-NEXT: movaps 672(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[1,0] +; SSE-NEXT: movaps 672(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm1[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm12[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm11[0,0] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 
= xmm15[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm15[0,2] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm11[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm8[0,0] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm9[2,3] +; SSE-NEXT: movaps %xmm9, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1673,16 +1651,21 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm14[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm14[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[0,0] -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1690,26 +1673,31 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[0,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm15, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm1[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] -; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm9[0,0] +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -1718,24 +1706,26 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[0,3] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; 
SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload @@ -1748,8 +1738,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0,1],mem[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -1757,12 +1746,12 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] @@ -1774,17 +1763,18 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = 
mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,3] @@ -1800,101 +1790,102 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] +; SSE-NEXT: pshufd $85, (%rsp), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm14, 224(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 240(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 192(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 160(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps 
%xmm4, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 128(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 144(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm12, 224(%rdx) +; SSE-NEXT: movaps %xmm14, 240(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 208(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 160(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 176(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 128(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 144(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: 
movaps %xmm5, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 16(%rdx) ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps %xmm1, 224(%rcx) ; SSE-NEXT: movaps %xmm2, 208(%rcx) ; SSE-NEXT: movaps %xmm3, 192(%rcx) -; SSE-NEXT: movaps %xmm5, 176(%rcx) +; SSE-NEXT: movaps %xmm4, 176(%rcx) ; SSE-NEXT: movaps %xmm6, 160(%rcx) ; SSE-NEXT: movaps %xmm7, 144(%rcx) ; SSE-NEXT: movaps %xmm8, 128(%rcx) ; SSE-NEXT: movaps %xmm9, 112(%rcx) ; SSE-NEXT: movaps %xmm10, 96(%rcx) ; SSE-NEXT: movaps %xmm11, 80(%rcx) -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps %xmm13, 48(%rcx) -; SSE-NEXT: movaps %xmm15, 32(%rcx) +; SSE-NEXT: movaps %xmm13, 64(%rcx) +; SSE-NEXT: movaps %xmm15, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: addq $1112, %rsp # imm = 0x458 +; SSE-NEXT: addq $1080, %rsp # imm = 0x438 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 +; AVX1-ONLY-NEXT: subq $1416, %rsp # imm = 0x588 ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1907,18 +1898,17 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1926,8 +1916,8 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm6[2,0],ymm5[5,4],ymm6[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1936,8 +1926,8 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm13[2,0],ymm2[5,4],ymm13[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1956,47 +1946,46 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm2[2,0],ymm5[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm4[2,0],ymm1[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm8[2,0],ymm1[5,4],ymm8[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2007,44 +1996,45 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm11[2,0],ymm2[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[3,0],ymm0[6,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,0],ymm0[2,0],ymm9[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm0[0,3],ymm1[5,6],ymm0[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[3,0],ymm0[6,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,0],ymm0[2,0],ymm6[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[3,0],ymm0[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[2,0],ymm3[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[3,0],ymm0[6,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,0],ymm0[2,0],ymm13[4,4],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2067,159 +2057,162 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[2,0],ymm2[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm4[0,3],ymm2[5,6],ymm4[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm3[0,3],ymm2[5,6],ymm3[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm5[0,3],ymm3[5,6],ymm5[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[2,0],ymm4[3,0],ymm9[6,4],ymm4[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0],ymm0[2,0],ymm4[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm4[0,3],ymm6[5,6],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm9[3,0],ymm8[6,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm2[2,0],ymm9[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm8[3,0],ymm2[6,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,0],ymm2[2,0],ymm8[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm8[0,3],ymm15[5,6],ymm8[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm3[2,0],ymm11[4,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm9[0,3],ymm14[5,6],ymm9[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[0,3],ymm14[6,4],ymm15[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[2,0],ymm7[3,0],ymm14[6,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[0,0],ymm6[2,0],ymm7[4,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,2],ymm14[0,3],ymm13[5,6],ymm14[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm13[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm13[2,0],ymm7[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm15[0,3],ymm13[6,4],ymm15[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps $196, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[0,3],ymm0[6,4],ymm7[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[0,3],ymm0[6,4],ymm14[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[0,3],ymm0[6,4],ymm6[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm13[2,0],ymm3[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[0,3],ymm1[6,4],ymm7[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1],ymm5[0,3],ymm3[4,5],ymm5[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm13[0,1],mem[2],ymm13[3,4],mem[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm13[0,3],ymm7[6,4],ymm13[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm7[2,0],ymm4[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm12[0,3],ymm0[6,4],ymm12[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,1],ymm9[0,3],ymm3[4,5],ymm9[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[1,0],ymm5[2,0],ymm3[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm7[0,3],ymm5[6,4],ymm7[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm11[0,3],ymm2[6,4],ymm11[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,0],ymm4[2,0],ymm3[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm7[0,3],ymm4[6,4],ymm7[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,0],ymm7[2,0],ymm14[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm10[0,3],ymm6[6,4],ymm10[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1],mem[0,3],ymm6[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,3],ymm1[6,4],ymm6[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[0,3],ymm2[6,4],ymm12[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[0,3],ymm5[6,4],ymm6[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[0,3],ymm3[6,4],ymm10[4,7] -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1],mem[0,3],ymm11[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: 
vmovaps %ymm5, 160(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rcx) +; AVX1-ONLY-NEXT: addq $1416, %rsp # imm = 0x588 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX2-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm4 @@ -2230,248 +2223,249 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm6 -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm10, %ymm11 +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm10, %ymm15 +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, 
%ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermps 
%ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermilps $196, (%rsp), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # 
ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm6 -; AVX2-SLOW-NEXT: vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 
+; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 160(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rcx) -; AVX2-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm15, 32(%rcx) +; AVX2-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride3_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm8 @@ -2480,226 +2474,233 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = 
[2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovaps %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm4 +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $219, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm10 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = [0,1,0,3,0,1,4,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm15, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = 
ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm9, %ymm13 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [0,1,0,3,0,1,4,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, 
%ymm2 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vblendps $36, (%rsp), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 
32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm8[2],mem[3,4],ymm8[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm12, 192(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm6, 
160(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm14, 32(%rcx) ; AVX2-FAST-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX2-FAST-PERLANE-NEXT: subq $1096, %rsp # imm = 0x448 ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm4 @@ -2710,289 +2711,290 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: 
vpermps %ymm1, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm10, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, 
%ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm14, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, (%rsp), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: load_i32_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] -; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 -; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 -; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 -; AVX512-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 -; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 +; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-NEXT: vpermt2d 
%zmm4, %zmm12, %zmm15 +; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm16 +; AVX512-NEXT: vpermt2d %zmm0, %zmm14, %zmm16 +; AVX512-NEXT: vpermi2d %zmm9, %zmm8, %zmm12 +; AVX512-NEXT: vpermt2d %zmm10, %zmm14, %zmm12 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm17 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] -; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 -; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 -; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 -; AVX512-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 -; AVX512-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 +; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm19 +; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512-NEXT: vpermt2d %zmm2, %zmm14, %zmm20 +; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm20 +; AVX512-NEXT: vpermi2d %zmm8, %zmm9, %zmm14 +; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm14 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u> -; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] -; AVX512-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 -; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 -; AVX512-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 -; AVX512-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 -; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 -; AVX512-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 +; AVX512-NEXT: vpermt2d %zmm7, %zmm18, %zmm11 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-NEXT: vpermt2d %zmm6, %zmm7, %zmm11 +; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm2 +; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 +; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm5 +; AVX512-NEXT: vpermt2d %zmm3, %zmm7, %zmm5 +; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm8 +; AVX512-NEXT: vpermt2d %zmm10, %zmm7, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) @@ -3001,10 +3003,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <192 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 7b4df6a350e337..1351377466689e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ 
-216,8 +216,8 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i32_stride4_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm3 -; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm4 ; SSE-NEXT: movaps 80(%rdi), %xmm5 ; SSE-NEXT: movaps 64(%rdi), %xmm1 @@ -229,10 +229,10 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] ; SSE-NEXT: movaps %xmm9, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] -; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movaps %xmm3, %xmm11 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] ; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] ; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] @@ -241,18 +241,18 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movaps %xmm10, 16(%rsi) ; SSE-NEXT: movaps %xmm13, (%rsi) ; SSE-NEXT: movaps %xmm9, 16(%rdx) ; SSE-NEXT: movaps %xmm12, (%rdx) ; SSE-NEXT: movaps %xmm5, 16(%rcx) -; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps %xmm2, (%rcx) ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq @@ -305,56 +305,56 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i32_stride4_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm5 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,4,0,4] ; AVX2-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm5, %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm5, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [1,5,1,5] ; AVX2-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm11, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6] ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm11 ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [2,6,2,6] ; AVX2-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm7 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [3,7,3,7,3,7,3,7] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 
(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -393,85 +393,85 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm2 +; SSE-NEXT: movaps 208(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps 144(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps 224(%rdi), %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm7 ; SSE-NEXT: movaps 176(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdi), %xmm9 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm11 -; SSE-NEXT: movaps 112(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 192(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps %xmm14, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; SSE-NEXT: movaps 192(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] ; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps 16(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm7[1] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: 
# xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm14[1] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps %xmm4, (%rsi) @@ -480,138 +480,135 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps %xmm15, 48(%rdx) -; SSE-NEXT: movaps %xmm14, (%rdx) +; SSE-NEXT: movaps %xmm13, (%rdx) ; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps %xmm13, 16(%rdx) -; SSE-NEXT: movaps %xmm7, 48(%rcx) -; SSE-NEXT: movaps %xmm6, 32(%rcx) -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps %xmm8, (%rcx) -; SSE-NEXT: movaps %xmm10, 48(%r8) -; SSE-NEXT: movaps %xmm3, 32(%r8) -; SSE-NEXT: movaps %xmm11, 16(%r8) -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps %xmm9, 48(%rcx) +; SSE-NEXT: movaps %xmm10, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm11, (%rcx) +; SSE-NEXT: movaps %xmm8, 48(%r8) +; SSE-NEXT: movaps %xmm7, 32(%r8) +; SSE-NEXT: movaps %xmm6, 16(%r8) +; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $264, %rsp # imm = 0x108 +; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm14 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4] ; 
AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm4[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm8 -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm12 -; AVX1-ONLY-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[4],ymm5[4],ymm11[5],ymm5[5] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm0[2,0],ymm13[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm13[0],xmm12[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vunpcklps {{.*#+}} ymm7 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[4],ymm14[4],ymm15[5],ymm14[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,0],ymm3[1,0],ymm10[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm15 -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,0],ymm3[1,0],ymm0[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm7[2,3],ymm6[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[1],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps %xmm9, %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm15 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[1],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm14 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[4],ymm0[4],ymm8[5],ymm0[5] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,0],ymm11[1,0],ymm2[5,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[1],xmm8[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm3[2] -; AVX1-ONLY-NEXT: vmovaps %xmm10, %xmm14 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 
32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm3[1,0],ymm9[1,0],ymm3[5,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm7[2,3],ymm6[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm10[1],xmm8[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7] +; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm9[2],xmm1[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm2[2],xmm5[2] +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm12[2],xmm13[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0],ymm13[3,0],ymm15[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = 
xmm14[2],xmm3[2],xmm14[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm3[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm15[3,0],mem[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm11[3,0],ymm4[7,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,0],ymm9[3,0],ymm1[7,4],ymm9[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[3,0],xmm10[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[3,0],xmm10[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -627,7 +624,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108 +; AVX1-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -636,95 +633,95 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: subq $104, %rsp ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm11 -; AVX2-ONLY-NEXT: 
vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4] -; AVX2-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm9, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm12, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm12, %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm6, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; 
AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm7 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2,6,2,6,2,6,2,6] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [2,6,2,6] ; AVX2-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm5 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm8 = [3,7,3,7,3,7,3,7] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm8, %ymm5 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm8, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -733,10 +730,10 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) ; AVX2-ONLY-NEXT: addq $104, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -792,70 +789,74 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 -; SSE-NEXT: movaps 272(%rdi), %xmm7 +; SSE-NEXT: movaps 272(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 304(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm8 +; SSE-NEXT: movaps 288(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm2 +; SSE-NEXT: movaps 320(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm10 +; SSE-NEXT: movaps 368(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm11 +; SSE-NEXT: movaps 
64(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm4 -; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movaps 256(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; 
SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 208(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 496(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 480(%rdi), %xmm0 @@ -873,13 +874,13 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 144(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm14 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm14 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -892,170 +893,171 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 400(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movaps (%rdi), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: movaps 48(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 
16(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps (%rsp), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm12, %xmm1 +; 
SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm4, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movaps (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 16(%rsi) -; SSE-NEXT: movaps %xmm7, 96(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps %xmm8, 96(%rdx) ; SSE-NEXT: movaps %xmm14, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 64(%rdx) -; SSE-NEXT: movaps %xmm5, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm3, 96(%rcx) -; SSE-NEXT: movaps %xmm9, 32(%rcx) -; SSE-NEXT: movaps %xmm10, 112(%rcx) -; SSE-NEXT: movaps %xmm4, 48(%rcx) -; SSE-NEXT: movaps %xmm13, 64(%rcx) -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps %xmm8, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%r8) -; SSE-NEXT: movaps %xmm12, 96(%r8) +; SSE-NEXT: movaps %xmm0, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm9, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm6, 112(%rcx) +; SSE-NEXT: movaps %xmm15, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps %xmm13, (%rcx) +; SSE-NEXT: movaps %xmm4, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm10, 112(%r8) +; SSE-NEXT: movaps %xmm11, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps %xmm15, 32(%r8) +; SSE-NEXT: movaps %xmm12, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: movaps %xmm7, (%r8) ; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] @@ -1065,12 +1067,10 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1084,177 +1084,179 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = 
ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm14 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[4],ymm10[4],ymm15[5],ymm10[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm7[1,0],ymm1[5,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm9[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm9[0],mem[0],xmm9[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm6[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[4],ymm13[4],ymm8[5],ymm13[5] -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[4],mem[4],ymm14[5],mem[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm12[1,0],ymm13[5,4],ymm12[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[1],xmm13[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm3[1,0],ymm1[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm14[2,3],ymm1[6,4],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[2,0],ymm15[2,3],ymm1[6,4],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[1],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = 
xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[4],mem[4],ymm9[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm8[2,3],ymm1[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm12[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,0],ymm15[2,3],ymm1[6,4],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[1],xmm10[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm5[1],ymm8[1],ymm5[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2,0],ymm1[4,5],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded 
Reload -; AVX1-ONLY-NEXT: # xmm14 = zero,zero,xmm9[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm8[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = zero,zero,xmm5[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm6[2,0],ymm1[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm3[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = 
xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm2[2],xmm4[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm4[2],xmm2[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm7[2],xmm13[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm9[2],xmm11[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -1263,44 +1265,43 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 
16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm8[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm8[3,0],ymm3[7,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm7[3,0],ymm5[7,4],ymm7[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[2,3],ymm3[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,0],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[2,0],ymm3[2,3],ymm7[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[3,0],ymm13[3,0],ymm12[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm2[2,3],ymm6[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm10[3,0],mem[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -1325,256 +1326,257 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $744, %rsp # imm = 0x2E8 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm7 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,4,0,4] -; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm1, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm12 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm13 -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps 
{{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm1, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm7, %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = 
mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vmovaps %ymm13, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm1, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm6 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm4, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps %ymm12, %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm12, 
%ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm13 -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm13, %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm4, %ymm15 +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm7 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm15[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm3 -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm1, %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm1, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm1, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] +; 
AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX2-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-ONLY-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1654,79 +1656,81 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i32_stride4_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1224, %rsp # imm = 0x4C8 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 176(%rdi), %xmm11 +; SSE-NEXT: subq $1256, %rsp # imm = 0x4E8 +; SSE-NEXT: movaps 144(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm8, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movaps 128(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm12 +; SSE-NEXT: movaps 368(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movaps 336(%rdi), %xmm13 -; SSE-NEXT: movaps 320(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 336(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movaps 304(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 272(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1749,24 +1753,24 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 432(%rdi), %xmm2 ; SSE-NEXT: 
movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 400(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 400(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 624(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 592(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 576(%rdi), %xmm1 @@ -1782,11 +1786,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 544(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 528(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 512(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 528(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 512(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1814,9 +1818,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 656(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: movaps 640(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] @@ -1828,167 +1831,126 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 864(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 848(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 848(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 832(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 816(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 800(%rdi), %xmm0 +; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 784(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1008(%rdi), %xmm1 +; SSE-NEXT: movaps 800(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 992(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 976(%rdi), %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 784(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 768(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 944(%rdi), %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 928(%rdi), %xmm1 +; SSE-NEXT: movaps 1008(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 992(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 912(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm0 +; SSE-NEXT: movaps 976(%rdi), %xmm3 +; 
SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 960(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 944(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 928(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 912(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 896(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps 32(%rdi), %xmm8 ; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps (%rsp), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps 
%xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] @@ -1996,172 +1958,222 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; SSE-NEXT: movaps 
%xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] -; 
SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 224(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 240(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 192(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 160(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 128(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 144(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 32(%rdx) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rdx) -; SSE-NEXT: movaps %xmm0, 240(%rcx) -; SSE-NEXT: movaps %xmm12, 224(%rcx) -; SSE-NEXT: movaps %xmm1, 208(%rcx) -; SSE-NEXT: movaps %xmm2, 192(%rcx) -; SSE-NEXT: movaps %xmm3, 176(%rcx) -; SSE-NEXT: movaps %xmm4, 160(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps %xmm9, 224(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 240(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 208(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 160(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 176(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 128(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 144(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm14, 240(%rcx) +; SSE-NEXT: movaps %xmm6, 224(%rcx) +; SSE-NEXT: movaps %xmm12, 208(%rcx) +; SSE-NEXT: movaps %xmm0, 192(%rcx) +; SSE-NEXT: movaps %xmm1, 176(%rcx) +; SSE-NEXT: movaps %xmm3, 160(%rcx) ; SSE-NEXT: movaps %xmm5, 144(%rcx) -; SSE-NEXT: movaps %xmm6, 128(%rcx) -; SSE-NEXT: movaps %xmm10, 112(%rcx) -; SSE-NEXT: movaps %xmm15, 96(%rcx) +; SSE-NEXT: movaps %xmm7, 128(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) @@ -2171,10 +2183,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm13, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%r8) -; SSE-NEXT: movaps %xmm7, 224(%r8) +; SSE-NEXT: movaps %xmm4, (%rcx) +; SSE-NEXT: movaps %xmm10, 240(%r8) +; SSE-NEXT: movaps %xmm11, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2183,9 +2194,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 176(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%r8) +; SSE-NEXT: movaps %xmm15, 144(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 
144(%r8) -; SSE-NEXT: movaps %xmm11, 128(%r8) +; SSE-NEXT: movaps %xmm0, 128(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2200,43 +2211,42 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm9, (%r8) -; SSE-NEXT: addq $1224, %rsp # imm = 0x4C8 +; SSE-NEXT: movaps %xmm8, (%r8) +; SSE-NEXT: addq $1256, %rsp # imm = 0x4E8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2184, %rsp # imm = 0x888 +; AVX1-ONLY-NEXT: subq $2200, %rsp # imm = 0x898 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] ; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm8 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm14[0],ymm5[1],ymm14[1],ymm5[4],ymm14[4],ymm5[5],ymm14[5] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm14 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2247,25 +2257,26 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2372,8 +2383,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 @@ -2382,11 +2393,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 @@ -2400,41 +2411,39 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm7[1,0],ymm14[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm8[1,0],ymm14[5,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[1],xmm5[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm5[1,0],ymm9[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm4[1,0],ymm10[5,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm13[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm12[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] @@ -2528,47 +2537,47 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX1-ONLY-NEXT: 
vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm7[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm8[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[6],ymm10[6],ymm4[7],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm11[2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm12[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -2647,18 +2656,18 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[3,0],ymm1[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm13[3,0],ymm1[7,4],ymm13[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm10[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm13[3,0],mem[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] @@ -2666,166 +2675,166 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm15 = xmm7[3,0],xmm14[3,0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm8[3,0],mem[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm11[3,0],ymm5[7,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,0],ymm5[3,0],ymm4[7,4],ymm5[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[3,0],xmm13[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm1[2,3],ymm12[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[2,0],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[2,0],ymm6[2,3],ymm9[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[3,0],xmm14[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm7[3,0],ymm10[3,0],ymm7[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,0],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,0],ymm4[2,3],ymm8[6,4],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,0],ymm1[2,3],ymm8[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm11[2,0],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[2,0],ymm4[2,3],ymm7[6,4],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[2,0],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded 
Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,0],ymm5[2,3],ymm6[6,4],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,0],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,0],ymm2[2,3],ymm7[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm3[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 
128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) -; AVX1-ONLY-NEXT: addq $2184, %rsp # imm = 0x888 +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: 
vmovaps %ymm0, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%r8) +; AVX1-ONLY-NEXT: addq $2200, %rsp # imm = 0x898 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1992, %rsp # imm = 0x7C8 +; AVX2-ONLY-NEXT: subq $2008, %rsp # imm = 0x7D8 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm13 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2838,9 +2847,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm7, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2849,14 +2860,15 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm13 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2871,18 +2883,20 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2893,32 +2907,32 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2929,24 +2943,23 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm7 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 @@ -2961,292 +2974,296 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 
304(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm7 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2,6,2,6,2,6,2,6] +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm15, %ymm6 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm7, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: 
# xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,7,3,7] ; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm2 = ymm14[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm6, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm5, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte 
Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) @@ -3284,8 +3301,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3295,14 +3314,16 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 224(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8) -; AVX2-ONLY-NEXT: addq $1992, %rsp # imm = 0x7C8 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-ONLY-NEXT: addq $2008, %rsp # imm = 0x7D8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -3314,72 +3335,72 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512-NEXT: 
vmovdqa64 256(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm13 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] +; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512-NEXT: vpermt2d %zmm14, %zmm18, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm17 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm17[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512-NEXT: vpermt2d %zmm16, %zmm18, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512-NEXT: vpermt2d %zmm11, %zmm18, %zmm19 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm19[0,1,2,3],zmm17[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512-NEXT: vpermt2d %zmm7, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 +; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm20 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm18 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512-NEXT: vpermt2d %zmm14, %zmm20, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512-NEXT: vpermt2d %zmm8, %zmm20, %zmm22 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-NEXT: vpermt2d %zmm16, %zmm20, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512-NEXT: 
vpermt2d %zmm11, %zmm20, %zmm23 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512-NEXT: vpermt2d %zmm7, %zmm20, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512-NEXT: vpermt2d %zmm4, %zmm20, %zmm24 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm3, %zmm20, %zmm24 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,2,3],zmm24[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512-NEXT: vpermt2d %zmm14, %zmm24, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 ; AVX512-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512-NEXT: vpermt2d %zmm16, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512-NEXT: vpermt2d %zmm11, %zmm24, %zmm27 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512-NEXT: vpermt2d %zmm7, %zmm24, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] @@ -3389,26 +3410,26 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 +; AVX512-NEXT: vpermt2d %zmm16, %zmm28, %zmm15 +; AVX512-NEXT: vpermt2d %zmm11, %zmm28, %zmm12 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm15[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm7, %zmm28, %zmm6 ; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm14, %zmm28, %zmm13 ; AVX512-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm13[4,5,6,7] ; AVX512-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-NEXT: vmovdqa64 
%zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rcx) @@ -3416,7 +3437,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, 64(%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index 70c84aa6d2c998..6197adc880dcf3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -138,43 +138,43 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i32_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] -; SSE-NEXT: movapd %xmm3, (%rsi) +; SSE-NEXT: movapd %xmm5, (%rsi) ; SSE-NEXT: movapd %xmm8, (%rdx) ; SSE-NEXT: movapd %xmm6, (%rcx) -; SSE-NEXT: movapd %xmm0, (%r8) +; SSE-NEXT: movapd %xmm4, (%r8) ; SSE-NEXT: movapd %xmm9, (%r9) ; SSE-NEXT: retq ; @@ -285,90 +285,90 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i32_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 144(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa 128(%rdi), %xmm4 -; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa 144(%rdi), %xmm0 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm14 +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm13 ; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] +; SSE-NEXT: 
punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm12[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm14[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm14[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm13[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: 
movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm15[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm7[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: movapd %xmm8, 16(%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm5[0],xmm15[1] +; SSE-NEXT: movapd %xmm9, 16(%rsi) ; SSE-NEXT: movapd %xmm6, (%rsi) ; SSE-NEXT: movapd %xmm12, 16(%rdx) ; SSE-NEXT: movapd %xmm10, (%rdx) -; SSE-NEXT: movapd %xmm14, 16(%rcx) -; SSE-NEXT: movapd %xmm13, (%rcx) -; SSE-NEXT: movapd %xmm4, 16(%r8) +; SSE-NEXT: movapd %xmm13, 16(%rcx) +; SSE-NEXT: movapd %xmm14, (%rcx) +; SSE-NEXT: movapd %xmm3, 16(%r8) ; SSE-NEXT: movapd %xmm2, (%r8) ; SSE-NEXT: movapd %xmm15, 16(%r9) -; SSE-NEXT: movapd %xmm11, (%r9) +; SSE-NEXT: movapd %xmm0, (%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf8: @@ -497,34 +497,34 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i32_stride5_vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = <0,5,10,15,20,25,30,u> -; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <0,5,10,15,20,25,30,u> +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vinserti128 $1, 
128(%rdi), %ymm2, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <17,22,27,0,5,10,15,u> -; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <2,7,12,17,22,27,u,u> -; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <3,8,13,18,23,28,u,u> -; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <4,9,14,19,24,29,u,u> -; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] -; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 -; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm0 +; AVX512-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-NEXT: vmovdqa %ymm3, (%rdx) ; AVX512-NEXT: vmovdqa %ymm5, (%rcx) ; AVX512-NEXT: vmovdqa %ymm7, (%r8) -; AVX512-NEXT: vmovdqa %ymm1, (%r9) +; AVX512-NEXT: vmovdqa %ymm0, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <40 x i32>, ptr %in.vec, align 64 @@ -544,46 +544,45 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i32_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $312, %rsp # imm = 0x138 +; SSE-NEXT: subq $328, %rsp # imm = 0x148 ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm3 -; SSE-NEXT: movdqa 240(%rdi), %xmm14 -; SSE-NEXT: movdqa 256(%rdi), %xmm8 -; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 240(%rdi), %xmm8 +; SSE-NEXT: movdqa 256(%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rdi), %xmm15 ; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm10 -; SSE-NEXT: movdqa 176(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: movdqa 176(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm9 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; 
SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm7 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -595,70 +594,71 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: 
movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa 304(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -668,80 +668,80 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: 
pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm15[0],xmm13[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm15[0],xmm14[1] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 
= xmm11[0],xmm14[0],xmm11[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm4[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -758,21 +758,21 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movapd %xmm9, 16(%rcx) -; SSE-NEXT: movapd %xmm12, 48(%rcx) +; SSE-NEXT: movapd %xmm7, 16(%rcx) +; SSE-NEXT: movapd %xmm9, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movapd %xmm1, 16(%r8) +; SSE-NEXT: movapd %xmm0, 16(%r8) ; SSE-NEXT: movapd %xmm2, 48(%r8) -; SSE-NEXT: movapd %xmm6, (%r8) +; SSE-NEXT: movapd %xmm5, (%r8) ; SSE-NEXT: movapd %xmm8, 32(%r8) -; SSE-NEXT: movapd %xmm14, 16(%r9) +; SSE-NEXT: movapd %xmm12, 16(%r9) ; SSE-NEXT: movapd %xmm15, 48(%r9) -; SSE-NEXT: movapd %xmm13, (%r9) -; SSE-NEXT: movapd %xmm0, 32(%r9) -; SSE-NEXT: addq $312, %rsp # imm = 0x138 +; SSE-NEXT: movapd %xmm14, (%r9) +; SSE-NEXT: movapd %xmm1, 32(%r9) +; SSE-NEXT: addq $328, %rsp # imm = 0x148 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf16: @@ -930,76 +930,76 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,5,2,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4],ymm12[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4],ymm10[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), 
%ymm7, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <1,6,3,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <1,6,3,u> ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,2,7,0,5,2,7,0] -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm13, %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm13, %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm12, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,2,7,0,5,2,7,0] +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm13[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,7,4,u> +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,7,4,u> ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm7, %ymm13 -; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = 
ymm8[0,1,2,3,4,5,6],ymm14[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm14, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 +; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,5,0,5,0,5,0,5] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm14, %ymm7 +; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm15, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6] -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm10, %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [1,6,1,6,1,6,1,6] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm14, 
%ymm15, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] @@ -1014,7 +1014,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm8, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm8, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1024,10 +1024,10 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-ONLY-NEXT: popq %rax @@ -1173,63 +1173,60 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 448(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm4 -; SSE-NEXT: movdqa 400(%rdi), %xmm10 -; SSE-NEXT: movdqa 416(%rdi), %xmm14 +; SSE-NEXT: movdqa 400(%rdi), %xmm12 +; SSE-NEXT: movdqa 416(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm7 -; SSE-NEXT: movdqa 80(%rdi), %xmm12 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: 
movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm13 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1239,26 +1236,25 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: 
punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 560(%rdi), %xmm15 +; SSE-NEXT: movdqa 576(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1270,9 +1266,9 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 192(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1282,30 +1278,30 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 512(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1314,11 +1310,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1328,254 +1324,251 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 
= xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa 624(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 544(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte 
Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: 
movdqa %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; 
SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm3[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = 
xmm2[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1583,15 +1576,15 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -1599,10 +1592,10 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -1615,56 +1608,55 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm15, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rsi) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movapd %xmm7, 112(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 16(%rcx) +; SSE-NEXT: movapd %xmm6, 112(%r8) ; SSE-NEXT: movapd %xmm9, 96(%r8) ; SSE-NEXT: movapd %xmm10, 80(%r8) -; SSE-NEXT: 
movapd %xmm12, 64(%r8) -; SSE-NEXT: movapd %xmm13, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, (%r8) +; SSE-NEXT: movapd %xmm11, 64(%r8) +; SSE-NEXT: movapd %xmm12, 48(%r8) +; SSE-NEXT: movapd %xmm14, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, (%r8) ; SSE-NEXT: movapd %xmm0, 112(%r9) ; SSE-NEXT: movapd %xmm1, 96(%r9) ; SSE-NEXT: movapd %xmm2, 80(%r9) ; SSE-NEXT: movapd %xmm3, 64(%r9) ; SSE-NEXT: movapd %xmm4, 48(%r9) ; SSE-NEXT: movapd %xmm5, 32(%r9) -; SSE-NEXT: movapd %xmm6, 16(%r9) +; SSE-NEXT: movapd %xmm7, 16(%r9) ; SSE-NEXT: movapd %xmm8, (%r9) ; SSE-NEXT: addq $904, %rsp # imm = 0x388 ; SSE-NEXT: retq @@ -1672,151 +1664,149 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-LABEL: load_i32_stride5_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $952, %rsp # imm = 0x3B8 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm14 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] +; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm8[1,3],ymm1[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[3,0],ymm1[6,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm10[1,3],ymm1[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[3,0],ymm1[6,4],ymm2[7,4] ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm12[1,3],ymm1[6,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[3,0],ymm1[6,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm11[1,3],ymm1[6,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[3,0],ymm1[6,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm8[2,0],ymm13[7,4],ymm8[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] @@ -1828,8 +1818,8 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm14[2,0],ymm12[7,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[2,1],ymm0[6,4],ymm14[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,0],ymm10[2,0],ymm7[7,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,1],ymm0[6,4],ymm10[6,5] ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -1837,48 +1827,49 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm2[0,0],ymm7[5,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm4[0,0],ymm7[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm11[2,0],ymm0[7,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,1],ymm0[6,4],ymm11[6,5] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm12[2,0],ymm0[7,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm12[2,1],ymm0[6,4],ymm12[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm13[0,0],ymm9[5,4],ymm13[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm14[0,0],ymm9[5,4],ymm14[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,1],ymm0[6,4],ymm10[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm11[2,0],ymm0[7,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,1],ymm0[6,4],ymm11[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2],xmm6[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm6 = xmm6[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm13[0,0],ymm1[5,4],ymm13[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm8[3,0],ymm4[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm8[3,0],ymm2[4,4],ymm8[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] @@ -1890,78 +1881,78 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; 
AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm14[3,0],ymm3[4,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm14[2,2],ymm15[6,4],ymm14[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm10[3,0],ymm3[4,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,0],ymm2[1,0],ymm7[6,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,0],ymm4[1,0],ymm7[6,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm11[3,0],ymm2[4,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm11[2,2],ymm15[6,4],ymm11[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm12[3,0],ymm4[4,4],ymm12[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm12[2,2],ymm15[6,4],ymm12[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm13[1,0],ymm9[6,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm14[1,0],ymm9[6,4],ymm14[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,0],ymm11[3,0],ymm6[4,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm11[2,2],ymm15[6,4],ymm11[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm13[1,0],ymm1[6,4],ymm13[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3],ymm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],mem[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm3[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte 
Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) @@ -1987,61 +1978,59 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r9) ; AVX1-ONLY-NEXT: addq $952, %rsp # imm = 0x3B8 
; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm14 +; AVX2-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm11[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm12 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm13 -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm12 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -2049,32 +2038,38 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm6[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} 
ymm3 = [5,2,7,0,5,2,7,0] ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 @@ -2082,61 +2077,56 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm14 ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm13[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm6[2,3],ymm15[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm12[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <2,7,4,u> +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm9[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm10[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm14, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 @@ -2144,25 +2134,25 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm12[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 @@ -2170,73 +2160,75 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,6,1,6,1,6,1,6] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm14[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm1[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm5, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $207, (%rsp), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX2-ONLY-NEXT: 
vperm2i128 {{.*#+}} ymm5 = ymm12[0,1],ymm11[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <4,1,6,u> ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm12[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm14[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm5, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm13[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm10, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload @@ -2249,7 +2241,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm7[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) @@ -2287,7 +2279,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX2-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-ONLY-NEXT: 
vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2298,91 +2290,91 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm11 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,5,10,15,20,25,30,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm8, %zmm7 ; AVX512F-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm13, %zmm7 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm8 {%k1} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 -; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,17,22,27,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <17,22,27,0,5,10,15,u> +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512F-NEXT: vpermt2d %zmm6, %zmm13, %zmm15 +; AVX512F-NEXT: vpermi2d 
%zmm5, %zmm4, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <2,7,12,17,22,27,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,8,13,18,23,28,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512F-NEXT: vpermt2d %zmm6, %zmm13, %zmm17 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm16 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <3,8,13,18,23,28,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512F-NEXT: vpermt2d %zmm6, %zmm13, %zmm19 ; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm10 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <4,9,14,19,24,29,u,u> -; AVX512F-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm11, %zmm1 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 +; 
AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512F-NEXT: vpermt2d %zmm2, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%r9) @@ -2397,91 +2389,91 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,5,10,15,20,25,30,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm7 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm13, %zmm7 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm8 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} -; 
AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,17,22,27,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <17,22,27,0,5,10,15,u> +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm13, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,7,12,17,22,27,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,8,13,18,23,28,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm13, %zmm17 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm16 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,8,13,18,23,28,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm13, %zmm19 ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: 
vpermt2d %zmm11, %zmm16, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,9,14,19,24,29,u,u> -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm11, %zmm1 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) @@ -2505,48 +2497,48 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i32_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1928, %rsp # imm = 0x788 +; SSE-NEXT: subq $1944, %rsp # imm = 0x798 ; SSE-NEXT: movdqa 768(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 752(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 752(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 720(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm6 +; SSE-NEXT: movdqa 736(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm9 +; SSE-NEXT: movdqa 400(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa 416(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm5 -; SSE-NEXT: movdqa 
%xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm12 ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2566,15 +2558,15 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm12 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2612,40 +2604,41 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1008(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1008(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 992(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 240(%rdi), %xmm7 +; SSE-NEXT: movdqa 256(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 560(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2664,49 +2657,47 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1200(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 1200(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1248(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1248(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1232(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm9 -; SSE-NEXT: movdqa 496(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa 480(%rdi), %xmm6 +; SSE-NEXT: movdqa 496(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 528(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 512(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2719,9 +2710,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 848(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), 
%xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 832(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2739,51 +2730,54 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; 
SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2792,24 +2786,24 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # 
xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2818,46 +2812,49 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 544(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 784(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 704(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2866,24 +2863,23 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 944(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 864(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -2897,10 +2893,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 1024(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2910,7 +2906,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -2923,64 +2919,43 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 1184(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -2995,22 +2970,30 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -3019,46 +3002,54 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -3069,121 +3060,122 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: punpckldq 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; 
SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -3194,51 +3186,51 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = 
xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3248,7 +3240,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3262,37 +3254,40 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 
= mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -3308,22 +3303,22 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3333,7 +3328,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -3343,14 +3339,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -3472,37 +3468,36 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm15, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, (%rcx) -; SSE-NEXT: movapd %xmm13, 240(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 224(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 208(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 192(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 176(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 160(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 144(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 128(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 112(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 96(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 80(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 64(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, (%r8) +; SSE-NEXT: movapd %xmm11, 240(%r8) +; SSE-NEXT: movapd %xmm13, 224(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 208(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 192(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 176(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 160(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 144(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 128(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 112(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 96(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 64(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 16(%r8) +; SSE-NEXT: movaps (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, (%r8) ; SSE-NEXT: movapd %xmm0, 240(%r9) ; SSE-NEXT: movapd %xmm1, 224(%r9) ; SSE-NEXT: movapd %xmm2, 208(%r9) @@ -3514,19 +3509,20 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm8, 112(%r9) ; SSE-NEXT: movapd %xmm9, 96(%r9) ; SSE-NEXT: movapd %xmm10, 80(%r9) -; SSE-NEXT: movapd %xmm11, 64(%r9) -; SSE-NEXT: movapd %xmm12, 48(%r9) -; SSE-NEXT: movapd %xmm14, 32(%r9) +; SSE-NEXT: movapd %xmm12, 64(%r9) +; SSE-NEXT: movapd %xmm14, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: addq $1928, %rsp # imm = 0x788 +; SSE-NEXT: addq $1944, %rsp # imm = 0x798 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2488, %rsp # imm = 0x9B8 +; AVX1-ONLY-NEXT: subq 
$2520, %rsp # imm = 0x9D8 ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 @@ -3576,12 +3572,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 @@ -3594,14 +3590,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] @@ -3613,14 +3609,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 
$1, 128(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] @@ -3632,14 +3628,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] @@ -3652,12 +3648,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 @@ -3669,18 +3665,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0,1,2,3],ymm11[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3689,11 +3684,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -3717,11 +3712,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm1[1,3],ymm0[6,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm14[2,3],ymm2[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm14 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3734,10 +3730,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3750,26 +3746,26 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm9[1,3],ymm0[6,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm6[2,3],ymm15[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3782,11 +3778,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = 
ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3799,12 +3794,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill @@ -3821,6 +3815,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm6 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -3854,10 +3849,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm14[2,0],ymm0[7,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[2,1],ymm0[6,4],ymm14[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -3873,10 +3867,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,1],ymm0[6,4],ymm13[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[2,0],ymm0[7,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -3886,16 +3879,15 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm6[0,0],ymm1[5,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[2,0],ymm0[7,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm9[2,0],ymm0[7,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,1],ymm0[6,4],ymm9[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -3910,10 +3902,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm14[2,0],ymm0[7,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,1],ymm1[6,4],ymm14[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm15[2,0],ymm0[7,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm15[2,1],ymm1[6,4],ymm15[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm0[4,5],mem[6,7] @@ -3921,16 +3913,17 @@ define void @load_i32_stride5_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm0[0,0],ymm12[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,0],ymm0[0,0],ymm13[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm7[2,1],ymm2[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm12[2,0],ymm0[7,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[2,1],ymm2[6,4],ymm12[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] @@ -3938,26 +3931,26 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[1,0],ymm0[0,0],ymm13[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,0],ymm0[0,0],ymm11[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[2,1],ymm3[6,4],ymm10[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm11 = xmm11[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] 
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,0],ymm1[0,0],ymm11[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm7 = xmm7[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,0],ymm1[0,0],ymm14[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] @@ -3967,9 +3960,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm1[3,0],ymm5[4,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm6[3,0],ymm5[4,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm6[2,2],ymm15[6,4],ymm6[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3996,10 +3988,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm1[3,0],ymm15[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4012,53 +4004,55 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm1[3,0],ymm15[4,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm8[3,0],ymm15[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm6[1,0],ymm1[6,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm3[1,0],ymm11[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,0],ymm3[1,0],ymm14[6,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm7[3,0],ymm9[4,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[0,0],ymm12[3,0],ymm7[4,4],ymm12[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm12[2,2],ymm15[6,4],ymm12[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm10[1,0],ymm13[6,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm10[1,0],ymm11[6,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,0],ymm8[3,0],ymm6[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm8[3,0],ymm3[4,4],ymm8[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm3[1,0],ymm12[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm6[1,0],ymm13[6,4],ymm6[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4066,23 +4060,22 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm8[3,0],ymm1[4,4],ymm8[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm9[3,0],ymm1[4,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm9[2,2],ymm15[6,4],ymm9[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm2[1,0],ymm4[6,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4098,15 +4091,15 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm13[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 16-byte Folded Reload @@ -4122,43 +4115,43 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],mem[1],xmm9[2,3] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm11[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3,4],mem[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3,4],mem[5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3,4],mem[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload @@ -4244,28 +4237,29 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $2488, %rsp # imm = 0x9B8 +; AVX1-ONLY-NEXT: addq $2520, %rsp # imm = 0x9D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride5_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX2-ONLY-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 @@ -4282,6 +4276,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm12 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] @@ -4292,9 +4287,6 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm14 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] @@ -4302,27 +4294,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4366,27 +4358,28 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu 
%ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,2,7,0,5,2,7,0] @@ -4396,9 +4389,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] @@ -4407,54 +4400,51 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 944(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm14[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 1264(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm10[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm15[2,3],ymm4[4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm10[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm8[0,1],mem[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -4477,7 +4467,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] @@ -4489,7 +4479,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm9[4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -4502,10 +4492,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm11[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 @@ -4513,13 +4504,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; 
AVX2-ONLY-NEXT: vinserti128 $1, 1216(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 @@ -4532,7 +4522,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 @@ -4540,11 +4531,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 @@ -4552,18 +4543,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5,6],ymm15[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm13[6,7] ; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -4582,7 +4572,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -4616,10 +4606,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload @@ -4634,19 +4625,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm12[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm14[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermd %ymm10, 
%ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -4657,9 +4648,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -4669,8 +4660,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,1,6,u> ; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm8, %ymm5 @@ -4679,10 +4670,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm11[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 @@ -4690,7 +4681,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[0,1],ymm4[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] @@ -4707,14 +4698,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm10[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm12[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm8, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm7, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload @@ -4743,7 +4734,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm9 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm10, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm10[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] @@ -4820,415 +4811,409 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-ONLY-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX2-ONLY-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i32_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; 
AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm16, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm24, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,5,10,15,20,25,30,u> +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm7, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm24, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm24, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm24 +; 
AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm20, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <17,22,27,0,5,10,15,u> +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm8, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm20, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm20, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm21 = <2,7,12,17,22,27,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm11, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm11, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm14, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm14, %zmm13 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm19, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm28, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm19, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm28, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; 
AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> -; AVX512F-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <3,8,13,18,23,28,u,u> +; AVX512F-NEXT: vpermt2d %zmm17, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <4,9,14,19,24,29,u,u> +; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm25 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm15, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm12, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm25, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm9, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm8, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm21, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm3 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm3 -; AVX512F-NEXT: vpermi2d %zmm21, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm7 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm8, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm22, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm21, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 ; AVX512F-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte 
Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm31 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm19 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm31 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm5, %zmm23 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm5, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm8 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 -; 
AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm5, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm14 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm25 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm17 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm5, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%r8) -; 
AVX512F-NEXT: vmovdqa64 %zmm30, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm31, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm28, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, (%r9) ; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 +; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm16, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,5,10,15,20,25,30,u> +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm24 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <17,22,27,0,5,10,15,u> +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm8, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <2,7,12,17,22,27,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm11, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 -; 
AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm14, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm14, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm19, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm28, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <3,8,13,18,23,28,u,u> +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <4,9,14,19,24,29,u,u> +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm25 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm12, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm25, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm9, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm8, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 +; AVX512BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm3 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm7 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm22, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm21, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm31 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm31 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm8 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm24, 
%zmm5, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm14 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm17 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 +; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <320 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 
8c709508720a44..165a5be5fe2ba0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -19,29 +19,29 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i32_stride6_vf2: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movq %xmm1, (%rsi) +; SSE-NEXT: movq %xmm0, (%rsi) ; SSE-NEXT: movq %xmm4, (%rdx) ; SSE-NEXT: movq %xmm5, (%rcx) ; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm0, (%r9) +; SSE-NEXT: movq %xmm1, (%r9) ; SSE-NEXT: movq %xmm7, (%rax) ; SSE-NEXT: retq ; @@ -220,52 +220,51 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa 80(%rdi), %xmm1 ; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; 
SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] -; SSE-NEXT: movapd %xmm4, (%rsi) -; SSE-NEXT: movapd %xmm3, (%rdx) +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm9, (%rsi) +; SSE-NEXT: movapd %xmm4, (%rdx) ; SSE-NEXT: movapd %xmm5, (%rcx) -; SSE-NEXT: movapd %xmm6, (%r8) -; SSE-NEXT: movapd %xmm9, (%r9) +; SSE-NEXT: movapd %xmm10, (%r8) +; SSE-NEXT: movapd %xmm8, (%r9) ; SSE-NEXT: movapd %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -314,27 +313,27 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i32_stride6_vf4: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} 
xmm0 = <0,6,4,u> -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <0,6,4,u> +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <1,7,5,u> ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,0,2,3,4,4,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,0,2,3,4,4,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm7, %xmm8 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,3,3,4,5,7,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,3,3,4,5,7,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-ONLY-NEXT: vpbroadcastd 84(%rdi), %xmm8 @@ -342,20 +341,20 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [4,2,4,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,3,5,3] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, (%rsi) ; AVX2-ONLY-NEXT: vmovdqa %xmm3, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %xmm6, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %xmm5, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %xmm2, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%r9) +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -403,122 +402,123 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 144(%rdi), %xmm4 -; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa 96(%rdi), 
%xmm6 -; SSE-NEXT: movdqa 112(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa 144(%rdi), %xmm8 +; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm13 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa 48(%rdi), %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm6[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] ; SSE-NEXT: movdqa 80(%rdi), %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm11[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,2,3,3] ; SSE-NEXT: movdqa 176(%rdi), %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = 
xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movapd %xmm12, 16(%rdx) -; SSE-NEXT: movapd %xmm11, (%rdx) -; SSE-NEXT: movapd %xmm13, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm4, 16(%r8) -; SSE-NEXT: movapd %xmm8, (%r8) -; SSE-NEXT: movapd %xmm10, 16(%r9) -; SSE-NEXT: movapd %xmm6, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm9, 16(%rdx) +; SSE-NEXT: movapd %xmm6, (%rdx) +; SSE-NEXT: movapd %xmm11, 16(%rcx) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movapd %xmm8, 16(%r8) +; SSE-NEXT: movapd %xmm12, (%r8) +; SSE-NEXT: movapd %xmm0, 16(%r9) +; SSE-NEXT: movapd %xmm13, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm2, 16(%rax) -; SSE-NEXT: movapd %xmm9, (%rax) +; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf8: @@ -553,35 +553,35 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,0],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm6[2,0],xmm7[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm1[2,0],ymm0[6,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,1],xmm11[3,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,1],xmm7[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,2],ymm7[2,0],ymm4[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,0],ymm0[4,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm10[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4] @@ -589,9 +589,9 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -860,46 +860,46 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i32_stride6_vf8: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <0,6,12,18,24,30,u,u> -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] ; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <1,7,13,19,25,31,u,u> -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] ; AVX512-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 +; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <2,8,14,20,26,u,u,u> -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] ; AVX512-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 +; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <3,9,15,21,27,u,u,u> -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = <20,26,0,6,12,u,u,u> -; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX512-NEXT: 
vmovdqa {{.*#+}} ymm3 = <20,26,0,6,12,u,u,u> +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = <21,27,1,7,13,u,u,u> -; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <21,27,1,7,13,u,u,u> +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,11,9,15] +; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm0 ; AVX512-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512-NEXT: vmovdqa %ymm4, (%rcx) ; AVX512-NEXT: vmovdqa %ymm5, (%r8) ; AVX512-NEXT: vmovdqa %ymm8, (%r9) -; AVX512-NEXT: vmovdqa %ymm2, (%rax) +; AVX512-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <48 x i32>, ptr %in.vec, align 64 @@ -921,162 +921,165 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movdqa 240(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm3 -; SSE-NEXT: movdqa 192(%rdi), %xmm10 +; SSE-NEXT: subq $392, %rsp # imm = 0x188 +; SSE-NEXT: movdqa 240(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm4 +; SSE-NEXT: movdqa 192(%rdi), %xmm11 +; SSE-NEXT: movdqa 208(%rdi), %xmm5 +; SSE-NEXT: movdqa 336(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: movdqa 336(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm5 -; SSE-NEXT: movdqa 288(%rdi), %xmm15 +; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 304(%rdi), %xmm7 -; SSE-NEXT: movdqa 64(%rdi), %xmm12 -; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm12 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm9 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm10 ; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: movdqa 144(%rdi), %xmm15 +; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = 
xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: movdqa 224(%rdi), %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -1084,176 +1087,174 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm12[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = 
mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; 
SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) ; SSE-NEXT: movapd %xmm15, 16(%r8) -; SSE-NEXT: movapd %xmm12, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm2, 16(%r9) -; SSE-NEXT: movapd %xmm3, 32(%r9) +; SSE-NEXT: movapd %xmm7, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movapd %xmm0, 16(%r9) +; SSE-NEXT: movapd %xmm1, 32(%r9) ; SSE-NEXT: movapd %xmm4, 48(%r9) ; SSE-NEXT: movapd %xmm5, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, 16(%rax) -; SSE-NEXT: movapd %xmm9, 32(%rax) -; SSE-NEXT: movapd %xmm8, 48(%rax) -; SSE-NEXT: movapd %xmm10, (%rax) -; SSE-NEXT: addq $408, %rsp # imm = 0x198 +; SSE-NEXT: movapd %xmm9, 16(%rax) +; SSE-NEXT: movapd %xmm10, 32(%rax) +; SSE-NEXT: movapd %xmm13, 48(%rax) +; SSE-NEXT: movapd %xmm14, (%rax) +; SSE-NEXT: addq $392, %rsp # imm = 0x188 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), 
%ymm12 -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0],ymm2[0,0],ymm3[6,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm2[2,2],ymm5[6,4],ymm2[6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm7[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[0],ymm8[3],ymm13[2] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[2,0],ymm6[0,0],ymm14[6,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm6[2,2],ymm3[6,4],ymm6[6,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm2[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm2[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm0[0,1] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm7[0],ymm10[0],ymm7[3],ymm10[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[2,0],ymm1[0,0],ymm10[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm1[2,2],ymm5[6,4],ymm1[6,6] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,0],ymm1[2,2],ymm3[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm4[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2],ymm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm12 @@ -1262,18 +1263,18 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm2[2,3],ymm11[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm13[1,3],ymm8[7,5],ymm13[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[3,0],ymm6[1,0],ymm14[7,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[2,3],ymm5[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[1,0],xmm2[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm10[1,3],ymm7[7,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,0],ymm1[1,0],ymm10[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,0],xmm0[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] @@ -1282,155 +1283,155 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm7[2,0],ymm9[6,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0],ymm4[2,0],ymm8[4,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,1],ymm13[2,0],ymm14[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm8[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0],ymm4[2,0],ymm5[4,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,1],ymm3[2,0],ymm10[6,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,0],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm14[2,0],ymm0[4,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[3,1],ymm8[4,5],ymm4[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm5[3,3] -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm1[2,1],ymm7[7,5],ymm1[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 
32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[2,1],ymm6[2,0],ymm9[6,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,0],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm14[3,1],ymm0[4,5],ymm14[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[3,1],xmm15[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[3,1],ymm5[4,5],ymm4[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],xmm3[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[3,1],ymm2[2,1],ymm14[7,5],ymm2[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[3,1],xmm11[3,3] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,1],ymm6[2,1],ymm15[7,5],ymm6[6,5] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm10[2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm6[1],ymm1[0],ymm6[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,0],ymm7[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm11 = ymm9[2,0],ymm8[0,0],ymm9[6,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,2],ymm11[2,0],ymm8[4,6],ymm11[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm7[1],ymm2[0],ymm7[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,0],ymm14[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm8[0,0],ymm0[6,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2],ymm9[2,0],ymm8[4,6],ymm9[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm2[0],xmm12[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,0],ymm4[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,0],ymm14[0,0],ymm11[6,4],ymm14[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,0],ymm8[1,0],ymm9[7,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,3],ymm7[2,0],ymm8[4,7],ymm7[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1],ymm5[2,0],ymm13[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,0],ymm14[1,0],ymm11[7,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,3],ymm5[2,0],ymm14[4,7],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[2,0],ymm10[5,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 
# 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,0],ymm15[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm12[0,0],ymm9[6,4],ymm12[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,2],ymm15[2,0],ymm12[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[1,0],ymm0[7,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,3],ymm0[2,0],ymm8[4,7],ymm0[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm10[1,3],ymm7[7,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1],ymm5[2,0],ymm14[5,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm12[1,0],ymm9[7,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,3],ymm4[2,0],ymm12[4,7],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm3[1,3],ymm1[7,5],ymm3[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm1[2,0],ymm13[5,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: addq $328, %rsp # imm = 0x148 +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride6_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $232, %rsp -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $200, %rsp +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[0,1],ymm7[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[0,1],ymm9[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; 
AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm14 @@ -1440,70 +1441,70 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm5, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; 
AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1511,80 +1512,79 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vblendps $240, (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm4 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = 
ymm13[0,1],ymm8[2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [5,3,5,3] -; AVX2-SLOW-NEXT: # xmm2 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm5 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm3 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm3 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] 
+; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 32(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm13, (%r8) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rax) -; AVX2-SLOW-NEXT: addq $232, %rsp +; AVX2-SLOW-NEXT: addq $200, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride6_vf16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $200, %rsp -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 @@ -1592,25 +1592,25 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm8[0,1],ymm7[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = 
ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[0,1],ymm9[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm14 @@ -1620,41 +1620,41 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] @@ -1665,24 +1665,24 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovaps %ymm12, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1690,62 +1690,62 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] ; AVX2-FAST-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm9, %ymm14 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1],ymm4[2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = [5,3,5,3] -; AVX2-FAST-NEXT: # xmm2 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm3 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm3 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; 
AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm12, 32(%r8) ; AVX2-FAST-NEXT: vmovaps %ymm13, (%r8) ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) @@ -1758,37 +1758,37 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $232, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[0,1],ymm7[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[0,1],ymm9[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm1, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm14 @@ -1798,70 +1798,70 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm5, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1869,70 +1869,69 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $240, (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm8[2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm2 = [5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; 
AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm3 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: 
movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $232, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1941,86 +1940,86 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,18,24,30,u,u> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,18,24,30,u,u> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512F-NEXT: movb $56, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 ; AVX512F-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <1,7,13,19,25,31,u,u> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <2,8,14,20,26,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <1,7,13,19,25,31,u,u> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <2,8,14,20,26,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 ; AVX512F-NEXT: movw $31, %di ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <3,9,15,21,27,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm9 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <3,9,15,21,27,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm10 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm10 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <20,26,0,6,12,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm11 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm11 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 ; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 
%zmm2, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <21,27,1,7,13,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2029,86 +2028,86 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,18,24,30,u,u> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,18,24,30,u,u> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <1,7,13,19,25,31,u,u> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,8,14,20,26,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa32 
%zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <1,7,13,19,25,31,u,u> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,8,14,20,26,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 ; AVX512BW-NEXT: movw $31, %di ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <3,9,15,21,27,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm9 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <3,9,15,21,27,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm10 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm10 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <20,26,0,6,12,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm11 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm11 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = 
[0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <21,27,1,7,13,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <96 x i32>, ptr %in.vec, align 64 @@ -2131,347 +2130,350 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1032, %rsp # imm = 0x408 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm14 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa 528(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 544(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm8 +; SSE-NEXT: movdqa 480(%rdi), %xmm11 ; SSE-NEXT: movdqa 496(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm6 -; SSE-NEXT: movdqa 
%xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 384(%rdi), %xmm6 +; SSE-NEXT: movdqa 400(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 432(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm9 -; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 304(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa 448(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 688(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 720(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa 288(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa 336(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm7 -; SSE-NEXT: movdqa 592(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 688(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa 720(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa 736(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm2[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd 
{{.*#+}} xmm3 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: movdqa 224(%rdi), %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 560(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa 512(%rdi), %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 464(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa 416(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa 512(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa 464(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa 416(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa 752(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa 704(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa 752(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa 656(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 608(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa 656(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; 
SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -2479,88 +2481,77 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] @@ -2568,29 +2559,40 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2599,8 +2601,9 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2609,17 +2612,17 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm11 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: 
punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, %xmm12 +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2627,20 +2630,10 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2648,9 +2641,10 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2658,18 +2652,23 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = 
mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2722,9 +2721,9 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%r8) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) @@ -2737,23 +2736,27 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, 112(%r9) ; SSE-NEXT: movapd %xmm3, 96(%r9) ; SSE-NEXT: movapd %xmm4, 80(%r9) -; SSE-NEXT: movapd %xmm6, 64(%r9) -; SSE-NEXT: movapd %xmm8, 48(%r9) +; SSE-NEXT: movapd %xmm5, 64(%r9) +; SSE-NEXT: movapd %xmm6, 48(%r9) ; SSE-NEXT: movapd %xmm9, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, 112(%rax) -; SSE-NEXT: movapd %xmm13, 96(%rax) -; SSE-NEXT: movapd %xmm15, 80(%rax) -; SSE-NEXT: movapd %xmm10, 64(%rax) +; SSE-NEXT: movapd %xmm8, 112(%rax) +; SSE-NEXT: movapd %xmm10, 96(%rax) +; SSE-NEXT: movapd %xmm13, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movapd %xmm12, 32(%rax) -; SSE-NEXT: movapd %xmm11, 16(%rax) -; SSE-NEXT: movapd %xmm7, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $1032, %rsp # imm = 0x408 ; SSE-NEXT: retq ; @@ -2769,7 +2772,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm2 ; 
AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 @@ -2869,7 +2872,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],xmm7[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[0,2],xmm7[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] @@ -2881,9 +2884,9 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,0],xmm5[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[0,2],xmm5[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm14[1,3],ymm0[7,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm13[1,3],ymm0[7,5],ymm13[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2893,9 +2896,9 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,0],xmm2[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,1],mem[1,3],ymm3[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2905,114 +2908,113 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,0],xmm1[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm1[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1],ymm13[1,3],ymm10[7,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm10[3,1],ymm7[1,3],ymm10[7,5],ymm7[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm15[2,1],mem[2,0],ymm15[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm3[2,0],ymm0[4,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0],ymm5[2,0],ymm12[4,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = 
ymm14[0,1,2,3],mem[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0],ymm8[2,0],ymm6[4,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm9[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2],ymm10[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,0],ymm11[2,0],ymm0[4,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm13[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[2,1],ymm8[2,0],ymm5[6,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,1],ymm7[2,0],ymm1[6,5],ymm7[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[2,0],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm3[3,1],ymm1[4,5],ymm3[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm4[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1],ymm5[3,1],ymm12[4,5],ymm5[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm13[2,1],ymm11[7,5],ymm13[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm12[3,1],ymm13[2,1],ymm12[7,5],ymm13[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1],ymm8[3,1],ymm6[4,5],ymm8[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,1],ymm6[2,1],ymm5[7,5],ymm6[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[3,1],ymm1[4,5],ymm2[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm6[3,1],mem[3,3] -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm6[2,1],ymm4[7,5],ymm6[6,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[3,1],xmm14[3,3] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,1],ymm7[2,1],ymm1[7,5],ymm7[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,1],xmm14[3,3] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm8[2,1],ymm5[7,5],ymm8[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm10[3,1],ymm7[4,5],ymm10[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm11[3,1],ymm0[4,5],ymm11[7,5] ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm9[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm12[2,1],ymm2[7,5],ymm12[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,1],ymm10[2,1],ymm11[7,5],ymm10[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -3029,7 +3031,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] @@ -3037,159 +3039,161 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,0],ymm11[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm2[2,0],ymm1[4,6],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[1],ymm13[0],ymm6[2],ymm13[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,0],ymm12[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[2,0],ymm0[0,0],ymm9[6,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm2[2,0],ymm0[4,6],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 
= ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm2[0,0],ymm5[6,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2],ymm3[2,0],ymm2[4,6],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm1[0,0],ymm13[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm3[2,0],ymm1[4,6],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm3[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm3[1],ymm10[0],ymm3[2],ymm10[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[0,1],ymm14[2,0],ymm11[4,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm4[2,0],ymm14[0,0],ymm4[6,4],ymm14[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[0,2],ymm12[2,0],ymm14[4,6],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1],ymm6[2,0],ymm9[5,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2,0],ymm4[4,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,0],ymm0[0,0],ymm3[6,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm1[1,0],ymm6[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm7[1,0],ymm0[7,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,3],ymm0[2,0],ymm7[4,7],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1],ymm7[2,0],ymm9[5,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,0],ymm1[1,0],ymm13[7,4],ymm1[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm6[2,0],ymm1[4,7],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps $12, (%rsp), %xmm15, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1],ymm7[2,0],ymm9[5,5],ymm7[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm7[1,0],ymm1[7,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,3],ymm1[2,0],ymm7[4,7],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm2[1,0],ymm5[7,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,3],ymm1[2,0],ymm2[4,7],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1],ymm5[2,0],ymm10[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[2,0],ymm4[5,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm14[1,0],ymm4[7,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,3],ymm4[2,0],ymm14[4,7],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,1],mem[1,3],ymm3[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,1],ymm3[2,0],ymm10[5,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1160, %rsp # imm = 0x488 +; AVX2-SLOW-NEXT: subq $1192, %rsp # imm = 0x4A8 ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovups %ymm12, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm3 @@ -3202,33 +3206,33 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[0,1],ymm6[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm0 @@ -3236,106 +3240,105 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm12, %ymm10 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm4, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm11, %ymm13 +; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm12, %ymm15 -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm14[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = 
ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] @@ -3357,19 +3360,20 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm15[2,3],ymm2[4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] @@ -3383,76 +3387,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3,4],ymm10[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm14 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm15[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} 
ymm1 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] @@ -3464,138 +3468,134 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] ; AVX2-SLOW-NEXT: # xmm11 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm13, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm14 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm9[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm9[2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm14 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte 
Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] -; AVX2-SLOW-NEXT: # xmm4 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm13, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm13, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm13, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm13, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) -; AVX2-SLOW-NEXT: addq $1160, %rsp # imm = 0x488 +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: addq $1192, %rsp # imm = 0x4A8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3603,10 +3603,10 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovups %ymm12, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-FAST-NEXT: 
vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm3 @@ -3619,33 +3619,33 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = <0,6,4,u> +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm11, %ymm0 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[0,1],ymm6[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm0 @@ -3653,105 +3653,105 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-FAST-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm4, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm13 +; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm14[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; 
AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] 
-; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] @@ -3771,15 +3771,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm15[2,3],mem[4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm3 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3] @@ -3790,86 +3789,88 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3,4],ymm10[5],ymm4[6,7] ; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm3[3,3,3,3] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm9 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = 
ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm10 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3878,149 +3879,144 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] ; AVX2-FAST-NEXT: # xmm11 = mem[0,0] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm14 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm9[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 
272(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm9[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm14 = mem[0,0] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] +; 
AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] -; AVX2-FAST-NEXT: # xmm4 = mem[0,0] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm13, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps 
%ymm7, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1160, %rsp # imm = 0x488 +; AVX2-FAST-PERLANE-NEXT: subq $1192, %rsp # imm = 0x4A8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm12, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm3 @@ -4033,33 +4029,33 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vmovaps {{.*#+}} xmm12 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm11, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[0,1],ymm6[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm0 @@ -4067,106 +4063,105 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm4, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm11, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm12, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm1 +; 
AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm14[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte 
Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] @@ -4188,19 +4183,20 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm15[2,3],ymm2[4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] @@ -4214,76 +4210,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3,4],ymm10[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} xmm0 = xmm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm3[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] @@ -4295,138 +4291,134 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] ; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm14 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm9[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm9[2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded 
Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm13, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm13, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm13, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm13, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm13, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r9) -; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1160, %rsp # imm = 0x488 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1192, %rsp # imm = 0x4A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4434,13 +4426,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm11 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12 @@ -4449,42 +4441,42 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,18,24,30,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <0,6,12,18,24,30,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm10, %zmm9 ; AVX512F-NEXT: movb $56, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm15, 
%zmm8 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 ; AVX512F-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} ; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} +; AVX512F-NEXT: vpermi2d %zmm3, %zmm5, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,19,25,31,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <1,7,13,19,25,31,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm17, %zmm18 +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} ; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} +; AVX512F-NEXT: vpermi2d %zmm3, %zmm5, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,8,14,20,26,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 @@ -4497,77 +4489,77 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm19, %zmm20 ; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} ; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 -; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm17 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 ; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm20 = <3,9,15,21,27,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-NEXT: 
vpermt2d %zmm1, %zmm20, %zmm21 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} -; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm21 -; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <3,9,15,21,27,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm21 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm19, %zmm22 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm21 {%k1} +; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm20 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm20 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm18, %zmm19 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = <20,26,0,6,12,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm23 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm19, %zmm24 ; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2} -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm21 -; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm20 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm22 +; 
AVX512F-NEXT: vmovdqa32 %zmm18, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm13, %zmm18, %zmm12 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <21,27,1,7,13,u,u,u> ; AVX512F-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} +; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 {%k2} ; AVX512F-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) @@ -4579,13 +4571,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm12 @@ -4594,42 +4586,42 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,18,24,30,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,6,12,18,24,30,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm9 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: 
vmovdqa64 %zmm9, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,19,25,31,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <1,7,13,19,25,31,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm5, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,8,14,20,26,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 @@ -4642,77 +4634,77 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm19, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm17 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <3,9,15,21,27,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-NEXT: # 
zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <3,9,15,21,27,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm21 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm19, %zmm22 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm20 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm19 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <20,26,0,6,12,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm23 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm19, %zmm24 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm20 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm19, 
%zmm22 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm12 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <21,27,1,7,13,u,u,u> ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 {%k2} ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) @@ -4747,35 +4739,34 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 528(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 544(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 496(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm2[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4787,12 +4778,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1248(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1248(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1264(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 1296(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4809,30 +4799,30 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm4 +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 432(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 
432(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 448(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 768(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 784(%rdi), %xmm0 @@ -4844,10 +4834,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 832(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1152(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1168(%rdi), %xmm0 @@ -4859,42 +4849,41 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm7 +; SSE-NEXT: movdqa 336(%rdi), %xmm14 ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm1 -; 
SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 720(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 736(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1056(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1072(%rdi), %xmm0 @@ -4902,505 +4891,543 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1104(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1104(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 1120(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1440(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1440(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1456(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1488(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1488(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1504(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd 
{{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: movdqa 208(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm10 -; SSE-NEXT: movdqa 592(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm11 -; SSE-NEXT: movdqa 640(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 976(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 1008(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1024(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1360(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; 
SSE-NEXT: movdqa 1392(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1408(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 240(%rdi), %xmm7 +; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm12 +; SSE-NEXT: movdqa 
592(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm11 +; SSE-NEXT: movdqa 640(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm5[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 960(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 976(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa 1008(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1024(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1344(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1360(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa 1392(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1408(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 
+; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 
= xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: 
movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; 
SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: movdqa 80(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] ; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,3,2,3] ; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 272(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 368(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa 416(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; SSE-NEXT: movdqa 560(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa 512(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa 512(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 656(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: movdqa 608(%rdi), %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 752(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa 704(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa 848(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa 800(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa 944(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: movdqa 896(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa 1040(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 992(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 848(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa 800(%rdi), %xmm2 +; SSE-NEXT: movdqa 1136(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm11[2,2,3,3] -; SSE-NEXT: movdqa 944(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 896(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1088(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa 1040(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa 992(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa 1232(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa 1184(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa 1136(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 1088(%rdi), %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa 1328(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1280(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 
= xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa 1232(%rdi), %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa 1424(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa 1184(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1376(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 1328(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 1280(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movdqa 1520(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1472(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 
= xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 1424(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 1376(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 1520(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 1472(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte 
Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5410,8 +5437,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5421,28 +5448,19 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5450,52 +5468,23 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5503,14 +5492,14 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5525,38 +5514,33 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 
= mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5566,29 +5550,25 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5598,7 +5578,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5618,38 +5598,23 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: pshufd $80, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5658,16 +5623,39 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm7 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,0,1,1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd $80, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5676,13 +5664,15 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5692,32 +5682,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 
16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm4 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5725,10 +5704,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5736,10 +5715,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5747,10 +5726,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5808,14 +5787,15 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5823,38 +5803,45 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5957,7 +5944,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) -; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%r8) @@ -5979,18 +5966,18 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movapd %xmm2, 240(%r9) ; SSE-NEXT: movapd %xmm3, 224(%r9) -; SSE-NEXT: movapd %xmm5, 208(%r9) -; SSE-NEXT: movapd %xmm7, 192(%r9) -; SSE-NEXT: movapd %xmm13, 176(%r9) -; SSE-NEXT: movapd %xmm8, 160(%r9) -; SSE-NEXT: movapd %xmm9, 144(%r9) -; SSE-NEXT: movapd %xmm11, 128(%r9) +; SSE-NEXT: movapd %xmm4, 208(%r9) +; SSE-NEXT: movapd %xmm5, 192(%r9) +; SSE-NEXT: movapd %xmm6, 176(%r9) +; SSE-NEXT: movapd %xmm7, 160(%r9) +; SSE-NEXT: movapd %xmm8, 144(%r9) +; SSE-NEXT: movapd %xmm10, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6008,11 +5995,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, 240(%rax) -; SSE-NEXT: movapd %xmm12, 224(%rax) -; SSE-NEXT: movapd %xmm15, 208(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) +; SSE-NEXT: movapd %xmm15, 240(%rax) +; SSE-NEXT: movapd %xmm14, 224(%rax) +; SSE-NEXT: movapd %xmm12, 208(%rax) +; SSE-NEXT: movapd %xmm13, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6035,44 +6021,45 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movapd %xmm4, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $2184, %rsp # imm = 0x888 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $2584, %rsp # imm = 0xA18 -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm5[0,0],ymm1[6,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,2],ymm0[6,4],ymm5[6,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[3],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6082,7 +6069,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6227,87 +6214,87 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[3],ymm9[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm1[0,0],ymm5[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[2,0],ymm1[0,0],ymm9[6,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,2],xmm5[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,2],xmm9[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm9[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm12[0,1] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm9[1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm9[1,0],ymm12[7,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm9[2,3],ymm14[6,4],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,0],xmm5[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm5[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm1[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[1,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[3,1],ymm9[1,3],ymm5[7,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm5[3,0],ymm12[1,0],ymm5[7,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm12[2,3],ymm14[6,4],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[1,0],xmm0[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] ; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[2,3],ymm14[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm13[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[0,2],xmm13[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1],ymm15[1,3],ymm0[7,5],ymm15[5,7] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[3,0],ymm11[1,0],ymm14[7,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[3,0],ymm11[1,0],ymm15[7,4],ymm11[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,0],ymm11[2,3],ymm13[6,4],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,0],xmm10[3,0] @@ -6319,8 +6306,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,0],ymm8[1,0],ymm11[7,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0],ymm8[1,0],ymm10[7,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,0],xmm7[3,0] @@ -6332,8 +6319,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm6[1,0],ymm7[7,4],ymm6[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,0],xmm4[3,0] @@ -6345,8 +6332,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,0],ymm3[1,0],ymm6[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm3[1,0],ymm4[7,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,0],xmm2[3,0] @@ -6359,243 +6346,244 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm1[1,0],ymm3[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm5[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm5[1,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm6[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm6[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm2[3,1],mem[1,3],ymm2[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm2[1,3],ymm1[7,5],ymm2[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm12[2,0],ymm1[4,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm6[2,0],ymm1[4,4],ymm6[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm9[2,1],mem[2,0],ymm9[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm5[2,1],mem[2,0],ymm5[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = 
ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm11[2,0],ymm1[4,4],ymm11[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm12[2,1],mem[2,0],ymm12[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm8[2,0],ymm1[4,4],ymm8[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps 
$38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm15[2,1],mem[2,0],ymm15[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm10[2,0],ymm1[4,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm6[2,0],ymm1[4,4],ymm6[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm5[2,0],ymm1[4,4],ymm5[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm9[2,0],ymm1[4,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm14[2,0],ymm1[4,4],ymm14[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm6[2,1],mem[2,0],ymm6[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm6[2,0],ymm13[4,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,1],ymm3[2,0],ymm4[6,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[2,0],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm15[2,0],ymm2[4,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm12[3,1],ymm0[4,5],ymm12[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[3,1],ymm14[2,1],ymm1[7,5],ymm14[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,0],ymm4[2,0],ymm1[4,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm1[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm3[2,1],ymm1[2,0],ymm3[6,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,0],ymm13[2,0],ymm2[4,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm9[0,1],mem[3,1],ymm9[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm9[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,1],ymm15[2,1],ymm9[7,5],ymm15[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm11[3,1],ymm9[4,5],ymm11[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; 
AVX1-ONLY-NEXT: # ymm12 = ymm0[3,1],mem[2,1],ymm0[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm10[3,1],ymm0[4,5],ymm10[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5] +; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm15[3,1],ymm2[4,5],ymm15[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,1],xmm7[3,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[2,1],ymm10[7,5],mem[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[3,1],ymm8[4,5],ymm6[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = 
ymm9[3,1],mem[2,1],ymm9[7,5],mem[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm13[3,1],ymm2[4,5],ymm13[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm3[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,1],ymm1[2,1],ymm7[7,5],ymm1[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm6[3,1],ymm13[4,5],ymm6[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm8[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm4[3,1],ymm0[4,5],ymm4[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm5[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,1],ymm11[2,1],ymm10[7,5],ymm11[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[3,1],ymm0[4,5],ymm9[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm14[3,1],ymm0[4,5],ymm14[7,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm8[2,1],ymm4[7,5],ymm8[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm8[2,1],ymm5[7,5],ymm8[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -6606,9 +6594,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm4[2,1],ymm3[7,5],ymm4[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -6624,19 +6612,18 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm4[0],ymm1[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm10[0,0],ymm2[6,4],ymm10[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm1[2,0],ymm10[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm9[0,0],ymm2[6,4],ymm9[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm1[2,0],ymm9[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6645,13 +6632,14 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm6[0,0],ymm3[6,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm1[2,0],ymm6[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6667,7 +6655,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4] @@ -6675,8 +6663,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6687,17 +6675,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,0],ymm12[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm4[0,0],ymm12[6,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,0],ymm4[0,0],ymm13[6,4],ymm4[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm1[2,0],ymm4[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps 
$243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6706,17 +6693,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm11[0],ymm1[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,0],ymm10[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,0],ymm3[0,0],ymm11[6,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm3[0,0],ymm12[6,4],ymm3[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6730,14 +6717,14 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm2[0,0],ymm14[6,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,0],ymm2[0,0],ymm11[6,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6748,230 +6735,230 @@ define void @load_i32_stride6_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm1[0,0],ymm10[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,2],ymm8[2,0],ymm1[4,6],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm9[1],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm7[1],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,1],ymm14[2,0],ymm7[4,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,0],ymm0[0,0],ymm8[6,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[0,2],ymm15[2,0],ymm0[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[3,0],ymm9[1,0],ymm7[7,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,3],ymm14[2,0],ymm9[4,7],ymm14[6,4] +; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm9[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,1],ymm15[2,0],ymm9[4,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,2],ymm13[2,0],ymm0[4,6],ymm13[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[3,0],ymm10[1,0],ymm9[7,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,3],ymm13[2,0],ymm10[4,7],ymm13[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm7[1,0],ymm10[7,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm13[2,0],ymm7[4,7],ymm13[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm5[1,0],ymm10[7,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm13[2,0],ymm5[4,7],ymm13[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 
# 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm12[2,0],ymm4[4,7],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,1],ymm13[2,0],ymm10[5,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm11[2,0],ymm3[4,7],ymm11[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm10[1,1],ymm12[2,0],ymm10[5,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[3,0],ymm2[1,0],ymm14[7,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm10[2,0],ymm2[4,7],ymm10[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[1,3],ymm11[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,1],ymm11[2,0],ymm12[5,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
xmm10 = xmm10[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm8[2,0],ymm1[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,1],ymm10[2,0],ymm11[5,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm0[1,0],ymm6[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm6[2,0],ymm0[4,7],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1],ymm15[2,0],ymm9[5,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm6[1,0],ymm7[7,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,3],ymm7[2,0],ymm6[4,7],ymm7[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[1,1],ymm14[2,0],ymm9[5,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm5[1,0],ymm6[7,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm6[2,0],ymm5[4,7],ymm6[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,1],ymm8[2,0],ymm10[5,5],ymm8[6,4] +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1],ymm7[2,0],ymm9[5,5],ymm7[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm13[3,0],ymm4[1,0],ymm13[7,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm5[2,0],ymm4[4,7],ymm5[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, 
%xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r9) +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1],ymm6[2,0],ymm9[5,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm12[3,0],ymm3[1,0],ymm12[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm5[2,0],ymm3[4,7],ymm5[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r9) +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1],ymm6[2,0],ymm9[5,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,0],ymm2[1,0],ymm11[7,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = 
ymm2[0,3],ymm5[2,0],ymm2[4,7],ymm5[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r9) +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1],ymm6[2,0],ymm9[5,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[3,0],ymm1[1,0],ymm10[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm5[2,0],ymm1[4,7],ymm5[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1],ymm6[2,0],ymm9[5,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[3,0],ymm0[1,0],ymm8[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm5[2,0],ymm0[4,7],ymm5[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1],ymm6[2,0],ymm8[5,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rsi) +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps 
%ymm5, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) ; AVX1-ONLY-NEXT: addq $2584, %rsp # imm = 0xA18 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -6985,29 +6972,29 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm9 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm7[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7018,7 +7005,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 @@ -7027,7 +7014,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -7043,7 +7030,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1 @@ -7052,7 +7039,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -7068,7 +7055,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1 @@ -7077,7 +7064,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps 
%ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm1 @@ -7085,21 +7072,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm1 @@ -7112,8 +7099,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 @@ -7121,7 +7108,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 864(%rdi), %ymm1 @@ -7129,21 +7116,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -7151,79 +7138,79 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm9[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[1,3,2,3,5,7,6,7] 
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm6 = <1,7,5,u> +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte 
Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm6, %ymm8 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -7234,17 +7221,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = 
ymm3[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $0, (%rsp), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -7257,172 +7243,153 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; 
AVX2-SLOW-NEXT: # ymm8 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = 
ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $68, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -7437,9 +7404,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = 
ymm8[0,1,2],ymm3[3],ymm8[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7447,68 +7414,87 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -7545,142 +7531,143 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm8 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = 
[0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm5[2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = 
ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1424(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1424(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm3 -; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -7689,166 +7676,167 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = 
ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = 
ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 160(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 160(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) ; AVX2-SLOW-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride6_vf64: ; 
AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2504, %rsp # imm = 0x9C8 +; AVX2-FAST-NEXT: subq $2472, %rsp # imm = 0x9A8 ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7860,24 +7848,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm9 = <0,6,4,u> ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm7[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7888,7 +7876,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 @@ -7897,11 +7885,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1056(%rdi), %ymm1 -; 
AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] @@ -7913,7 +7901,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1 @@ -7922,7 +7910,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -7933,12 +7921,12 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1376(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1 @@ -7947,7 +7935,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 @@ -7955,21 +7943,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps (%rdi), 
%ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm1 @@ -7982,8 +7970,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 @@ -7991,7 +7979,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 864(%rdi), %ymm1 @@ -7999,21 +7987,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1 ; 
AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -8021,358 +8009,357 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm9[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm6 = <1,7,5,u> +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = 
ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm6, %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm8 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [2,0,6,4,2,0,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [2,0,6,4,2,0,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; 
AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2,3],ymm2[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2,3],ymm2[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1,2,3],ymm0[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; 
AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1,2,3],ymm7[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm9 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = 
ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm8 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm6[2,3],ymm13[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm7[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm7[1,2,3],ymm2[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = 
mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1,2,3],ymm1[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3],ymm7[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermilps $255, (%rsp), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = 
ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -8398,9 +8385,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -8409,142 +8396,143 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2] -; AVX2-FAST-NEXT: # xmm2 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm8 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # 
ymm0 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm5[2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm11[2,3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -8553,160 +8541,161 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-FAST-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload 
-; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte 
Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] -; 
AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rdx) 
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm13, 160(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm14, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-NEXT: addq $2472, %rsp # imm = 0x9A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -8719,29 +8708,29 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm9 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm7[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] 
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8752,7 +8741,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 @@ -8761,7 +8750,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -8777,7 +8766,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1 @@ -8786,7 +8775,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -8802,7 +8791,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1 @@ -8811,7 +8800,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm1 @@ -8819,21 +8808,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm1 @@ -8846,8 +8835,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 @@ -8855,7 +8844,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 ; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 864(%rdi), %ymm1 @@ -8863,21 +8852,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -8885,79 +8874,79 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm9[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm6 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = 
mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm6, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -8968,17 +8957,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, (%rsp), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -8991,172 +8979,153 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 
32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = 
ymm7[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm11[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] 
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps 
$244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -9171,9 +9140,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9181,68 +9150,87 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] -; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9279,303 +9267,305 @@ define void @load_i32_stride6_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm5[2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = 
ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1424(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 
= ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps 
(%rsp), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] 
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1424(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 
32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: 
vmovaps %ymm14, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -9583,207 +9573,206 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i32_stride6_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; 
AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm12, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; 
AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm6, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm6, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm3, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm5, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm6, 
%zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm22, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm12, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm13, %zmm2 ; AVX512F-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm22, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm12, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm13, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm22, %zmm28 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm29 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,18,24,30,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 -; 
AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm20 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm4, %zmm21 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,9,15,21,27,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm16, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm26 = <21,27,1,7,13,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm16, %zmm31 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm26, %zmm6 ; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm2, %zmm8 ; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 @@ -9791,13 +9780,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm16, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm26, %zmm10 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9815,9 
+9804,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9827,9 +9816,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9839,83 +9828,84 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm22 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 
%zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm0 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm24 {%k1} ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; 
AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsi) +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm12, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm17, (%r9) @@ -9925,7 +9915,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -9933,207 +9923,206 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 
1024(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-NEXT: vpermt2d 
%zmm3, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm12, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = 
[0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm22, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm12, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm22, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm12, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512BW-NEXT: 
vpermi2d %zmm19, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm22, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,18,24,30,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm20 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm21 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,9,15,21,27,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <21,27,1,7,13,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 
64(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm26, %zmm6 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 @@ -10141,13 +10130,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm26, %zmm10 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10165,9 +10154,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10177,9 +10166,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: 
vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10189,83 +10178,84 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm24 {%k1} ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} +; AVX512BW-NEXT: 
vmovdqa64 %zmm2, 192(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r9) @@ -10275,7 +10265,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 7c98c94de19be0..74cbf72357d874 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -212,58 +212,58 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm11[0],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] 
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,1,1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm5, (%rsi) +; SSE-NEXT: movapd %xmm3, (%rsi) ; SSE-NEXT: movapd %xmm6, (%rdx) -; SSE-NEXT: movapd %xmm11, (%rcx) -; SSE-NEXT: movapd %xmm8, (%r8) -; SSE-NEXT: movapd %xmm4, (%r9) +; SSE-NEXT: movapd %xmm8, (%rcx) +; SSE-NEXT: movapd %xmm9, (%r8) +; SSE-NEXT: movapd %xmm5, (%r9) ; SSE-NEXT: movapd %xmm0, (%rdi) ; SSE-NEXT: movapd %xmm2, (%rax) ; SSE-NEXT: retq @@ -334,19 +334,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: 
vbroadcastss 84(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm9 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm3 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] @@ -372,8 +372,8 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] ; AVX2-SLOW-NEXT: vmovaps %xmm2, (%rsi) -; AVX2-SLOW-NEXT: vmovaps %xmm3, (%rdx) -; AVX2-SLOW-NEXT: vmovaps %xmm4, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %xmm4, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %xmm3, (%rcx) ; AVX2-SLOW-NEXT: vmovaps %xmm7, (%r8) ; AVX2-SLOW-NEXT: vmovaps %xmm9, (%r9) ; AVX2-SLOW-NEXT: vmovaps %xmm5, (%r10) @@ -451,19 +451,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 84(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 8(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm3 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] @@ -489,8 +489,8 @@ 
define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm5, (%r10) @@ -549,144 +549,142 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i32_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movdqa 144(%rdi), %xmm9 +; SSE-NEXT: movdqa 144(%rdi), %xmm13 ; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm14 ; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 192(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm9 ; SSE-NEXT: movdqa 160(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm15 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm12[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] +; SSE-NEXT: movdqa 208(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm9[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm14[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = 
xmm5[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm5[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm14[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm8, (%rcx) -; SSE-NEXT: movapd %xmm13, 16(%rcx) -; SSE-NEXT: movapd %xmm15, (%r8) -; SSE-NEXT: movapd %xmm7, 16(%r8) -; SSE-NEXT: movapd %xmm2, (%r9) -; SSE-NEXT: movapd %xmm9, 16(%r9) +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movapd %xmm10, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movapd %xmm15, (%rcx) +; SSE-NEXT: movapd %xmm11, 16(%rcx) +; SSE-NEXT: movapd %xmm4, (%r8) +; SSE-NEXT: movapd %xmm8, 16(%r8) +; SSE-NEXT: movapd %xmm0, (%r9) +; SSE-NEXT: movapd %xmm13, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, (%rax) -; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: movapd %xmm12, (%rax) +; SSE-NEXT: movapd %xmm9, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm3, (%rax) -; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm5, 16(%rax) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; @@ -898,7 +896,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX2-FAST-NEXT: 
vmovdqa 64(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 @@ -922,17 +920,17 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,0,7,0,1,0,7,0] ; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm11 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 @@ -944,13 +942,13 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7] ; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] @@ -973,17 +971,17 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = 
ymm8[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm12, %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm12, %ymm9 ; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -992,7 +990,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rsi) ; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm10, (%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r10) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) @@ -1182,47 +1180,49 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 -; SSE-NEXT: movdqa 272(%rdi), %xmm5 -; SSE-NEXT: movdqa 224(%rdi), %xmm15 +; SSE-NEXT: subq $408, %rsp # imm = 0x198 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm3 +; SSE-NEXT: movdqa 224(%rdi), %xmm4 ; SSE-NEXT: movdqa 240(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm5 ; SSE-NEXT: movdqa 16(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm9 -; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa 192(%rdi), %xmm10 ; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm14 ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: movdqa %xmm11, %xmm12 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm11 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm15 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm1 @@ -1231,241 +1231,222 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm8 -; SSE-NEXT: movdqa 384(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm2 +; SSE-NEXT: movdqa 384(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa 144(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa 368(%rdi), %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] 
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 288(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movdqa 400(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa 320(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; SSE-NEXT: movdqa %xmm12, %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa 432(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] -; SSE-NEXT: 
movdqa %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm7[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm7[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 
16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd $80, (%rsp), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm7[0],xmm14[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1498,9 +1479,10 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm9, 48(%r9) -; SSE-NEXT: movapd %xmm14, 32(%r9) -; SSE-NEXT: movapd %xmm12, (%r9) +; SSE-NEXT: movapd %xmm11, 48(%r9) +; SSE-NEXT: movapd %xmm12, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1511,40 +1493,40 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, 48(%rax) -; SSE-NEXT: movapd %xmm6, 32(%rax) -; SSE-NEXT: movapd %xmm7, (%rax) -; SSE-NEXT: movapd %xmm10, 16(%rax) -; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: movapd %xmm14, 48(%rax) +; SSE-NEXT: movapd %xmm5, 32(%rax) +; SSE-NEXT: movapd %xmm6, (%rax) +; SSE-NEXT: movapd %xmm9, 16(%rax) +; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -1553,7 +1535,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] @@ -1564,169 +1546,169 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm5[2,2],ymm7[5,5],ymm5[6,6] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm6[2,2],ymm7[5,5],ymm6[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm13[1],xmm11[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm6[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 
= ymm1[2,3],ymm5[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm2[3,3],ymm1[4,4],ymm2[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[2,2],ymm15[5,5],ymm3[6,6] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm4[2,2],ymm15[5,5],ymm4[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0],xmm10[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0],xmm10[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm4[0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm3[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm9[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm6[0,3],ymm14[7,5],ymm6[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm4[0,3],ymm12[7,5],ymm4[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm7[0,0],ymm5[5,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm7[0,0],ymm6[5,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,1],ymm6[1,3],ymm2[4,5],ymm6[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm6[0,1],ymm5[1,3],ymm6[4,5],ymm5[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm7[2,0],ymm1[4,6],ymm7[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm15[0,0],ymm3[5,4],ymm15[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,1],ymm3[0,2],ymm15[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1],ymm4[1,3],ymm5[4,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[0,2],ymm8[2,0],ymm0[4,6],ymm8[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm6[2,0],ymm2[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm3[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = 
ymm4[1,0],ymm15[0,0],ymm4[5,4],ymm15[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,1],ymm2[0,2],ymm15[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1],ymm3[1,3],ymm15[4,5],ymm3[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,2],ymm4[2,0],ymm0[4,6],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm2[0,0],ymm1[7,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[0,0],ymm0[7,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,0],ymm0[6,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,0],ymm3[2,0],ymm15[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = mem[0],xmm8[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,1],ymm6[3,3],ymm2[6,5],ymm6[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,1],ymm5[3,3],ymm6[6,5],ymm5[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[0,0],ymm13[1,0],ymm14[4,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1],xmm3[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,1],ymm4[3,3],ymm5[6,5],ymm4[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm3[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm12[1,0],ymm11[4,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm6[0,0],ymm2[7,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,0],ymm10[1,0],ymm12[4,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[0,1],xmm5[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[2,1],ymm3[3,3],ymm15[6,5],ymm3[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm11[1,0],ymm13[4,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[0,1],xmm5[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,0],ymm5[0,0],ymm6[7,4],ymm5[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm1[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,0],ymm6[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm6[0,0],ymm5[7,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,0],ymm6[4,5],ymm5[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,0],ymm12[2,0],ymm11[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[1,0],ymm10[2,0],ymm12[5,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[3,0],ymm6[0,0],ymm15[7,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,0],ymm8[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[1,0],ymm11[2,0],ymm13[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -1748,11 +1730,11 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 
%ymm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1765,29 +1747,28 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,6,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm10[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] @@ -1809,7 +1790,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 
= ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] @@ -1818,130 +1799,130 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,0] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,3,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = 
ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm13[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm4[1,3],ymm2[4,6],ymm4[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],mem[3] -; AVX2-SLOW-NEXT: 
vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm11 ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpbroadcastd 324(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3] ; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vpbroadcastd 216(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm3[3] -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 216(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm11[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm12[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vpbroadcastd 440(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 136(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 136(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 80(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm6 = ymm6[0],ymm11[1],ymm6[2,3,4],ymm11[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 360(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] @@ -1985,38 +1966,39 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FAST-LABEL: load_i32_stride7_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm6[6],ymm14[7] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm12[1] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm5[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm0 @@ -2032,153 
+2014,150 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm7 = ymm9[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm13[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm13[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm4[1],ymm13[2,3,4],ymm4[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm5[1],ymm12[2,3,4],ymm5[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm4 = 
ymm1[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm4 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm11[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2],ymm14[1,3],ymm15[4,6],ymm14[5,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2],ymm13[1,3],ymm15[4,6],ymm13[5,7] ; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 ; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm13[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm15[1],ymm13[2,3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] ; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) @@ -2192,20 +2171,19 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FAST-NEXT: vmovups (%rsp), 
%ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -2217,29 +2195,28 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] @@ -2261,7 +2238,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] @@ -2270,130 +2247,130 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,0] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm10 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm10 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm13[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 
= xmm10[0],xmm13[1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm4[1,3],ymm2[4,6],ymm4[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm0 = 
[0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 324(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 436(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 216(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 216(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm12[3] ; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 440(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 136(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 136(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2,3,4],ymm11[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 360(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] @@ -2439,115 +2416,115 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 
384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 ; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm6, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm6, %zmm12 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,8,15,22,29,u,u,u> -; AVX512F-NEXT: vpermi2d 
%zmm2, %zmm0, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm14 ; AVX512F-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm6, %zmm15 ; AVX512F-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm7, %zmm6, %zmm9 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm9, %zmm15 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <19,26,1,8,15,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 ; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, 
%zmm6 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 -; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [5,12,19,26] +; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} +; AVX512F-NEXT: vpermt2d %zmm4, %zmm6, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] +; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2556,115 +2533,115 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-NEXT: # 
zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,8,15,22,29,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm14 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm15 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = 
[26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm9 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <19,26,1,8,15,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [5,12,19,26] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm11, 
%zmm8 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2689,109 +2666,106 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1160, %rsp # imm = 0x488 +; SSE-NEXT: subq $1176, %rsp # imm = 0x498 ; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm14 +; SSE-NEXT: movdqa 608(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 560(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm3 -; SSE-NEXT: movdqa 608(%rdi), %xmm4 -; SSE-NEXT: movdqa 560(%rdi), %xmm10 -; SSE-NEXT: movdqa 576(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm7 ; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm15 ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm9 -; SSE-NEXT: movdqa 496(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 528(%rdi), %xmm5 +; SSE-NEXT: movdqa 496(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 384(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 784(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm3 -; SSE-NEXT: movdqa 832(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm9 +; SSE-NEXT: movdqa 832(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 672(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2801,131 +2775,136 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: 
punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa 752(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa 720(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa 592(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,1,1] -; 
SSE-NEXT: movdqa 256(%rdi), %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[1,1,1,1] ; SSE-NEXT: movdqa 704(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2934,288 +2913,283 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 624(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm1 -; 
SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 512(%rdi), %xmm3 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 848(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 736(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 736(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: 
pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), 
%xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 656(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 768(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 880(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movapd 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movapd %xmm1, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3225,16 +3199,16 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3244,7 +3218,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm7 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3254,29 +3229,30 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: 
pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -3285,8 +3261,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -3349,13 +3324,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r8) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r9) @@ -3372,9 +3347,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movapd %xmm13, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rax) -; SSE-NEXT: movapd %xmm15, 
96(%rax) +; SSE-NEXT: movaps %xmm1, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3396,17 +3371,17 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm7, 32(%rax) ; SSE-NEXT: movapd %xmm8, 16(%rax) ; SSE-NEXT: movapd %xmm9, (%rax) -; SSE-NEXT: addq $1160, %rsp # imm = 0x488 +; SSE-NEXT: addq $1176, %rsp # imm = 0x498 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX1-ONLY-NEXT: subq $1480, %rsp # imm = 0x5C8 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 @@ -3415,12 +3390,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -3438,12 +3414,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 @@ -3451,10 +3428,9 @@ define void @load_i32_stride7_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3468,20 +3444,21 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm11[1] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3495,10 +3472,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3513,11 +3492,11 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm6[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 @@ -3533,20 +3512,21 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0],xmm8[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm14[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0],ymm2[3,3],ymm12[4,4],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3],ymm7[0,1] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0],ymm2[3,3],ymm10[4,4],ymm2[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm4[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -3554,7 +3534,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm0[2,2],ymm14[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3568,321 +3548,329 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm0[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0],ymm5[3,3],ymm2[4,4],ymm5[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm11[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm8[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm8[2,2],ymm3[5,5],ymm8[6,6] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm4[2,2],ymm11[5,5],ymm4[6,6] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm2[0,1] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,0],ymm14[3,3],ymm8[4,4],ymm14[7,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0],xmm2[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm15[2,3],ymm5[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0],ymm13[3,3],ymm15[4,4],ymm13[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm13[1,2],xmm4[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps 
{{.*#+}} xmm14 = zero,xmm14[1,2],xmm3[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[0,3],ymm13[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[2,1],ymm13[2,0],ymm11[6,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm10[0,3],ymm13[7,5],ymm10[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[2,1],ymm13[2,0],ymm7[6,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm8[0,3],ymm13[7,5],ymm8[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[2,1],ymm13[2,0],ymm9[6,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm13[0,1,2],xmm4[3] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm13[0,3],ymm14[7,5],ymm13[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[2,1],ymm14[2,0],ymm6[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm11[0,3],ymm14[7,5],ymm11[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm5[2,1],ymm14[2,0],ymm5[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,1],ymm14[0,3],ymm8[7,5],ymm14[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm4[0,3],ymm14[7,5],ymm4[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,1],ymm14[2,0],ymm1[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1],ymm8[2,0],ymm4[6,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,1],ymm4[0,3],ymm9[7,5],ymm4[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[2,1],ymm9[2,0],ymm7[6,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,1],ymm0[0,2],ymm11[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,0],ymm6[0,0],ymm13[5,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,1],ymm0[0,2],ymm6[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm9[1,3],ymm1[4,5],ymm9[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[0,2],ymm8[2,0],ymm3[4,6],ymm8[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vshufps $212, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm1[0,1],mem[1,3],ymm1[4,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[0,2],ymm9[2,0],ymm13[4,6],ymm9[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,0],ymm7[0,0],ymm10[5,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,1],ymm0[0,2],ymm7[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm11[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm8[1,3],ymm1[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,2],ymm10[2,0],ymm12[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,0],ymm5[0,0],ymm11[5,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[3,1],ymm0[0,2],ymm5[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm8[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1],ymm11[1,3],ymm2[4,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,2],ymm9[2,0],ymm10[4,6],ymm9[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm4[0,0],ymm14[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm2[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm7[0,0],ymm4[5,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,1],ymm0[0,2],ymm7[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm14[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,1],ymm13[1,3],ymm7[4,5],ymm13[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[0,2],ymm10[2,0],ymm6[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1],ymm12[1,3],ymm1[4,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,2],ymm7[2,0],ymm3[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,1],ymm0[0,2],ymm2[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm6[0,0],ymm0[5,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,1],ymm0[0,2],ymm6[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm15[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,1],ymm7[1,3],ymm5[4,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,2],ymm6[2,0],ymm15[4,6],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1],ymm5[1,3],ymm3[4,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,2],ymm6[2,0],ymm9[4,6],ymm6[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm0[0,0],ymm12[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm8[2,0],ymm1[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,0],ymm0[0,0],ymm10[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm11[2,0],ymm2[5,4],ymm11[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,0],ymm9[2,0],ymm3[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm8[2,0],ymm7[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,0],ymm12[2,0],ymm10[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[1,0],ymm11[2,0],ymm14[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,0],ymm10[2,0],ymm2[6,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,0],ymm12[0,0],ymm15[7,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm5[1,0],ymm7[2,0],ymm5[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,0],ymm12[2,0],ymm1[6,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm0[0,0],ymm9[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm5[2,0],ymm14[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,1],ymm9[3,3],ymm3[6,5],ymm9[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm1[0],mem[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[2,1],ymm8[3,3],ymm7[6,5],ymm8[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[1,0],ymm12[2,0],ymm15[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,0],mem[1,0],ymm1[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,1],xmm7[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,1],ymm8[3,3],ymm3[6,5],ymm8[7,7] +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0],xmm7[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm13[0],xmm8[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm12[1,0],ymm9[4,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm14[0,0],mem[1,0],ymm14[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = 
xmm15[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm15[0,1],xmm9[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[3,3],ymm0[6,5],ymm11[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0],xmm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,1],ymm11[3,3],ymm10[6,5],ymm11[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,0],mem[1,0],ymm1[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm2[0,0],mem[1,0],ymm2[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm15[0,1],xmm11[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,1],ymm5[3,3],ymm2[6,5],ymm5[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,0],ymm0[2,0],ymm2[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[0,0],ymm13[1,0],ymm5[4,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] -; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,0],ymm0[2,0],ymm12[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,0],ymm15[1,0],ymm6[4,4],ymm15[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[0,1],xmm1[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm12[2,0],ymm9[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm14[1,0],mem[2,0],ymm14[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm13[2,0],ymm5[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = 
ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm15[2,0],ymm6[5,4],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -3890,104 +3878,99 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[0,0],ymm4[7,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) 
+; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX1-ONLY-NEXT: addq $1480, %rsp # imm = 0x5C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm15 ; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -4001,11 +3984,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4021,9 +4005,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -4037,14 +4020,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -4065,12 +4048,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4087,8 +4070,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] @@ -4100,22 +4083,21 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm13[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4124,387 +4106,386 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm1[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm4[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm10[1],xmm15[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] -; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] +; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm5[1],mem[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] 
-; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm9[1,3],ymm5[4,6],ymm9[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm4[1],ymm13[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7] +; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm13 +; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm4[1,3],ymm14[4,6],ymm4[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastd 548(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 660(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm10 -; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm11, %ymm3 -; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; 
AVX2-SLOW-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpbroadcastd 324(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastd 772(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 884(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vpbroadcastd 664(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: 
vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vpbroadcastd 440(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vpbroadcastd 888(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermd 640(%rdi), %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 528(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = 
mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermd 864(%rdi), %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 752(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm13 +; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm14 +; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: 
vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; 
AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm11, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 80(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm11, %ymm8 ; 
AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 304(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = 
xmm9[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%r8) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rax) ; AVX2-SLOW-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1192, %rsp # imm = 0x4A8 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 
256(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm9[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] @@ -4517,13 +4498,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm7[6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm7[6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4535,11 +4518,11 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -4550,10 +4533,10 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6],ymm14[7] ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm2 @@ -4572,210 +4555,212 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,7,7,5,4,7,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm13[1],ymm8[2,3,4],ymm13[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm13[1],ymm10[2,3,4],ymm13[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; 
AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm14[1],mem[2,3,4],ymm14[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm0[1],xmm11[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] -; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] -; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] +; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm13[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm1[3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm5 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm14[0,2],ymm5[1,3],ymm14[4,6],ymm5[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm9 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm13[1,3],ymm12[4,6],ymm13[5,7] -; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm4[1],mem[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm5[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = 
ymm6[0],mem[1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,2],ymm10[1,3],ymm14[4,6],ymm10[5,7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm8[1,3],ymm3[4,6],ymm8[5,7] -; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,2],ymm12[1,3],ymm8[4,6],ymm12[5,7] +; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm12[1,3],ymm10[4,6],ymm12[5,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,2],ymm1[1,3],ymm13[4,6],ymm1[5,7] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm8[1,3],ymm4[4,6],ymm8[5,7] ; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd 548(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 ; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -4788,152 +4773,149 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; 
AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastd 772(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm14[1],ymm4[2,3,4],ymm14[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, 
%xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1,2],xmm15[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1,2],xmm14[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte 
Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm7 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm7 = 
xmm7[0],mem[1],xmm7[2,3] -; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm9 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm9, 96(%rsi) @@ -4961,7 +4943,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) @@ -4970,45 +4952,42 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX2-FAST-NEXT: addq $1192, %rsp # imm = 0x4A8 +; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf32: 
; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -5022,11 +5001,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -5042,9 +5022,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -5058,14 +5037,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -5086,12 +5065,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5108,8 +5087,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] @@ -5121,22 +5100,21 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm13[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5145,551 +5123,550 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm1[1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastd 652(%rdi), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm4[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm10[1],xmm15[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm5[1],mem[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm9[1,3],ymm5[4,6],ymm9[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm4[1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm4[1,3],ymm14[4,6],ymm4[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 548(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 660(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm11, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 324(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm11, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastd 436(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 772(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm11, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 884(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 664(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 440(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 888(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd 640(%rdi), %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 528(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte 
Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd 864(%rdi), %ymm11, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 752(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = 
mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm11, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%rdi), 
%ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm11, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 304(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1224, %rsp # imm = 0x4C8 -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX512F-LABEL: load_i32_stride7_vf32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512F-LABEL: load_i32_stride7_vf32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 
576(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm16, %zmm17 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <0,7,14,21,28,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm9, %zmm8 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm17, %zmm18 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm19, %zmm18 ; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 +; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 -; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 {%k2} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,8,15,22,29,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm17 = <1,8,15,22,29,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm17, %zmm16 ; AVX512F-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm19, %zmm20 ; 
AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512F-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} +; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm17 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <18,25,0,7,14,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm22 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm20 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm20, %zmm19 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm19 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm22, %zmm23 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm24, %zmm23 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} +; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm22 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm20 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = <19,26,1,8,15,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm24, %zmm25 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <19,26,1,8,15,u,u,u> +; AVX512F-NEXT: 
vmovdqa64 %zmm12, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm23, %zmm22 +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm22 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm25, %zmm26 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512F-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm27, %zmm26 +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm22 {%k1} +; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm25 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm24 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm23 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm24, %zmm25 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] +; AVX512F-NEXT: vpermt2d %zmm11, %zmm26, %zmm25 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm28 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512F-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512F-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512F-NEXT: vpermt2d %zmm12, %zmm29, %zmm30 +; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm27 +; AVX512F-NEXT: vpermi2d 
%zmm0, %zmm1, %zmm29 +; AVX512F-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm18, %zmm24 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm11, %zmm26, %zmm24 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm30, %zmm31 ; AVX512F-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 -; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm29 {%k1} +; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm26, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm27 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm30 +; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm27, %zmm24 +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm24 {%k1} +; AVX512F-NEXT: vpermt2d %zmm14, %zmm21, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm11, %zmm14, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm11, %zmm13 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [6,13,20,27] +; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm10 +; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm13, %zmm10 +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm10 {%k1} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm21, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm4 +; 
AVX512F-NEXT: vpermt2d %zmm3, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r10) +; AVX512F-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%r10) ; AVX512F-NEXT: vmovdqa64 %zmm29, (%r10) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) @@ -5700,188 +5677,188 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,7,14,21,28,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm9, %zmm8 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm17, %zmm18 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = 
[0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm19, %zmm18 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,8,15,22,29,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = <1,8,15,22,29,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm17, %zmm16 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm20 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <18,25,0,7,14,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm22 +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm20, %zmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm19 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm22, %zmm23 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm24, %zmm23 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <19,26,1,8,15,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = <19,26,1,8,15,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm23, %zmm22 +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm26 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm27, %zmm26 +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm22 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm24 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = 
[0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm25 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm26, %zmm25 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm28 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm29, %zmm30 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm27 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm18, %zmm24 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm26, %zmm24 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm30, %zmm31 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512BW-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm29 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm26, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm27 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm30 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm27, %zmm24 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm24 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm21, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm11, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = [6,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm10 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm13, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm10 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) @@ -5909,48 +5886,48 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2456, %rsp # imm 
= 0x998 -; SSE-NEXT: movdqa 1088(%rdi), %xmm3 +; SSE-NEXT: movdqa 1088(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1056(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1008(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1024(%rdi), %xmm5 +; SSE-NEXT: movdqa 1024(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 608(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm13 -; SSE-NEXT: movdqa 608(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 560(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm7 +; SSE-NEXT: movdqa 576(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm15 +; SSE-NEXT: movdqa 160(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1456(%rdi), %xmm1 @@ 
-5967,667 +5944,678 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 464(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 912(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 528(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa 496(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 976(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 896(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 944(%rdi), %xmm0 +; SSE-NEXT: movdqa 912(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: 
movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 976(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 944(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1344(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 1360(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1424(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 1424(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1392(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm11 ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm4 -; SSE-NEXT: movdqa 384(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm8 +; SSE-NEXT: movdqa 384(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movdqa 784(%rdi), %xmm14 ; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movdqa 864(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1232(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 832(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1232(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1248(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movdqa 1312(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1280(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1680(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1680(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1696(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movdqa 1760(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1728(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
movdqa 224(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa 304(%rdi), %xmm10 ; SSE-NEXT: movdqa 272(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movdqa 752(%rdi), %xmm14 -; SSE-NEXT: movdqa 720(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa 752(%rdi), %xmm6 +; SSE-NEXT: movdqa 720(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1120(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1120(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1136(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: movdqa 1200(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1168(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa 
%xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa 1200(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1168(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1568(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1568(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1584(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: movdqa 1648(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movdqa 1648(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1616(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm4[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 592(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movdqa (%rsp), %xmm13 
# 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa 704(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1040(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 928(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: movdqa 592(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1264(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 1152(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1488(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1376(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1712(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte 
Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1600(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa 480(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm9[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 704(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movdqa 1040(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movdqa 928(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movdqa 1264(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa 1152(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: 
movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movdqa 1488(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; SSE-NEXT: pshufd $85, (%rsp), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1376(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa 1712(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; 
SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 736(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1600(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 960(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 1072(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 1184(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 1296(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 736(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 960(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 1072(%rdi), %xmm1 +; 
SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 1184(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 1296(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, (%rsp), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 1408(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 1408(%rdi), %xmm0 +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 1520(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 1520(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 1632(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 1632(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; SSE-NEXT: movdqa 1744(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: 
# xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 992(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 
1104(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1328(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -6635,10 +6623,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 1440(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] @@ -6647,8 +6635,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 1552(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6657,21 +6645,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1664(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1776(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6679,19 +6666,19 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6701,41 +6688,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -6744,8 +6717,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 
; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -6754,7 +6726,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -6763,90 +6735,115 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 
-; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 
16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -6855,35 +6852,30 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; 
SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6902,12 +6894,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6916,16 +6907,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6934,9 +6925,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] @@ -6944,43 +6934,38 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6988,23 +6973,22 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7027,20 +7011,24 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm14 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] @@ -7056,7 +7044,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm11 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7077,11 +7066,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm8[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] @@ -7120,7 +7109,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7135,13 +7124,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7368,7 +7358,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3176, %rsp # imm = 0xC68 +; AVX1-ONLY-NEXT: subq $3192, %rsp # imm = 0xC78 ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 @@ -7385,8 +7375,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 @@ -7408,14 +7398,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 +; 
AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] @@ -7429,14 +7420,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7460,10 +7452,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7476,21 +7469,22 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7509,15 +7503,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7530,30 +7525,29 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7567,20 +7561,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7590,8 +7584,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 @@ -7610,33 +7604,32 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0],xmm4[1],xmm12[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0],ymm1[3,3],ymm3[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm0[2,2],ymm14[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm9[1],xmm11[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 @@ -7646,24 +7639,23 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm5[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1],ymm0[2,2],ymm10[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] @@ -7676,7 +7668,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm0[2,2],ymm15[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7698,13 +7690,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm15[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm3[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 @@ -7721,11 +7711,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm8[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 @@ -7735,261 +7728,263 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm0[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm4[3,3],ymm1[4,4],ymm4[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm10[2] +; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm0[2,2],ymm8[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0],xmm3[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm12 = ymm12[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0],xmm3[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm4[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm10[3,3],ymm1[4,4],ymm10[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm10[1,2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3],ymm0[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,0],ymm9[3,3],ymm1[4,4],ymm9[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm9[1,2],xmm15[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 
# 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: 
vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # 
xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[2,1],ymm12[2,0],ymm6[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm6[0,3],ymm12[7,5],ymm6[4,7] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm5[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[2,1],ymm14[2,0],ymm4[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[2,1],ymm14[2,0],ymm4[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[2,1],ymm14[2,0],ymm4[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm10[3,1],mem[0,3],ymm10[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[2,1],ymm14[2,0],ymm13[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm10[3,1],ymm2[0,3],ymm10[7,5],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[2,1],ymm14[2,0],ymm8[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm12[0,0],ymm10[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,1],ymm8[1,3],ymm12[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,2],ymm15[2,0],ymm8[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,0],ymm12[0,0],ymm8[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm5[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,1],ymm1[1,3],ymm5[4,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,0],ymm10[0,0],ymm8[5,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1],ymm9[0,2],ymm10[7,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,1],ymm1[1,3],ymm8[4,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,2],ymm15[2,0],ymm1[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups 
%ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm1[0,0],ymm5[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,1],ymm10[0,2],ymm1[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,0],ymm1[0,0],ymm8[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1],ymm9[0,2],ymm1[7,5],ymm9[4,6] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm1[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,1],ymm0[1,3],ymm1[4,5],ymm0[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,2],ymm15[2,0],ymm13[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[1,3],ymm1[4,5],ymm0[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,2],ymm10[2,0],ymm12[4,6],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,1],ymm10[0,2],ymm0[7,5],ymm10[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm9[0,2],ymm0[7,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm6[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,1],ymm15[1,3],ymm0[4,5],ymm15[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2],ymm10[2,0],ymm11[4,6],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm9[0,2],ymm0[7,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm5[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1],mem[1,3],ymm0[4,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,2],ymm15[2,0],ymm11[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vshufps $212, (%rsp), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm0[0,1],mem[1,3],ymm0[4,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,2],ymm10[2,0],ymm7[4,6],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[0,0],ymm6[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm0[0,0],ymm2[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm13[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,2],ymm9[2,0],ymm7[4,6],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2],ymm9[2,0],ymm4[4,6],ymm9[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm15[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm12 -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,1],ymm8[1,3],ymm12[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm9[1,3],ymm13[4,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[0,2],ymm7[2,0],ymm11[4,6],ymm7[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7] @@ -7998,15 +7993,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm12[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm10[1,3],ymm4[4,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm6[0,1],ymm4[1,3],ymm6[4,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,2],ymm7[2,0],ymm3[4,6],ymm7[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7] @@ -8020,33 +8015,33 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm7 = xmm5[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[0,1],ymm1[1,3],ymm14[4,5],ymm1[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,2],ymm9[2,0],ymm2[4,6],ymm9[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2],ymm10[2,0],ymm2[4,6],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 
{{.*#+}} ymm7 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm14[1,0],ymm1[2,0],ymm14[5,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,0],ymm0[6,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = mem[0],xmm8[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8054,65 +8049,65 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm10[2,0],ymm4[5,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm12[1,0],mem[2,0],ymm12[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm0[0,0],ymm11[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm8[2,0],ymm12[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm9[2,0],ymm13[5,4],ymm9[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm15[2,0],ymm3[5,4],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8120,93 +8115,111 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] ; 
AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps $33, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm13[2,1],mem[3,3],ymm13[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[0,0],ymm15[1,0],ymm9[4,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,1],xmm11[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = 
ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0],xmm9[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm14[2,1],mem[3,3],ymm14[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0],xmm13[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[0,1],xmm7[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[0,1],xmm10[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0],xmm11[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm4[2,0],ymm10[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,0],mem[1,0],ymm10[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1],xmm8[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $246, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm11[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm10[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,0],ymm4[2,0],ymm8[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[0,1],xmm7[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm12[2,1],mem[3,3],ymm12[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm8[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm4[2,0],ymm7[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm12[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] @@ -8215,32 +8228,32 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte 
Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0],ymm7[1,0],ymm6[4,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm6[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -8251,34 +8264,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,1],xmm3[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = 
ymm9[2,1],mem[3,3],ymm9[6,5],mem[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm5[0],mem[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4] @@ -8287,7 +8281,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps $246, (%rsp), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload @@ -8303,11 +8297,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,1],xmm0[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8315,59 +8309,42 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm15[2,0],ymm2[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0],ymm1[0,0],ymm3[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -8379,6 +8356,24 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload @@ -8389,21 +8384,59 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,0],ymm4[0,0],ymm7[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,0],ymm7[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,0],ymm4[0,0],ymm6[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,0],ymm8[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; 
AVX1-ONLY-NEXT: # ymm9 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,0],ymm4[0,0],ymm7[7,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm6[3,0],ymm8[0,0],ymm6[7,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,0],ymm9[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload @@ -8414,166 +8447,128 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm4[0,0],ymm8[7,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,0],ymm10[4,5],ymm4[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = 
ymm5[3,0],ymm10[0,0],ymm5[7,4],ymm10[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm11[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm11, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: 
vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) -; AVX1-ONLY-NEXT: addq $3176, %rsp # imm = 0xC68 +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $3192, %rsp # imm = 0xC78 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2680, %rsp # imm = 0xA78 -; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 @@ -8598,13 +8593,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm3 @@ -8616,13 +8609,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -8634,16 +8628,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vpbroadcastq 1648(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1696(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 1728(%rdi), %xmm3 @@ -8674,15 +8666,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm8[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -8697,10 +8688,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 992(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8716,16 +8706,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1344(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 1344(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %xmm2 @@ -8737,18 +8727,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] @@ -8763,17 +8753,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8789,13 +8780,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1184(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 1184(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8811,14 +8802,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[0,3,2,3] @@ -8836,11 +8828,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8858,12 +8850,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8881,12 +8873,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[0,3,2,3] @@ -8895,13 +8886,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 @@ -8921,38 +8910,38 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] ; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-SLOW-NEXT: 
vmovdqa 704(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1200(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm2 @@ -8967,7 +8956,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -8976,8 +8965,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 @@ -8988,14 +8977,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm6 -; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm3 @@ -9011,62 +9001,63 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] ; AVX2-SLOW-NEXT: vpbroadcastd 1100(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1352(%rdi), 
%xmm15 ; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1548(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX2-SLOW-NEXT: vpbroadcastd 1548(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm4[1,3],ymm12[4,6],ymm4[5,7] +; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm12 ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; 
AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,2],ymm10[1,3],ymm15[4,6],ymm10[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload @@ -9076,11 +9067,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,2],ymm14[1,3],ymm11[4,6],ymm14[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -9097,31 +9088,31 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm1[1,3],ymm5[4,6],ymm1[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 
32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm7[1,3],ymm13[4,6],ymm7[5,7] ; AVX2-SLOW-NEXT: vmovaps %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm11 ; AVX2-SLOW-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -9129,9 +9120,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm8[1,3],ymm7[4,6],ymm8[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9144,9 +9135,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm5[1,3],ymm4[4,6],ymm5[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9154,43 +9145,43 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm7 = mem[0,0] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm3 = [4,3,4,3] +; AVX2-SLOW-NEXT: # xmm3 = mem[0,0] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte 
Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm3, %ymm4 ; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] @@ -9199,218 +9190,220 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm7, %ymm8 -; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] -; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm7, %ymm8 -; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 
# 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm7 +; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm10[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-SLOW-NEXT: 
vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm15[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; 
AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = 
ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0,1,2],xmm11[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; 
AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -9419,206 +9412,207 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vbroadcastss 
1256(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1256(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; 
AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] ; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 
32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r9) -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r9) +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) +; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 224(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 192(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm13, 224(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; 
AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm12, (%rax) -; AVX2-SLOW-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX2-SLOW-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rax) +; AVX2-SLOW-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2680, %rsp # imm = 0xA78 +; AVX2-FAST-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 1120(%rdi), %ymm5 @@ -9626,17 +9620,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -9648,8 +9643,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 752(%rdi), %ymm2 @@ -9667,12 +9663,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm15 +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -9686,7 +9682,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 1568(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9706,9 +9702,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 @@ -9752,7 +9748,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 992(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 976(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -9796,14 +9792,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = 
[1,0,7,7,5,4,7,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9818,12 +9814,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9838,13 +9834,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 1184(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9857,14 +9852,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9877,18 +9872,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm15[2,3],ymm10[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9897,20 +9890,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9925,14 +9918,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 960(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9947,52 +9940,52 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = 
ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] ; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %xmm2 @@ -10007,7 +10000,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -10016,34 +10010,34 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm15 @@ -10052,62 +10046,61 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] +; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm5[0],ymm12[2],ymm5[2] +; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr 
{{.*#+}} ymm13 = ymm5[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2],ymm4[1,3],ymm12[4,6],ymm4[5,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2],ymm4[1,3],ymm11[4,6],ymm4[5,7] ; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: 
# ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7] -; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm13 -; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,2],ymm3[1,3],ymm9[4,6],ymm3[5,7] +; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload @@ -10118,10 +10111,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm12[1,3],ymm14[4,6],ymm12[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10131,37 +10124,38 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm13[0,2],mem[1,3],ymm13[4,6],mem[5,7] ; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm4 ; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm10[1,3],ymm1[4,6],ymm10[5,7] ; AVX2-FAST-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] -; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm10 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm5[1,3],ymm12[4,6],ymm5[5,7] +; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm9 ; AVX2-FAST-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] @@ -10170,7 +10164,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = 
ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] ; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm2 @@ -10193,8 +10187,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3] ; AVX2-FAST-NEXT: # xmm4 = mem[0,0] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10204,6 +10198,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vbroadcastss 212(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -10237,7 +10232,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcastss 660(%rdi), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] @@ -10246,8 +10241,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 736(%rdi), %xmm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-FAST-NEXT: vbroadcastss 884(%rdi), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] @@ -10261,121 +10257,118 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vbroadcastss 1108(%rdi), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps 
$15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm12, %ymm15 -; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %xmm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm14 +; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm8 +; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm1[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} xmm11 = xmm7[0,1,2],xmm11[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3,4],ymm8[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps 
{{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] @@ -10384,287 +10377,287 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vbroadcastss 1336(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1,2],xmm12[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vbroadcastss 1336(%rdi), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] 
+; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm1 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm11[3] ; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm11 = 
ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm12[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm1 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm13[3] ; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm1 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; 
AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vpermilps $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 1704(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1704(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3] 
-; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte 
Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%r9) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm12, 224(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm8, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm10, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm8, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 128(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm13, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, (%rax) -; AVX2-FAST-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm12, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FAST-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2680, %rsp # imm = 0xA78 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm2 @@ -10689,13 +10682,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm3 @@ -10707,13 +10698,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -10725,16 +10717,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1664(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1664(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1648(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1696(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1728(%rdi), %xmm3 @@ -10765,15 +10755,14 @@ define void 
@load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm8[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -10788,10 +10777,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 992(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10807,16 +10795,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1344(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1344(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd 
%ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %xmm2 @@ -10828,18 +10816,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] @@ -10854,17 +10842,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10880,13 +10869,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1184(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1184(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10902,14 +10891,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10927,11 +10917,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10949,12 +10939,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded 
Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10972,12 +10962,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10986,13 +10975,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm3 @@ -11012,38 +10999,38 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1200(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %xmm2 @@ -11058,7 +11045,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -11067,8 +11054,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 @@ -11079,14 +11066,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm3 @@ -11102,62 +11090,63 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1100(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1548(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1548(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm4[1,3],ymm12[4,6],ymm4[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,2],ymm10[1,3],ymm15[4,6],ymm10[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload @@ -11167,11 +11156,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,2],ymm14[1,3],ymm11[4,6],ymm14[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -11188,31 +11177,31 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm1[1,3],ymm5[4,6],ymm1[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm7[1,3],ymm13[4,6],ymm7[5,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -11220,9 +11209,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm8[1,3],ymm7[4,6],ymm8[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -11235,9 +11224,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm5[1,3],ymm4[4,6],ymm5[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -11245,43 +11234,43 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm3 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm3, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] @@ -11290,218 +11279,220 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm5 
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm7 +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = 
xmm8[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm4 +; 
AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, 
%ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0,1,2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm14 ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -11510,506 +11501,506 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm0, %ymm9 -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = 
ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r9) +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded 
Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps 
%ymm4, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 
64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: load_i32_stride7_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 
1280(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm11, %zmm12 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm11, %zmm12 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm13, %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm13, %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm23, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm23, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm23, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm23, %zmm12 +; AVX512F-NEXT: 
vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm23, %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm23, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm23, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm2, %zmm13, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm23 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm16 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm4, %zmm6, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm12, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = 
[29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm31, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm9, %zmm29, %zmm26 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm31, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm29, %zmm9, %zmm31 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm10, 
%zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm29, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm29, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm22 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <18,25,0,7,14,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm6, %zmm24 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm7, %zmm25 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512F-NEXT: vpermt2d 
%zmm0, %zmm19, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13,20,27] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm15 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm7, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm5 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm6, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm7, %zmm12 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm18, %zmm20 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm19, %zmm27 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm30, %zmm4 ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12019,14 +12010,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-NEXT: movw $480, %ax # imm = 0x1E0 @@ -12038,9 +12029,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12058,30 +12049,30 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: 
vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti32x4 $0, %xmm20, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload @@ -12090,56 +12081,56 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512F-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm18, %zmm26, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm26 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, (%rsp), %zmm30, %zmm31 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm30, %zmm19 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm23, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm15 {%k1} ; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte 
Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm5 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm20, (%r9) @@ -12147,14 +12138,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -12162,306 +12153,305 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride7_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; 
AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = 
[0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 +; 
AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm23, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm23, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm23, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm23, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm23, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm23, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm23, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm23 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm16 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 +; 
AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; 
AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm29, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm9, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, 
%zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; 
AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <18,25,0,7,14,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm6, %zmm24 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm7, %zmm25 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm15 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 -; 
AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm5 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm12 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm20 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm27 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm30, %zmm4 ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12471,14 +12461,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-NEXT: movw $480, %ax # imm = 0x1E0 @@ -12490,9 +12480,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, 
%zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12510,30 +12500,30 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload @@ -12542,56 +12532,56 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte 
Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm26, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm26 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, (%rsp), %zmm30, %zmm31 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm30, %zmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm15, %zmm23, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm15 {%k1} ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 
64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9) @@ -12599,14 +12589,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index a47102b6f7c0ae..b515a27aabfde8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -238,8 +238,8 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm2 +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 80(%rdi), %xmm3 ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm0 ; SSE-NEXT: movaps 32(%rdi), %xmm4 @@ -258,25 +258,25 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; SSE-NEXT: movaps %xmm7, %xmm8 ; SSE-NEXT: movlhps 
{{.*#+}} xmm8 = xmm8[0],xmm6[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movaps %xmm10, (%rsi) ; SSE-NEXT: movaps %xmm9, (%rdx) ; SSE-NEXT: movaps %xmm4, (%rcx) ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps %xmm8, (%r9) ; SSE-NEXT: movaps %xmm7, (%r11) -; SSE-NEXT: movaps %xmm3, (%r10) +; SSE-NEXT: movaps %xmm2, (%r10) ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -454,99 +454,99 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i32_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm15 +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps 32(%rdi), %xmm1 -; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 64(%rdi), %xmm11 -; SSE-NEXT: movaps 160(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps 192(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps 240(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps 208(%rdi), %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm10, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps 
%xmm9, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps (%rdi), %xmm11 +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps 96(%rdi), %xmm0 +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps 160(%rdi), %xmm1 +; SSE-NEXT: movaps 128(%rdi), %xmm5 +; SSE-NEXT: movaps 224(%rdi), %xmm13 +; SSE-NEXT: movaps 192(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] +; SSE-NEXT: movaps 240(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; SSE-NEXT: movaps 208(%rdi), %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; SSE-NEXT: movaps %xmm13, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm13[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm5, 16(%rdx) +; SSE-NEXT: movaps %xmm3, 16(%rdx) ; SSE-NEXT: movaps %xmm8, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps %xmm10, (%r8) -; SSE-NEXT: movaps %xmm6, 16(%r8) -; SSE-NEXT: movaps %xmm4, (%r9) -; SSE-NEXT: movaps %xmm13, 16(%r9) +; SSE-NEXT: movaps %xmm15, 16(%rcx) +; SSE-NEXT: movaps %xmm11, (%r8) +; SSE-NEXT: movaps %xmm5, 16(%r8) +; SSE-NEXT: movaps %xmm6, (%r9) +; SSE-NEXT: movaps %xmm14, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm14, (%rax) -; SSE-NEXT: movaps %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm12, (%rax) +; SSE-NEXT: movaps %xmm10, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, (%rax) -; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm9, 16(%rax) -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf8: @@ -556,10 +556,10 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] @@ -570,11 +570,11 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
ptr ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 @@ -585,20 +585,20 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm11[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 @@ -606,7 +606,7 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,0],ymm12[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] @@ -616,7 +616,7 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[1,0],ymm7[1,0],ymm6[5,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[2,0],ymm8[2,3],ymm13[6,4],ymm8[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] @@ -626,7 +626,7 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,0],ymm15[4,5],ymm13[6,4] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] @@ -636,7 +636,7 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[3,0],ymm6[7,4],ymm7[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm10[2,3],ymm6[6,4],ymm10[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] @@ -668,8 +668,8 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm5 @@ -682,73 +682,73 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vbroadcastss %xmm9, %xmm5 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm6 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; 
AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm10[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm11[0,1,2],xmm14[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[4],ymm6[4],ymm12[5],ymm6[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] @@ -764,7 +764,7 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovaps %ymm8, (%r8) ; 
AVX2-ONLY-NEXT: vmovaps %ymm9, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -777,66 +777,66 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,8,16,24,0,8,16,24] +; AVX512-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] ; AVX512-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] ; AVX512-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] ; AVX512-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] ; AVX512-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] ; AVX512-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = 
[6,14,22,30,6,14,22,30] ; AVX512-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm10 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] ; AVX512-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm11 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [7,15,23,31] +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-NEXT: vmovdqa %ymm5, (%rdx) ; AVX512-NEXT: vmovdqa %ymm6, (%rcx) ; AVX512-NEXT: vmovdqa %ymm7, (%r8) ; AVX512-NEXT: vmovdqa %ymm8, (%r9) ; AVX512-NEXT: vmovdqa %ymm9, (%r11) ; AVX512-NEXT: vmovdqa %ymm10, (%r10) -; AVX512-NEXT: vmovdqa %ymm0, (%rax) +; AVX512-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <64 x i32>, ptr %in.vec, align 64 @@ -863,171 +863,172 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $296, %rsp # imm = 0x128 -; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 288(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm5 +; SSE-NEXT: movaps 320(%rdi), %xmm4 ; SSE-NEXT: movaps 416(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm13 -; SSE-NEXT: movaps 448(%rdi), %xmm4 -; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps 128(%rdi), %xmm10 +; SSE-NEXT: movaps 384(%rdi), %xmm13 +; SSE-NEXT: movaps 480(%rdi), %xmm9 +; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm14 ; SSE-NEXT: movaps 224(%rdi), %xmm8 -; SSE-NEXT: movaps 192(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; SSE-NEXT: movaps 192(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] ; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: movaps %xmm13, %xmm12 ; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; 
SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm7[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movaps 256(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movaps 256(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 64(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm12 +; SSE-NEXT: movaps 32(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE-NEXT: unpckhps 
{{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] ; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 144(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 464(%rdi), 
%xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 432(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 400(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps 400(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 368(%rdi), %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 336(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps 304(%rdi), %xmm12 -; SSE-NEXT: movaps 272(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 304(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps 112(%rdi), %xmm13 ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm3, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdi), %xmm6 +; SSE-NEXT: movaps 48(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: unpckhps (%rsp), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = 
xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movaps %xmm7, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm7[1] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1064,273 +1065,281 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) +; SSE-NEXT: movaps %xmm11, 48(%rax) ; SSE-NEXT: movaps %xmm3, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm14, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps %xmm11, 16(%rax) -; SSE-NEXT: movaps %xmm2, (%rax) +; SSE-NEXT: movaps %xmm2, 48(%rax) +; SSE-NEXT: movaps %xmm8, 32(%rax) +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm7, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm6, 48(%rax) -; SSE-NEXT: movaps %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm12, 16(%rax) -; SSE-NEXT: movaps 
%xmm8, (%rax) +; SSE-NEXT: movaps %xmm12, 48(%rax) +; SSE-NEXT: movaps %xmm5, 32(%rax) +; SSE-NEXT: movaps %xmm15, 16(%rax) +; SSE-NEXT: movaps %xmm6, (%rax) ; SSE-NEXT: addq $296, %rsp # imm = 0x128 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: subq $648, %rsp # imm = 0x288 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} xmm6 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte 
Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm3, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm2[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm5[0],ymm12[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = 
ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5] -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 
# 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,0],ymm6[1,0],ymm9[5,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[1,0],ymm12[1,0],ymm2[5,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm0[2,3],ymm5[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[4],mem[4],ymm9[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,0],ymm1[1,0],ymm0[5,4],ymm1[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,0],ymm5[1,0],ymm7[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,0],ymm15[2,3],ymm2[6,4],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm14[0],ymm5[1],ymm14[1],ymm5[4],ymm14[4],ymm5[5],ymm14[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,0],ymm13[1,0],ymm12[5,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[4],ymm8[4],ymm11[5],ymm8[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,0],ymm11[1,0],ymm8[5,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm4[2,3],ymm5[6,4],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1],ymm7[2,0],ymm2[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[6],ymm8[6],ymm11[7],ymm8[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,0],ymm5[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = 
ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm4[3,0],ymm2[7,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm3[3,0],ymm1[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm2[2,3],ymm6[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm14[2],ymm5[3],ymm14[3],ymm5[6],ymm14[6],ymm5[7],ymm14[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0],ymm13[3,0],ymm12[7,4],ymm13[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm3 +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm0[2,3],ymm5[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm6 = ymm6[3,0],ymm4[3,0],ymm6[7,4],ymm4[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,0],ymm5[2,3],ymm6[6,4],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,0],ymm13[3,0],ymm7[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1357,13 +1366,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 +; AVX1-ONLY-NEXT: addq $648, %rsp # imm = 0x288 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1371,26 +1379,27 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm15 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm9 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm10 @@ -1400,15 +1409,15 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm6 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vbroadcastss %xmm14, %xmm3 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm11 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] @@ -1416,107 +1425,105 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm3 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: 
vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps %xmm8, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps %xmm15, %xmm10 +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps %xmm8, %xmm7 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm15[2],xmm7[3],xmm15[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm14 ; AVX2-ONLY-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm6[2],xmm13[3],xmm6[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 @@ -1524,8 +1531,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 @@ -1538,52 +1545,52 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm3 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = 
ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload @@ -1591,56 +1598,56 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r9) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm3 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm3, 
32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1652,24 +1659,24 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 ; AVX512F-NEXT: movb $-64, %dil ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] ; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 @@ -1677,12 +1684,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25] ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 @@ -1690,12 +1697,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26] ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm11 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 @@ -1703,12 +1710,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm2, 
%zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27] ; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm12 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 @@ -1716,12 +1723,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28] ; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm13 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm13 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 @@ -1729,12 +1736,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29] ; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm14 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 @@ -1742,26 +1749,26 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30] ; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm15 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512F-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31] ; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm5 ; 
AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%rcx) @@ -1780,24 +1787,24 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 @@ -1805,12 +1812,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 @@ -1818,12 +1825,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm11 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 @@ -1831,12 +1838,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # 
zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27] ; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm12 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 @@ -1844,12 +1851,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28] ; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm13 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 @@ -1857,12 +1864,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29] ; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm14 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 @@ -1870,26 +1877,26 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30] ; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm15 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 
%zmm6, %zmm3 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31] ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) @@ -1923,38 +1930,36 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i32_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $952, %rsp # imm = 0x3B8 -; SSE-NEXT: movaps 544(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm6 +; SSE-NEXT: subq $968, %rsp # imm = 0x3C8 +; SSE-NEXT: movaps 544(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm7 +; SSE-NEXT: movaps 608(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 576(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 672(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 640(%rdi), %xmm5 ; SSE-NEXT: movaps 736(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 704(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1963,27 +1968,27 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movaps 512(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm1 +; SSE-NEXT: movaps 480(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 416(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 416(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1991,8 +1996,8 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 992(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 960(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 928(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2006,167 +2011,170 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 352(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 320(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 288(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movaps 256(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 832(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 800(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 864(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 832(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 800(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 768(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm15 +; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movaps (%rdi), %xmm9 +; SSE-NEXT: movaps 32(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; SSE-NEXT: movaps %xmm7, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm14[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: 
movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 176(%rdi), %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm6[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm3 +; SSE-NEXT: movaps 208(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 176(%rdi), %xmm4 ; SSE-NEXT: movaps 144(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm4 -; SSE-NEXT: movaps 336(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 304(%rdi), %xmm5 -; SSE-NEXT: movaps 272(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 304(%rdi), %xmm6 +; SSE-NEXT: movaps 272(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 464(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movaps 432(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 400(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 432(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 400(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2174,24 +2182,24 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 624(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 592(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps 592(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 560(%rdi), %xmm6 +; SSE-NEXT: movaps 560(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 528(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 752(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 720(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 752(%rdi), %xmm7 +; SSE-NEXT: movaps 720(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: 
movaps 688(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 656(%rdi), %xmm1 @@ -2204,19 +2212,19 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 848(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps 848(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 816(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 784(%rdi), %xmm2 +; SSE-NEXT: movaps 816(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 784(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 976(%rdi), %xmm11 @@ -2224,8 +2232,8 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 944(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 912(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 912(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -2237,85 +2245,86 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 80(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 16(%rdi), %xmm15 -; SSE-NEXT: movaps 48(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps 16(%rdi), %xmm14 +; SSE-NEXT: movaps 48(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = 
xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: unpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm13[1] +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm11[1] +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm10[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 
96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -2407,66 +2416,65 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rax) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 112(%rax) -; SSE-NEXT: movaps %xmm8, 96(%rax) -; SSE-NEXT: movaps %xmm4, 80(%rax) -; SSE-NEXT: movaps %xmm13, 64(%rax) -; SSE-NEXT: movaps %xmm12, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rax) -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 112(%rax) -; SSE-NEXT: movaps %xmm6, 96(%rax) -; SSE-NEXT: movaps %xmm5, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps %xmm7, 48(%rax) +; SSE-NEXT: movaps %xmm4, 112(%rax) +; SSE-NEXT: movaps %xmm0, 96(%rax) +; SSE-NEXT: movaps %xmm3, 80(%rax) +; SSE-NEXT: movaps %xmm5, 64(%rax) +; SSE-NEXT: movaps %xmm6, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm15, (%rax) -; SSE-NEXT: addq $952, %rsp # imm = 0x3B8 +; SSE-NEXT: movaps %xmm2, (%rax) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps %xmm9, 96(%rax) +; SSE-NEXT: movaps %xmm7, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm14, (%rax) +; SSE-NEXT: addq $968, %rsp # imm = 0x3C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1800, %rsp # imm = 0x708 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: 
vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2481,14 +2489,15 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 @@ -2497,227 +2506,226 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), 
%xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 +; 
AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpermilps $170, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps 
{{.*#+}} xmm3 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm3[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -2727,27 +2735,27 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm9 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] @@ -2760,21 +2768,22 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups 
%ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm13[0],ymm4[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] @@ -2782,50 +2791,49 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), 
%ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 @@ -2835,137 +2843,138 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[4],ymm12[4],ymm9[5],ymm12[5] -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm5[1,0],ymm13[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm5[1,0],ymm11[5,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,0],ymm5[1,0],ymm3[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm3[1,0],ymm8[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[4],ymm11[4],ymm7[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm9[1,0],ymm1[5,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm15 = ymm6[1,0],ymm4[1,0],ymm6[5,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,0],ymm14[1,0],ymm2[5,4],ymm14[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm4[0],ymm10[1],ymm4[1],ymm10[4],ymm4[4],ymm10[5],ymm4[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0],ymm4[1,0],ymm15[5,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[4],mem[4],ymm14[5],mem[5] +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,0],ymm2[1,0],ymm14[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm5[1],ymm8[3],ymm5[3] ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm5[1],ymm13[3],ymm5[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2978,58 +2987,57 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,0],mem[3,0],ymm15[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm14[3,0],ymm1[7,4],ymm14[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm6[3,0],ymm8[7,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm4 = ymm6[3,0],ymm4[3,0],ymm6[7,4],ymm4[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[2,3],ymm4[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[3,0],ymm13[3,0],ymm10[7,4],ymm13[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = 
xmm3[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,0],ymm13[3,0],ymm11[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm4[2,3],ymm5[6,4],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3091,7 +3099,7 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: addq $1800, %rsp # imm = 0x708 ; AVX1-ONLY-NEXT: vzeroupper @@ -3102,24 +3110,24 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm10 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm14 +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm13 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm15 +; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm14 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3132,25 +3140,26 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm10 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3170,7 +3179,7 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -3199,215 +3208,213 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: 
vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] ; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm12 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps %xmm10, %xmm3 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm10[0],xmm6[1],xmm10[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; 
AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte 
Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm13[2],mem[2],xmm13[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm6[0,1,2],xmm15[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] +; 
AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm3[0,1,2],xmm9[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps %xmm4, %xmm8 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm13[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm4, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; 
AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 @@ -3420,19 +3427,22 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] -; 
AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 @@ -3445,21 +3455,19 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[4],ymm14[4],ymm1[5],ymm14[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 @@ -3472,88 +3480,89 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: 
vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm12[0],ymm2[1],ymm12[1],ymm2[4],ymm12[4],ymm2[5],ymm12[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm6[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), 
%ymm11 ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm4 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3561,101 +3570,104 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm15[2],ymm6[2],ymm15[3],ymm6[3],ymm15[6],ymm6[6],ymm15[7],ymm6[7] +; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm15 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm7 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[6],ymm11[6],ymm2[7],ymm11[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm6 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm3 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm3 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = 
ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm2 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vpermilps $238, (%rsp), %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3701,2271 +3713,2199 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 
96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm9, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 
%zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm10, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm9, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm11, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm12, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm9, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm14, %zmm2 ; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,2,10,18,26] +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, 
%zmm8, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] ; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm12, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm28, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), 
%zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm9, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, 
%zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, 
%zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm9, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm11, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm12, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm9, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm14, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,2,10,18,26] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm8, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] ; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm12, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 
{%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; 
AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 ; AVX512DQ-SLOW-NEXT: movb $-64, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm9, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm18, %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm26, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2d 
%zmm0, %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; 
AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm10, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm9, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm9, %zmm3 +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm11, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm12, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm9, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm14, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,2,10,18,26] +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 
32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm8, %zmm11 ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] ; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm12, %zmm15 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm6, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-SLOW-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i32_stride8_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm20 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 ; AVX512DQ-FAST-NEXT: movb $-64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm9, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm26, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # zmm15 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQ-FAST-NEXT: 
vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm10, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm9, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm11, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm12, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm9, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm14, %zmm2 ; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,2,10,18,26] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; 
AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm8, %zmm11 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] ; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm12, %zmm15 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; 
AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 
64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-FAST-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FAST-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, 
%zmm8, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm9, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm8, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, 
%zmm10, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm25, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm10, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm9, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm11, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2d %zmm22, %zmm12, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm9, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm14, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, 
%zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm8, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] ; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm12, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; 
AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 
%zmm1, (%rsi) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm9, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm8, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm26, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm10, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm9, 
%zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm11, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm12, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm9, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm14, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm18, 
%zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, 
%zmm27, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm8, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] ; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm12, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm12 
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, 
%zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, (%rsi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf32: ; 
AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 ; AVX512DQBW-SLOW-NEXT: movb $-64, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm8, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm18, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm26, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; 
AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: 
# zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm11, %zmm6 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm10, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm9, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm11, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm12, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm9, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm18, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,2,10,18,26] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm26, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm8, %zmm11 ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] ; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm12, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm26, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 
(%rax) +; AVX512DQBW-SLOW-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 ; AVX512DQBW-FAST-NEXT: movb $-64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm9, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm8, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm8, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm26, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; 
AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; 
AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm10, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm9, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm11, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm12, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm9, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQBW-FAST-NEXT: 
vpermt2d %zmm15, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm18, %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,10,18,26,2,10,18,26] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm31 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: 
vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm26, %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm8, %zmm11 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] ; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = 
mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm12, %zmm15 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm26, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, 
%ymm2, %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-FAST-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-FAST-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 @@ -5991,17 +5931,15 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i32_stride8_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $2232, %rsp # imm = 0x8B8 -; SSE-NEXT: movaps 288(%rdi), %xmm4 +; SSE-NEXT: subq $2248, %rsp # imm = 0x8C8 +; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 352(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm8 +; SSE-NEXT: movaps 320(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 416(%rdi), %xmm7 +; SSE-NEXT: movaps 384(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 480(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm3 @@ -6010,11 +5948,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 @@ -6022,7 +5960,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6031,21 +5969,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movaps 256(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps 
{{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm9 +; SSE-NEXT: movaps 736(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 704(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 672(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 640(%rdi), %xmm1 @@ -6060,13 +5999,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 576(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 544(%rdi), %xmm15 -; SSE-NEXT: movaps 512(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movaps 544(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 512(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6087,16 +6026,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 832(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 800(%rdi), %xmm14 -; SSE-NEXT: movaps 768(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: movaps 864(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 832(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 800(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 768(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6119,9 +6058,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 
1120(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1088(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: movaps 1088(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 1056(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6134,7 +6072,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1504(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 1472(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -6148,11 +6086,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1376(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1344(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1376(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1344(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 1312(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1280(%rdi), %xmm1 @@ -6178,17 +6116,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1632(%rdi), %xmm1 +; SSE-NEXT: movaps 1632(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1600(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 1568(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1536(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1600(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1568(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1536(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6211,308 +6148,304 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1888(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1856(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1856(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1824(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1792(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps 1824(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1792(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps (%rsp), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte 
Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 
# 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps (%rsp), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; SSE-NEXT: movaps %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; SSE-NEXT: movaps %xmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 176(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 176(%rdi), %xmm4 ; SSE-NEXT: movaps 144(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm3 -; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movaps 304(%rdi), %xmm4 -; SSE-NEXT: movaps 272(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps 368(%rdi), %xmm6 +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps 304(%rdi), %xmm7 +; SSE-NEXT: movaps 272(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm5 +; SSE-NEXT: movaps 496(%rdi), %xmm9 ; SSE-NEXT: movaps 464(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 432(%rdi), %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movaps 432(%rdi), %xmm15 ; SSE-NEXT: movaps 400(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm9 +; SSE-NEXT: movaps 624(%rdi), %xmm14 ; SSE-NEXT: movaps 592(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movaps 560(%rdi), %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movaps 560(%rdi), %xmm5 ; SSE-NEXT: movaps 528(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 752(%rdi), %xmm12 +; SSE-NEXT: movaps 752(%rdi), %xmm10 ; SSE-NEXT: movaps 720(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movaps 688(%rdi), %xmm13 -; SSE-NEXT: movaps 656(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movaps 688(%rdi), %xmm11 +; SSE-NEXT: movaps 656(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 880(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 880(%rdi), %xmm8 ; SSE-NEXT: movaps 848(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movaps 816(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 784(%rdi), %xmm1 @@ -6523,10 +6456,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1008(%rdi), %xmm14 +; SSE-NEXT: movaps 1008(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 976(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 944(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 912(%rdi), %xmm1 @@ -6552,10 +6486,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1264(%rdi), %xmm11 +; SSE-NEXT: movaps 1264(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1232(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1200(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1168(%rdi), %xmm1 @@ -6611,10 +6546,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1776(%rdi), %xmm8 +; SSE-NEXT: movaps 1776(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1744(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1712(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1680(%rdi), %xmm1 @@ -6643,7 +6579,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 2032(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2000(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1968(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6670,67 +6606,44 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: unpckhps 
{{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] @@ -6738,131 +6651,146 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 
16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movaps %xmm13, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps 
%xmm15, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] -; SSE-NEXT: movaps %xmm2, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm13[0] -; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movaps %xmm0, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7058,45 +6986,45 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 240(%rax) -; SSE-NEXT: movaps %xmm4, 224(%rax) -; SSE-NEXT: movaps %xmm5, 208(%rax) +; SSE-NEXT: movaps %xmm0, 224(%rax) +; SSE-NEXT: movaps %xmm4, 208(%rax) ; SSE-NEXT: movaps %xmm2, 192(%rax) -; SSE-NEXT: movaps %xmm6, 176(%rax) -; SSE-NEXT: movaps %xmm7, 160(%rax) -; SSE-NEXT: movaps %xmm13, 144(%rax) +; SSE-NEXT: movaps %xmm5, 176(%rax) +; SSE-NEXT: movaps %xmm12, 160(%rax) +; SSE-NEXT: movaps %xmm8, 144(%rax) ; SSE-NEXT: movaps %xmm14, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rax) -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm6, (%rax) ; SSE-NEXT: 
movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, 240(%rax) -; SSE-NEXT: movaps %xmm8, 224(%rax) +; SSE-NEXT: movaps %xmm10, 240(%rax) +; SSE-NEXT: movaps %xmm7, 224(%rax) +; SSE-NEXT: movaps %xmm11, 208(%rax) +; SSE-NEXT: movaps %xmm13, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: movaps %xmm10, 192(%rax) -; SSE-NEXT: movaps %xmm12, 176(%rax) +; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm15, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -7106,25 +7034,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm11, (%rax) -; SSE-NEXT: addq $2232, %rsp # imm = 0x8B8 +; SSE-NEXT: movaps %xmm9, (%rax) +; SSE-NEXT: addq $2248, %rsp # imm = 0x8C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 @@ -7142,11 +7067,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; 
AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7159,9 +7084,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 @@ -7171,11 +7096,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7186,18 +7111,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 @@ -7215,17 +7139,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 @@ -7243,24 +7167,24 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 
672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7268,151 +7192,164 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps 
{{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm6[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; 
AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm6[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm14[1],xmm11[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -7426,33 +7363,20 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7472,8 +7396,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7482,20 +7405,18 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7504,18 +7425,19 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm11[2],mem[2],xmm11[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7524,19 +7446,19 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm9[2],mem[2],xmm9[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7547,147 +7469,146 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm7, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm7[2],mem[2],xmm7[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] 
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm6[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm9[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm7[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm8, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm7[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm6[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7700,24 +7621,22 @@ define void @load_i32_stride8_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm14[2],xmm9[3],xmm14[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm13[2],mem[2],xmm13[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] @@ -7728,65 +7647,66 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[4],ymm1[4],ymm8[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[4],ymm12[4],ymm8[5],ymm12[5] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm15 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7806,20 +7726,20 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7922,446 +7842,442 @@ 
define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm15[1,0],ymm1[5,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm15 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[4],ymm15[4],ymm9[5],ymm15[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,0],ymm2[1,0],ymm6[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm4[1,0],ymm1[5,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[1,0],ymm8[1,0],ymm7[5,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[1],ymm6[1],ymm13[4],ymm6[4],ymm13[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,0],ymm8[1,0],ymm12[5,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[1,0],ymm11[1,0],ymm7[5,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[4],ymm9[4],ymm11[5],ymm9[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,0],ymm8[1,0],ymm10[5,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[4],ymm13[4],ymm15[5],ymm13[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,0],ymm13[1,0],ymm15[5,4],ymm13[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = 
ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm7[1,0],ymm10[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[1],ymm12[1],ymm0[4],ymm12[4],ymm0[5],ymm12[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm12[1,0],ymm2[5,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 
= ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = 
ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm9[1],ymm15[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = 
ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm4[1],ymm10[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[6],ymm11[6],ymm15[7],ymm11[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[3,0],ymm13[3,0],ymm11[7,4],ymm13[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[6],ymm10[6],ymm4[7],ymm10[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm9[3,0],mem[3,0],ymm9[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[3,0],ymm15[3,0],ymm11[7,4],ymm15[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[3,0],ymm9[3,0],ymm7[7,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm7[3,0],ymm5[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,0],ymm15[3,0],ymm1[7,4],ymm15[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm13[3,0],ymm12[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 
# 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) @@ -8478,12 +8394,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm14, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) @@ -8494,16 +8411,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-LABEL: load_i32_stride8_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $3528, %rsp # imm = 0xDC8 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm9 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -8513,9 +8429,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), 
%xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm13 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 @@ -8529,11 +8445,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 @@ -8562,16 +8479,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vbroadcastss %xmm14, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8592,9 +8508,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8620,11 +8536,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8633,11 +8549,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -8650,11 +8566,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8680,11 +8596,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8706,24 +8622,24 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 @@ -8733,17 +8649,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps %xmm12, %xmm3 +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm13, %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8757,13 +8673,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8773,17 +8688,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8793,17 +8707,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8815,15 +8728,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = 
xmm2[0],mem[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -8832,13 +8744,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8856,9 +8768,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8872,13 +8784,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, 
%xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8888,24 +8800,24 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpermilps $170, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8916,46 +8828,48 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps 
{{.*#+}} xmm0 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -8965,7 +8879,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8976,58 +8890,55 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] +; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -9042,17 +8953,18 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm5[0,1],xmm13[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm5, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1] @@ -9066,8 +8978,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm11[2],mem[2],xmm11[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -9080,9 +8992,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm5[1],xmm11[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] @@ -9096,7 +9010,8 @@ 
define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm5[1],xmm9[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] @@ -9121,10 +9036,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm5[1] ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -9134,7 +9048,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -9154,17 +9069,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; 
AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9178,11 +9093,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] -; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -9207,12 +9123,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 @@ -9234,12 +9150,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = 
ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 @@ -9258,15 +9174,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 @@ -9283,158 +9199,186 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; 
AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 
1504(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm9 -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = 
ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[4],ymm10[4],ymm1[5],ymm10[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm10 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[1],ymm11[1],ymm1[4],ymm11[4],ymm1[5],ymm11[5] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm11 -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[4],ymm10[4],ymm2[5],ymm10[5] +; AVX2-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm12 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm14 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm13 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm12 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups 
%ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vblendps $32, (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1,2,3,4],mem[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm12 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1,2,3,4],mem[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm11 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 1172(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 1428(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1172(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 1684(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4],mem[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -9443,40 
+9387,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1428(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1684(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1940(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -9491,67 +9406,70 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = 
ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; 
AVX2-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload @@ -9559,21 +9477,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1272(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -9582,17 +9500,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = 
ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm1 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1784(%rdi), %ymm0 @@ -9601,15 +9518,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9617,20 +9532,20 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = 
ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -9682,37 +9597,37 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 1244(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 1500(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 1756(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vunpckhpd 
{{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 2012(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -9814,7 +9729,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) @@ -9822,7 +9737,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) @@ -9832,9 +9747,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9847,557 +9762,539 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i32_stride8_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm7 -; AVX512F-NEXT: vmovaps 1152(%rdi), %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), 
%zmm8 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm9 +; AVX512F-NEXT: subq $3080, %rsp # imm = 0xC08 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm8 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm23 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm26 ; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 -; AVX512F-NEXT: 
vpermt2d %zmm13, %zmm0, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm19, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm6 ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm25 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm11 +; AVX512F-NEXT: vpermi2d %zmm24, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm13 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm9 -; AVX512F-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm9 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; 
AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512F-NEXT: vpermi2d %zmm24, %zmm11, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm29, 
%zmm10 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm12, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm22, %zmm13, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; 
AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm10 
+; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm12, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm22, %zmm15, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm10 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vpermt2d 
%zmm15, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512F-NEXT: vpermt2d %zmm24, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 {%k1} +; AVX512F-NEXT: vpermi2d %zmm18, %zmm8, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm12, %zmm15, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; 
AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm8, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm2, %zmm14 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm7, %zmm27 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm9 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 -; AVX512F-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm4, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm7, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm7, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm7, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 -; AVX512F-NEXT: 
vmovdqa64 %zmm1, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm7, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm7, %zmm9 +; 
AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm6, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm22, %zmm1, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm25 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 -; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = 
mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm6 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm3, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm4, %zmm3, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm22, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm16 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm15 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm12 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm6 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm20 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm3 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm9 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm9 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm19, %zmm9 ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm2, 192(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -10439,578 +10336,560 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm2, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512F-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm7 -; AVX512BW-NEXT: vmovaps 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm9 +; AVX512BW-NEXT: subq $3080, %rsp # imm = 0xC08 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 +; 
AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm23 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-NEXT: vmovdqu64 
%zmm19, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm6 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 +; AVX512BW-NEXT: 
vmovdqa64 1856(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm24, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm9 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, 
%zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm24, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512BW-NEXT: vmovdqu64 (%rsp), 
%zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermi2d 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm22, %zmm13, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, 
%zmm2 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} ; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm22, %zmm15, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm10 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm18, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm15, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm2 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm8, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm2, %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm7, %zmm27 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm9 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d 
%zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm7, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm7, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm6, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm22, %zmm1, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm25 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] ; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm6 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 -; AVX512BW-NEXT: 
vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm22, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm3 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm11, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm19, %zmm9 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -11052,22 +10931,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512BW-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <512 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll index eb2a1cc227f43a..69dcfdff42e9a1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -220,35 +220,35 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm8 ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 208(%rdi), %xmm9 ; SSE-NEXT: movaps 192(%rdi), %xmm2 ; SSE-NEXT: movaps 240(%rdi), %xmm10 ; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm11 ; SSE-NEXT: movaps 128(%rdi), %xmm3 ; SSE-NEXT: movaps 176(%rdi), %xmm12 ; 
SSE-NEXT: movaps 160(%rdi), %xmm6 ; SSE-NEXT: movaps 80(%rdi), %xmm13 ; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm15 +; SSE-NEXT: movaps 112(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] ; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] -; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] +; SSE-NEXT: movaps %xmm6, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] ; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; SSE-NEXT: movaps %xmm1, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] @@ -259,10 +259,10 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] ; SSE-NEXT: movaps %xmm12, 96(%rsi) -; SSE-NEXT: movaps %xmm11, 112(%rsi) -; SSE-NEXT: movaps %xmm15, 64(%rsi) -; SSE-NEXT: movaps %xmm14, 80(%rsi) -; SSE-NEXT: movaps %xmm9, 32(%rsi) +; SSE-NEXT: movaps %xmm9, 112(%rsi) +; SSE-NEXT: movaps %xmm14, 64(%rsi) +; SSE-NEXT: movaps %xmm11, 80(%rsi) +; SSE-NEXT: movaps %xmm15, 32(%rsi) ; SSE-NEXT: movaps %xmm13, 48(%rsi) ; SSE-NEXT: movaps %xmm10, (%rsi) ; SSE-NEXT: movaps %xmm1, 16(%rsi) @@ -378,77 +378,77 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i64_stride2_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm11 -; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 208(%rdi), %xmm0 +; SSE-NEXT: movaps 192(%rdi), %xmm9 ; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 240(%rdi), %xmm14 -; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 64(%rdi), %xmm8 +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps 224(%rdi), %xmm11 ; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm12 -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdi), %xmm9 -; SSE-NEXT: movaps 304(%rdi), %xmm0 -; SSE-NEXT: movaps 288(%rdi), %xmm13 -; SSE-NEXT: movaps 176(%rdi), %xmm4 -; SSE-NEXT: movaps 160(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: movaps 96(%rdi), %xmm10 +; SSE-NEXT: movaps 272(%rdi), %xmm4 +; SSE-NEXT: movaps 144(%rdi), %xmm5 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; 
SSE-NEXT: movaps 304(%rdi), %xmm6 +; SSE-NEXT: movaps 288(%rdi), %xmm14 +; SSE-NEXT: movaps 176(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm10, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm14[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: movaps %xmm14, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} 
xmm1 = xmm1[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 352(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 320(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 432(%rdi), %xmm0 ; SSE-NEXT: movaps 416(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 400(%rdi), %xmm0 ; SSE-NEXT: movaps 384(%rdi), %xmm9 @@ -457,31 +457,31 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 480(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 464(%rdi), %xmm1 -; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps 464(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps (%rdi), %xmm6 ; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, 224(%rsi) -; SSE-NEXT: movaps %xmm11, 160(%rsi) +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, 224(%rsi) +; SSE-NEXT: movaps %xmm15, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: 
movaps %xmm6, 240(%rsi) +; SSE-NEXT: movaps %xmm3, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -493,22 +493,22 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 128(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm14, 208(%rsi) +; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: movaps %xmm13, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movaps %xmm3, 224(%rdx) +; SSE-NEXT: movaps %xmm2, 224(%rdx) ; SSE-NEXT: movaps %xmm7, 240(%rdx) ; SSE-NEXT: movaps %xmm9, 192(%rdx) ; SSE-NEXT: movaps %xmm12, 208(%rdx) -; SSE-NEXT: movaps %xmm13, 160(%rdx) -; SSE-NEXT: movaps %xmm15, 176(%rdx) +; SSE-NEXT: movaps %xmm11, 160(%rdx) +; SSE-NEXT: movaps %xmm14, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rdx) @@ -522,7 +522,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm6, (%rdx) ; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq @@ -530,131 +530,131 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX1-ONLY-LABEL: load_i64_stride2_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm12[0],ymm7[2],ymm12[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm5[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm13[1],ymm6[3],ymm13[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm14[1],ymm4[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm11[0],ymm5[2],ymm11[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride2_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm5[1],ymm15[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm14[1],ymm6[1],ymm14[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 
= ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm1[1],ymm13[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: 
vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) @@ -829,7 +829,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps 608(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 656(%rdi), %xmm0 @@ -866,63 +866,62 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps 800(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 848(%rdi), %xmm0 ; SSE-NEXT: movaps 832(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 864(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: movaps %xmm15, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 912(%rdi), %xmm0 -; SSE-NEXT: movaps 896(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 896(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 944(%rdi), %xmm0 ; SSE-NEXT: movaps 928(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = 
xmm8[1],xmm0[1] ; SSE-NEXT: movaps 976(%rdi), %xmm0 -; SSE-NEXT: movaps 960(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps 960(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps 1008(%rdi), %xmm3 -; SSE-NEXT: movaps 992(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movaps (%rdi), %xmm14 -; SSE-NEXT: movaps 16(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] -; SSE-NEXT: movaps 32(%rdi), %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 1008(%rdi), %xmm4 +; SSE-NEXT: movaps 992(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm12 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps %xmm4, 496(%rsi) +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, 496(%rsi) ; SSE-NEXT: movaps %xmm1, 480(%rsi) ; SSE-NEXT: movaps %xmm2, 464(%rsi) -; SSE-NEXT: movaps %xmm9, 448(%rsi) -; SSE-NEXT: movaps %xmm7, 432(%rsi) -; SSE-NEXT: movaps %xmm12, 416(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%rsi) +; SSE-NEXT: movaps %xmm3, 448(%rsi) +; SSE-NEXT: movaps %xmm11, 432(%rsi) +; SSE-NEXT: movaps %xmm9, 416(%rsi) +; SSE-NEXT: movaps %xmm14, 400(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -933,7 +932,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 336(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%rsi) @@ -969,18 +968,18 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm10, (%rsi) -; SSE-NEXT: movaps %xmm5, 496(%rdx) -; SSE-NEXT: movaps %xmm6, 480(%rdx) +; SSE-NEXT: movaps %xmm4, 16(%rsi) +; SSE-NEXT: movaps %xmm13, (%rsi) +; SSE-NEXT: movaps %xmm6, 496(%rdx) +; SSE-NEXT: movaps %xmm7, 480(%rdx) ; SSE-NEXT: movaps %xmm8, 464(%rdx) -; SSE-NEXT: movaps %xmm13, 448(%rdx) +; SSE-NEXT: movaps %xmm10, 448(%rdx) ; SSE-NEXT: movaps %xmm15, 432(%rdx) ; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rdx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rdx) @@ -1026,68 +1025,69 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rdx) -; SSE-NEXT: movaps %xmm14, (%rdx) +; SSE-NEXT: movaps %xmm12, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm9, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm4, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 
32(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm8[1],ymm5[3],ymm8[3] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm13[0],ymm2[2],ymm13[2] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm8, %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] @@ -1187,56 +1187,55 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-ONLY-LABEL: load_i64_stride2_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-ONLY-NEXT: vmovups %ymm12, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; 
AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 @@ -1244,58 +1243,58 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, (%rsp), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermpd $216, (%rsp), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -1306,11 +1305,11 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] @@ -1345,9 +1344,10 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; 
AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX2-ONLY-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll index b83571698ea2e6..3a8ff8b626171a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -188,41 +188,41 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd 176(%rdi), %xmm1 ; SSE-NEXT: movapd 80(%rdi), %xmm0 ; SSE-NEXT: movapd 96(%rdi), %xmm3 -; SSE-NEXT: movapd 112(%rdi), %xmm8 +; SSE-NEXT: movapd 112(%rdi), %xmm4 ; SSE-NEXT: movapd 144(%rdi), %xmm5 -; SSE-NEXT: movapd 160(%rdi), %xmm9 -; SSE-NEXT: movapd (%rdi), %xmm6 -; SSE-NEXT: movapd 16(%rdi), %xmm10 -; SSE-NEXT: movapd 32(%rdi), %xmm4 -; SSE-NEXT: movapd 48(%rdi), %xmm7 +; SSE-NEXT: movapd 160(%rdi), %xmm6 +; SSE-NEXT: movapd (%rdi), %xmm7 +; SSE-NEXT: movapd 16(%rdi), %xmm8 +; SSE-NEXT: movapd 32(%rdi), %xmm9 +; SSE-NEXT: movapd 48(%rdi), %xmm10 ; SSE-NEXT: movapd 64(%rdi), %xmm11 ; SSE-NEXT: movapd %xmm11, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm7[0],xmm12[1] -; SSE-NEXT: movapd %xmm9, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] +; SSE-NEXT: movapd %xmm6, %xmm13 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm5[0],xmm13[1] -; SSE-NEXT: movapd %xmm8, %xmm14 +; SSE-NEXT: movapd %xmm4, %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] -; SSE-NEXT: movapd %xmm10, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm0[0] +; SSE-NEXT: movapd %xmm8, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm0[0] ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] ; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm2[0] -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm9[0] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm8[0],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm10[0],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] ; SSE-NEXT: movapd %xmm14, 32(%rsi) ; SSE-NEXT: movapd %xmm13, 48(%rsi) ; SSE-NEXT: movapd %xmm15, (%rsi) ; SSE-NEXT: movapd %xmm12, 16(%rsi) ; SSE-NEXT: movapd %xmm3, 32(%rdx) ; SSE-NEXT: movapd %xmm5, 48(%rdx) -; SSE-NEXT: movapd %xmm6, (%rdx) -; SSE-NEXT: movapd %xmm7, 16(%rdx) +; SSE-NEXT: movapd %xmm7, (%rdx) +; SSE-NEXT: movapd %xmm10, 16(%rdx) ; SSE-NEXT: movapd %xmm2, 32(%rcx) ; SSE-NEXT: movapd %xmm1, 48(%rcx) -; SSE-NEXT: movapd %xmm4, (%rcx) +; SSE-NEXT: movapd %xmm9, (%rcx) ; SSE-NEXT: movapd %xmm0, 16(%rcx) ; SSE-NEXT: retq ; @@ -332,57 +332,57 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movapd 128(%rdi), %xmm0 -; SSE-NEXT: movapd 176(%rdi), %xmm1 +; SSE-NEXT: movapd 128(%rdi), %xmm1 +; SSE-NEXT: movapd 176(%rdi), %xmm0 ; SSE-NEXT: movapd 224(%rdi), %xmm4 ; SSE-NEXT: movapd 272(%rdi), %xmm3 ; SSE-NEXT: movapd 80(%rdi), %xmm2 -; 
SSE-NEXT: movapd 96(%rdi), %xmm5 +; SSE-NEXT: movapd 96(%rdi), %xmm6 ; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 144(%rdi), %xmm6 -; SSE-NEXT: movapd 160(%rdi), %xmm14 +; SSE-NEXT: movapd 144(%rdi), %xmm5 +; SSE-NEXT: movapd 160(%rdi), %xmm13 ; SSE-NEXT: movapd 192(%rdi), %xmm7 -; SSE-NEXT: movapd 208(%rdi), %xmm11 +; SSE-NEXT: movapd 208(%rdi), %xmm14 ; SSE-NEXT: movapd 240(%rdi), %xmm10 -; SSE-NEXT: movapd 256(%rdi), %xmm13 +; SSE-NEXT: movapd 256(%rdi), %xmm15 ; SSE-NEXT: movapd 48(%rdi), %xmm9 -; SSE-NEXT: movapd 64(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm11 +; SSE-NEXT: movapd %xmm11, %xmm8 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm2[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm15[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm13, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm12 +; SSE-NEXT: movapd %xmm12, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm6[0],xmm13[1] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm15, %xmm12 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm3[0] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm11, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] +; SSE-NEXT: movapd %xmm14, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1] ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm11[0],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm14[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 336(%rdi), %xmm13 +; SSE-NEXT: movapd 336(%rdi), %xmm7 ; SSE-NEXT: movapd 352(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm13[0],xmm7[1] -; SSE-NEXT: movapd 368(%rdi), %xmm11 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm11[0] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd %xmm0, %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] +; SSE-NEXT: movapd 368(%rdi), %xmm14 +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm14[0] +; 
SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movapd 288(%rdi), %xmm0 ; SSE-NEXT: movapd 304(%rdi), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm1 @@ -391,25 +391,25 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm6[0] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] ; SSE-NEXT: movapd (%rdi), %xmm2 -; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm3 +; SSE-NEXT: movapd 16(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd 32(%rdi), %xmm4 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd 32(%rdi), %xmm5 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm5[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm1, 96(%rsi) -; SSE-NEXT: movapd %xmm14, 32(%rsi) -; SSE-NEXT: movapd %xmm7, 112(%rsi) -; SSE-NEXT: movapd %xmm15, 48(%rsi) -; SSE-NEXT: movapd %xmm8, 64(%rsi) +; SSE-NEXT: movapd %xmm13, 32(%rsi) +; SSE-NEXT: movapd %xmm8, 112(%rsi) +; SSE-NEXT: movapd %xmm11, 48(%rsi) +; SSE-NEXT: movapd %xmm15, 64(%rsi) ; SSE-NEXT: movapd %xmm3, (%rsi) ; SSE-NEXT: movapd %xmm12, 80(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movapd %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movapd %xmm13, 112(%rdx) +; SSE-NEXT: movapd %xmm7, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -418,7 +418,7 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm10, 80(%rdx) ; SSE-NEXT: movapd %xmm9, 16(%rdx) ; SSE-NEXT: movapd %xmm6, 96(%rcx) -; SSE-NEXT: movapd %xmm11, 112(%rcx) +; SSE-NEXT: movapd %xmm14, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -427,76 +427,76 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movapd %xmm4, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movapd %xmm5, (%rcx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm7[1],ymm5[0],ymm7[3],ymm5[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vshufpd 
{{.*#+}} ymm6 = ymm8[1],ymm2[0],ymm8[3],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[1],ymm5[0],ymm6[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm7[1],ymm4[0],ymm7[3],ymm4[2] ; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm9[1],ymm1[0],ymm9[3],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm9[1],ymm2[0],ymm9[3],ymm2[2] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm8[1],ymm0[0],ymm8[3],ymm0[2] ; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd %ymm9, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 96(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm11, 64(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) +; 
AVX1-ONLY-NEXT: vmovapd %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 96(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride3_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 @@ -506,40 +506,40 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm9[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm8[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} 
ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = mem[0,1,0,3] @@ -548,18 +548,18 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -622,17 +622,17 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 112(%rdi), %xmm0 ; SSE-NEXT: movapd 144(%rdi), %xmm11 ; SSE-NEXT: movapd 160(%rdi), %xmm1 -; SSE-NEXT: movapd 192(%rdi), %xmm12 +; SSE-NEXT: movapd 192(%rdi), %xmm13 ; SSE-NEXT: movapd 208(%rdi), %xmm2 -; SSE-NEXT: movapd 240(%rdi), %xmm13 +; SSE-NEXT: movapd 240(%rdi), %xmm14 ; SSE-NEXT: movapd 256(%rdi), %xmm3 -; SSE-NEXT: movapd 48(%rdi), %xmm14 +; SSE-NEXT: movapd 48(%rdi), %xmm12 ; SSE-NEXT: movapd 64(%rdi), %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] +; SSE-NEXT: 
movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm7[0] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm7[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm0, %xmm4 @@ -650,17 +650,17 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm6[0] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm6[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm3, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm9[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm2 @@ -700,7 +700,7 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 464(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 480(%rdi), %xmm2 @@ -713,19 +713,19 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 528(%rdi), %xmm15 +; SSE-NEXT: movapd 528(%rdi), %xmm2 ; SSE-NEXT: movapd 544(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] ; SSE-NEXT: movapd 560(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movapd 576(%rdi), %xmm12 ; SSE-NEXT: movapd 592(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm14 -; SSE-NEXT: 
movsd {{.*#+}} xmm14 = xmm12[0],xmm14[1] +; SSE-NEXT: movapd %xmm0, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd 608(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -734,50 +734,49 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 640(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm11 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; SSE-NEXT: movapd 656(%rdi), %xmm13 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm13[0] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd 656(%rdi), %xmm14 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm14[0] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movapd 672(%rdi), %xmm6 -; SSE-NEXT: movapd 688(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd 688(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] ; SSE-NEXT: movapd 704(%rdi), %xmm10 ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd 720(%rdi), %xmm4 -; SSE-NEXT: movapd 736(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: movapd 720(%rdi), %xmm5 +; SSE-NEXT: movapd 736(%rdi), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: movapd 752(%rdi), %xmm7 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm7[0] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] -; SSE-NEXT: movapd (%rdi), %xmm2 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] +; SSE-NEXT: movapd (%rdi), %xmm3 ; SSE-NEXT: movapd 16(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] ; SSE-NEXT: movapd 32(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm9[0] +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm9[0] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: movapd %xmm5, 224(%rsi) +; SSE-NEXT: movapd %xmm4, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movapd %xmm3, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movapd %xmm1, 240(%rsi) +; SSE-NEXT: movapd %xmm15, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movapd %xmm14, 192(%rsi) +; SSE-NEXT: movapd %xmm13, 192(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movapd %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm2, (%rsi) ; SSE-NEXT: movapd %xmm11, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps 
%xmm0, 144(%rsi) @@ -786,15 +785,16 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movapd %xmm6, 224(%rdx) -; SSE-NEXT: movapd %xmm4, 240(%rdx) +; SSE-NEXT: movapd %xmm5, 240(%rdx) ; SSE-NEXT: movapd %xmm12, 192(%rdx) ; SSE-NEXT: movapd %xmm8, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rdx) -; SSE-NEXT: movapd %xmm15, 176(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rdx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rdx) @@ -808,15 +808,15 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movapd %xmm2, (%rdx) +; SSE-NEXT: movapd %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movapd %xmm7, 240(%rcx) ; SSE-NEXT: movapd %xmm10, 224(%rcx) -; SSE-NEXT: movapd %xmm13, 208(%rcx) +; SSE-NEXT: movapd %xmm14, 208(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rcx) @@ -845,35 +845,35 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-LABEL: load_i64_stride3_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $232, %rsp -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm10[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm10[0],ymm12[3],ymm10[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm9[0],ymm6[3],ymm9[2] ; AVX1-ONLY-NEXT: 
vbroadcastsd 176(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm8[0],ymm11[3],ymm8[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm13[0],ymm15[3],ymm13[2] ; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm10[0],ymm3[3],ymm10[2] ; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm13[0],ymm2[3],ymm13[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm11[0],ymm2[3],ymm11[2] ; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm12[0],ymm7[3],ymm12[2] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -884,93 +884,93 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1],ymm1[0],ymm6[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[1],ymm1[0],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm2[0],ymm8[3],ymm2[2] ; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm0[0],ymm10[1],ymm0[2],ymm10[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm0[0],ymm6[1],ymm0[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: 
vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm6[0],ymm7[1],ymm6[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm15[0],ymm7[1],ymm15[2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm7[0],ymm13[1],ymm7[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2],ymm7[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm7[0],ymm3[1],ymm7[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm3[1],mem[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm3[1],mem[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm3[0],ymm7[1],ymm3[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm3[1],ymm5[2],ymm3[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm8[1],mem[2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} 
ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[0],ymm3[1],mem[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2],ymm11[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 128(%rsi) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd %ymm8, 192(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 128(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm15, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 224(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 160(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 
192(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 224(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 224(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 160(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 96(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm12, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 32(%rcx) ; AVX1-ONLY-NEXT: addq $232, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -982,7 +982,7 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 @@ -1007,7 +1007,7 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 @@ -1031,11 +1031,11 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm7 @@ -1049,44 +1049,44 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] ; 
AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm4 
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = mem[0,1,0,3] @@ -1119,77 +1119,77 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 224(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) ; AVX2-ONLY-NEXT: addq $232, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512-LABEL: load_i64_stride3_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,3,6,9,12,15,u,u> ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 +; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] -; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 -; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm16 -; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 -; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm15 +; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm16 +; 
AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm16 +; AVX512-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 ; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <1,4,7,10,13,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm17 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] -; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 +; AVX512-NEXT: vpermt2q %zmm3, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 -; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm20 -; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 -; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 +; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm19 +; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm20 +; AVX512-NEXT: vpermt2q %zmm0, %zmm18, %zmm20 +; AVX512-NEXT: vpermi2q %zmm9, %zmm8, %zmm14 ; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <10,13,0,3,6,u,u,u> -; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] -; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 -; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 -; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm2 -; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 -; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm4 +; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm4 +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm9 +; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) @@ -1198,10 +1198,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <96 x i64>, ptr %in.vec, align 64 @@ -1422,7 +1422,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 1024(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movapd 1040(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1470,8 +1470,8 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1248(%rdi), %xmm2 ; SSE-NEXT: movapd 1264(%rdi), 
%xmm0 -; SSE-NEXT: movapd %xmm0, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd %xmm0, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] ; SSE-NEXT: movapd 1280(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1479,56 +1479,56 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1296(%rdi), %xmm15 ; SSE-NEXT: movapd 1312(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm15[0],xmm11[1] +; SSE-NEXT: movapd %xmm0, %xmm10 +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1] ; SSE-NEXT: movapd 1328(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1344(%rdi), %xmm12 ; SSE-NEXT: movapd 1360(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1] +; SSE-NEXT: movapd %xmm0, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] ; SSE-NEXT: movapd 1376(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd 1392(%rdi), %xmm10 +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1392(%rdi), %xmm11 ; SSE-NEXT: movapd 1408(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1] ; SSE-NEXT: movapd 1424(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1440(%rdi), %xmm9 +; SSE-NEXT: movapd 1440(%rdi), %xmm8 ; SSE-NEXT: movapd 1456(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movapd 1472(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm3[0] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1488(%rdi), %xmm0 -; SSE-NEXT: movapd 1504(%rdi), %xmm8 -; SSE-NEXT: movapd %xmm8, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd 1520(%rdi), %xmm13 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm13[0] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] -; SSE-NEXT: movapd (%rdi), %xmm8 -; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] -; SSE-NEXT: movapd 32(%rdi), %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: movapd 1472(%rdi), %xmm4 ; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] -; SSE-NEXT: movapd %xmm3, 496(%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1488(%rdi), %xmm0 +; SSE-NEXT: movapd 1504(%rdi), %xmm9 +; SSE-NEXT: movapd %xmm9, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd 1520(%rdi), %xmm14 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm14[0] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm9[0],xmm14[1] +; SSE-NEXT: movapd 
(%rdi), %xmm9 +; SSE-NEXT: movapd 16(%rdi), %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] +; SSE-NEXT: movapd 32(%rdi), %xmm5 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd %xmm4, 496(%rsi) ; SSE-NEXT: movapd %xmm1, 480(%rsi) ; SSE-NEXT: movapd %xmm2, 464(%rsi) -; SSE-NEXT: movapd %xmm7, 448(%rsi) -; SSE-NEXT: movapd %xmm11, 432(%rsi) -; SSE-NEXT: movapd %xmm14, 416(%rsi) +; SSE-NEXT: movapd %xmm3, 448(%rsi) +; SSE-NEXT: movapd %xmm10, 432(%rsi) +; SSE-NEXT: movapd %xmm13, 416(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 400(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1537,7 +1537,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 368(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 352(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 336(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 320(%rsi) @@ -1579,10 +1579,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movapd %xmm6, (%rsi) +; SSE-NEXT: movapd %xmm7, (%rsi) ; SSE-NEXT: movapd %xmm0, 496(%rdx) -; SSE-NEXT: movapd %xmm9, 480(%rdx) -; SSE-NEXT: movapd %xmm10, 464(%rdx) +; SSE-NEXT: movapd %xmm8, 480(%rdx) +; SSE-NEXT: movapd %xmm11, 464(%rdx) ; SSE-NEXT: movapd %xmm12, 448(%rdx) ; SSE-NEXT: movapd %xmm15, 432(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1637,13 +1637,13 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm8, (%rdx) -; SSE-NEXT: movapd %xmm13, 496(%rcx) +; SSE-NEXT: movapd %xmm9, (%rdx) +; SSE-NEXT: movapd %xmm14, 496(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 432(%rcx) @@ -1699,61 +1699,54 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm4, (%rcx) +; SSE-NEXT: movapd %xmm5, (%rcx) ; SSE-NEXT: addq $1176, %rsp # imm = 0x498 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX1-ONLY-NEXT: subq $1192, %rsp # imm = 0x4A8 ; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm7 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm6[0],ymm7[3],ymm6[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm6[0],ymm1[3],ymm6[2] ; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm5[0],ymm8[3],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm11[0],ymm8[3],ymm11[2] ; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm13[0],ymm7[3],ymm13[2] ; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm3[0],ymm10[3],ymm3[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[2] ; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm2[0],ymm11[3],ymm2[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = 
ymm4[1],ymm3[0],ymm4[3],ymm3[2] ; AVX1-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[2] ; AVX1-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] @@ -1765,10 +1758,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2] ; AVX1-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1780,42 +1773,45 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[3],ymm13[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm5[0],ymm0[3],ymm5[2] ; AVX1-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm4[0],ymm11[3],ymm4[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm4[0],ymm14[3],ymm4[2] ; AVX1-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm2[0],ymm6[3],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm2[0],ymm7[3],ymm2[2] ; AVX1-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1827,130 +1823,129 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm11[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; 
AVX1-ONLY-NEXT: vblendps $51, (%rsp), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3],ymm0[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd $5, (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1264(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm0[1],mem[2],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm0[1],mem[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2],ymm10[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm1[1],ymm6[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = 
ymm7[0],ymm1[1],ymm7[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0],ymm1[1],ymm11[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm1[1],mem[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm4[1],mem[2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0],ymm9[1],ymm4[2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm4[0],ymm11[1],ymm4[2],ymm11[3] ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm4[1],mem[2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm12[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2],ymm11[3] +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[0],ymm11[1],mem[2],ymm11[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm11[0],ymm13[1],ymm11[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovapd %ymm7, 448(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 384(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 320(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 256(%rsi) +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovapd %ymm8, 448(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 384(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 320(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 256(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 128(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 416(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 480(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 416(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1995,21 +1990,22 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 192(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 256(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 320(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 192(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups 
(%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) @@ -2021,7 +2017,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX1-ONLY-NEXT: addq $1192, %rsp # imm = 0x4A8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2235,7 +2231,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm7 @@ -2244,16 +2240,15 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] @@ -2263,7 +2258,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] @@ -2272,7 +2267,8 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] @@ -2319,47 +2315,46 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 192(%rsi) +; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 320(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) @@ -2395,10 +2390,11 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) @@ -2408,124 +2404,127 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride3_vf64: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rax ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm21 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 
832(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm24 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,3,6,9,12,15,u,u> -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm14 +; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm16 ; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512-NEXT: vpermt2q %zmm12, %zmm11, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = <1,4,7,10,13,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512-NEXT: vpermt2q %zmm12, %zmm11, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm30 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <1,4,7,10,13,u,u,u> +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = <10,13,0,3,6,u,u,u> -; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm30 -; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 -; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm29 -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm28 -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm27 -; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm26 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm26 -; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm22 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm13 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm11 -; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm25 +; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm8 +; 
AVX512-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm27 +; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm24 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512-NEXT: vpermt2q %zmm12, %zmm18, %zmm25 +; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm22 +; AVX512-NEXT: vpermt2q %zmm23, %zmm31, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512-NEXT: vpermt2q %zmm15, %zmm18, %zmm23 +; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512-NEXT: vpermt2q %zmm13, %zmm18, %zmm20 +; AVX512-NEXT: vpermt2q %zmm21, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512-NEXT: vpermi2q %zmm21, %zmm26, %zmm11 +; AVX512-NEXT: vpermi2q %zmm21, %zmm26, %zmm18 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm21 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm16 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm17 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm18 -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm14 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm16 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm19 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm28 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm29 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm30 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm29 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm28 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm20 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm17 ; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm24 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm25 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm22 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm18 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm8 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm9 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm7 ; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm12 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm10 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm13 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm20, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm17, 256(%rsi) +; AVX512-NEXT: vpermt2q %zmm4, 
%zmm31, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm15 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm13 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm30, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm29, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm28, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm26, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm27, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm28, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm29, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm30, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm24, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm25, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm24, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, 384(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm13, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 448(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm12, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rcx) +; AVX512-NEXT: popq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <192 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll index e2bf6ef15fdea7..82c5b1f2a81de8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -224,34 +224,34 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps 208(%rdi), %xmm3 ; SSE-NEXT: movaps 176(%rdi), %xmm9 ; SSE-NEXT: movaps 144(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm11 +; SSE-NEXT: movaps 112(%rdi), %xmm10 ; SSE-NEXT: movaps 80(%rdi), %xmm2 ; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm13 -; SSE-NEXT: movaps 224(%rdi), %xmm14 +; SSE-NEXT: movaps 32(%rdi), %xmm11 +; SSE-NEXT: movaps 224(%rdi), %xmm12 ; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 160(%rdi), %xmm15 +; SSE-NEXT: movaps 160(%rdi), %xmm13 ; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 96(%rdi), %xmm12 +; SSE-NEXT: movaps 96(%rdi), %xmm14 ; SSE-NEXT: movaps 64(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] -; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] -; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: movaps %xmm8, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = 
xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm14[1] +; SSE-NEXT: movaps %xmm6, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm13[1] +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] ; SSE-NEXT: movaps %xmm3, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm5[0] @@ -261,17 +261,17 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movaps %xmm15, 48(%rsi) -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps %xmm12, 32(%rsi) -; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movaps %xmm13, 48(%rsi) +; SSE-NEXT: movaps %xmm12, (%rsi) +; SSE-NEXT: movaps %xmm14, 32(%rsi) +; SSE-NEXT: movaps %xmm15, 16(%rsi) ; SSE-NEXT: movaps %xmm7, 48(%rdx) ; SSE-NEXT: movaps %xmm4, (%rdx) ; SSE-NEXT: movaps %xmm6, 32(%rdx) ; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: movaps %xmm9, 48(%rcx) -; SSE-NEXT: movaps %xmm11, 32(%rcx) -; SSE-NEXT: movaps %xmm13, 16(%rcx) +; SSE-NEXT: movaps %xmm10, 32(%rcx) +; SSE-NEXT: movaps %xmm11, 16(%rcx) ; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%r8) @@ -282,17 +282,17 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX1-ONLY-LABEL: load_i64_stride4_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm5[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 @@ -300,25 +300,25 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm11[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm13[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm15, 32(%rsi) @@ -327,15 +327,15 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -462,12 +462,12 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} 
xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm3 @@ -480,33 +480,33 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 80(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps 144(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 144(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movaps 208(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 304(%rdi), %xmm0 ; SSE-NEXT: movaps 272(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm10 @@ -532,18 +532,16 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps %xmm11, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm14, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -565,7 +563,8 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps %xmm6, 96(%rcx) -; SSE-NEXT: movaps %xmm14, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps %xmm4, 112(%rcx) ; SSE-NEXT: movaps %xmm13, 48(%rcx) ; SSE-NEXT: movaps %xmm10, 64(%rcx) @@ -577,9 +576,10 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm5, 96(%r8) ; SSE-NEXT: movaps %xmm8, 80(%r8) ; SSE-NEXT: movaps %xmm7, 64(%r8) -; SSE-NEXT: movaps %xmm12, 48(%r8) -; SSE-NEXT: movaps %xmm11, 32(%r8) -; SSE-NEXT: movaps %xmm15, 16(%r8) +; SSE-NEXT: movaps %xmm15, 48(%r8) +; SSE-NEXT: movaps %xmm12, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq @@ -595,36 +595,36 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm8[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm13[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpckhpd 
{{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm12[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -642,44 +642,44 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm3[0],xmm4[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -706,15 +706,15 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -722,9 +722,9 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; 
AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) ; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -732,69 +732,69 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-LABEL: load_i64_stride4_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm4, %ymm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm4, %ymm12 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm9, %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm14 +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 @@ -802,14 +802,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] 
; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] @@ -834,8 +834,9 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) @@ -926,48 +927,48 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $664, %rsp # imm = 0x298 ; SSE-NEXT: movaps 416(%rdi), %xmm0 ; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 160(%rdi), %xmm3 +; SSE-NEXT: movaps 160(%rdi), %xmm1 ; SSE-NEXT: movaps 128(%rdi), %xmm8 -; SSE-NEXT: movaps 480(%rdi), %xmm1 +; SSE-NEXT: movaps 480(%rdi), %xmm2 ; SSE-NEXT: movaps 448(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps 224(%rdi), %xmm3 ; SSE-NEXT: movaps 192(%rdi), %xmm10 ; SSE-NEXT: movaps 288(%rdi), %xmm4 -; SSE-NEXT: movaps 256(%rdi), %xmm12 -; SSE-NEXT: movaps 608(%rdi), %xmm2 +; SSE-NEXT: movaps 256(%rdi), %xmm13 +; SSE-NEXT: movaps 608(%rdi), %xmm5 ; SSE-NEXT: movaps 352(%rdi), %xmm6 ; SSE-NEXT: movaps 320(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 64(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm7[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, 
%xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -976,9 +977,9 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 576(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 ; SSE-NEXT: movaps 512(%rdi), %xmm1 @@ -1068,7 +1069,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 336(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 
432(%rdi), %xmm0 @@ -1077,7 +1078,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 464(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1093,16 +1094,16 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 624(%rdi), %xmm0 -; SSE-NEXT: movaps 592(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 592(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 688(%rdi), %xmm0 -; SSE-NEXT: movaps 656(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movaps 656(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 752(%rdi), %xmm0 ; SSE-NEXT: movaps 720(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, %xmm12 @@ -1211,7 +1212,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) @@ -1227,13 +1228,13 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm7, 208(%r8) ; SSE-NEXT: movaps %xmm9, 192(%r8) ; SSE-NEXT: movaps %xmm11, 176(%r8) -; SSE-NEXT: movaps %xmm13, 160(%r8) -; SSE-NEXT: movaps %xmm14, 144(%r8) +; SSE-NEXT: movaps %xmm14, 160(%r8) +; SSE-NEXT: movaps %xmm13, 144(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) @@ -1256,13 +1257,13 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm5[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} 
xmm6 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm7 @@ -1280,17 +1281,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm5[0],xmm3[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -1401,78 +1402,78 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm10 +; AVX1-ONLY-NEXT: 
vmovaps 320(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm14[0],ymm3[2],ymm14[2] ; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 
= mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload @@ -1556,10 +1557,10 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm11, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1597,11 +1598,11 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm10, %ymm10 @@ -1621,7 +1622,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1835,72 +1836,72 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm19 = 
mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm13 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,4,8,12,0,4,8,12] +; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512-NEXT: vpermt2q %zmm14, %zmm18, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm17 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm17[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512-NEXT: vpermt2q %zmm16, %zmm18, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm19 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm19[0,1,2,3],zmm17[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 +; AVX512-NEXT: vpermt2q %zmm3, %zmm18, %zmm20 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,5,9,13,1,5,9,13] +; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512-NEXT: vpermt2q %zmm14, %zmm20, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm22 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-NEXT: vpermt2q %zmm16, %zmm20, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm23 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512-NEXT: vpermt2q %zmm7, %zmm20, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512-NEXT: vpermt2q %zmm4, %zmm20, %zmm24 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] 
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm20, %zmm24 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,2,3],zmm24[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 ; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512-NEXT: vpermt2q %zmm16, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512-NEXT: vpermt2q %zmm11, %zmm24, %zmm27 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512-NEXT: vpermt2q %zmm7, %zmm24, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] @@ -1910,26 +1911,26 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512-NEXT: vpermt2q %zmm16, %zmm28, %zmm15 +; AVX512-NEXT: vpermt2q %zmm11, %zmm28, %zmm12 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm15[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 ; AVX512-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm13 ; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm13[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 128(%rdx) ; 
AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rcx) @@ -1937,7 +1938,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, 64(%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 @@ -2303,7 +2304,7 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 1168(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1264(%rdi), %xmm0 @@ -2340,7 +2341,7 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1584(%rdi), %xmm0 ; SSE-NEXT: movaps 1552(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2355,15 +2356,15 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 1712(%rdi), %xmm0 -; SSE-NEXT: movaps 1680(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 1680(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 1776(%rdi), %xmm0 -; SSE-NEXT: movaps 1744(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 1744(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 1840(%rdi), %xmm0 ; SSE-NEXT: movaps 1808(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm7 @@ -2376,19 +2377,19 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 1968(%rdi), %xmm0 ; SSE-NEXT: movaps 1936(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps 2032(%rdi), %xmm8 -; SSE-NEXT: movaps 2000(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: movaps 16(%rdi), %xmm10 +; SSE-NEXT: movaps 2000(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = 
xmm3[1],xmm8[1] +; SSE-NEXT: movaps 16(%rdi), %xmm12 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm8 +; SSE-NEXT: movaps %xmm12, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2517,12 +2518,12 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm1, 496(%rcx) -; SSE-NEXT: movaps %xmm3, 480(%rcx) +; SSE-NEXT: movaps %xmm2, 496(%rcx) +; SSE-NEXT: movaps %xmm1, 480(%rcx) ; SSE-NEXT: movaps %xmm5, 464(%rcx) ; SSE-NEXT: movaps %xmm7, 448(%rcx) -; SSE-NEXT: movaps %xmm11, 432(%rcx) -; SSE-NEXT: movaps %xmm14, 416(%rcx) +; SSE-NEXT: movaps %xmm10, 432(%rcx) +; SSE-NEXT: movaps %xmm13, 416(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2537,7 +2538,7 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 320(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rcx) @@ -2574,16 +2575,16 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps %xmm8, (%rcx) -; SSE-NEXT: movaps %xmm2, 496(%r8) +; SSE-NEXT: movaps %xmm3, 496(%r8) ; SSE-NEXT: movaps %xmm4, 480(%r8) ; SSE-NEXT: movaps %xmm6, 464(%r8) ; SSE-NEXT: movaps %xmm9, 448(%r8) -; SSE-NEXT: movaps %xmm12, 432(%r8) -; SSE-NEXT: movaps %xmm13, 416(%r8) +; SSE-NEXT: movaps %xmm11, 432(%r8) +; SSE-NEXT: movaps %xmm14, 416(%r8) ; SSE-NEXT: movaps %xmm15, 400(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%r8) @@ -2629,13 +2630,13 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r8) +; SSE-NEXT: movaps %xmm12, (%r8) ; SSE-NEXT: addq $1688, %rsp # imm = 0x698 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX1-ONLY-NEXT: subq $2664, %rsp # imm = 0xA68 ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -2846,7 +2847,7 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -2920,377 +2921,381 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm15[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; 
AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 
368(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 448(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: 
vmovaps %xmm1, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 
# 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rdx) +; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 288(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 256(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r8) -; AVX1-ONLY-NEXT: addq $2728, %rsp # imm = 0xAA8 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: addq $2664, %rsp # imm = 0xA68 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3833,194 +3838,196 @@ define void 
@load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride4_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 
768(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm12 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-NEXT: vpermt2q %zmm6, %zmm7, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,5,9,13,1,5,9,13] +; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; 
AVX512-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm29 ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm20 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm29 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm22 ; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm27 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm19 ; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512-NEXT: vpermt2q 
%zmm0, %zmm8, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm24 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm17 ; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm16 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm9 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm13 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm11 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -4028,102 +4035,102 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload ; AVX512-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm25[0,1,2,3],zmm30[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload ; AVX512-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm23[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm4[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm15[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload ; AVX512-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512-NEXT: # zmm23 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload +; AVX512-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm26[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,2,3],zmm16[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512-NEXT: # zmm26 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload -; AVX512-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload -; AVX512-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm27[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm27 # 64-byte Folded Reload +; AVX512-NEXT: # zmm27 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: 
vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload +; AVX512-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm19[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm9[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm26, 256(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm25, 256(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm3, 128(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm23, 320(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm15, 192(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm3, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm24, 448(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm24, 320(%r8) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 384(%r8) +; AVX512-NEXT: vmovdqa64 %zmm17, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm20, 256(%r8) +; AVX512-NEXT: vmovdqa64 %zmm29, 320(%r8) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm27, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512-NEXT: vmovdqa64 %zmm12, 64(%r8) +; AVX512-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index 08763079d0c68d..e3bb2271968977 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -111,8 +111,8 @@ define void @load_i64_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i64_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movapd 144(%rdi), %xmm1 -; SSE-NEXT: movapd 64(%rdi), %xmm0 +; SSE-NEXT: movapd 144(%rdi), %xmm0 +; SSE-NEXT: movapd 64(%rdi), %xmm1 ; SSE-NEXT: movapd 96(%rdi), %xmm2 ; SSE-NEXT: movapd 128(%rdi), %xmm3 ; SSE-NEXT: movapd (%rdi), %xmm4 @@ -129,10 +129,10 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm7[0] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm10, 16(%rsi) ; SSE-NEXT: movapd %xmm11, (%rsi) ; SSE-NEXT: movapd %xmm8, 16(%rdx) @@ -141,8 +141,8 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd %xmm7, (%rcx) ; SSE-NEXT: movapd %xmm2, 16(%r8) ; SSE-NEXT: movapd %xmm5, (%r8) -; SSE-NEXT: movapd %xmm1, 16(%r9) -; SSE-NEXT: movapd %xmm0, (%r9) +; SSE-NEXT: movapd %xmm0, 16(%r9) +; SSE-NEXT: movapd %xmm1, (%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf4: @@ -305,29 +305,29 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd 208(%rdi), %xmm6 ; SSE-NEXT: movapd (%rdi), %xmm9 ; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd 32(%rdi), %xmm14 +; SSE-NEXT: movapd 32(%rdi), %xmm12 ; SSE-NEXT: movapd 48(%rdi), %xmm8 ; SSE-NEXT: movapd 240(%rdi), %xmm11 ; SSE-NEXT: movapd 272(%rdi), %xmm13 ; SSE-NEXT: movapd 160(%rdi), %xmm10 -; SSE-NEXT: movapd 192(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] +; SSE-NEXT: movapd 192(%rdi), %xmm14 +; SSE-NEXT: movapd %xmm14, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm0[0] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm9[0],xmm15[1] +; SSE-NEXT: movapd %xmm12, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm9[0],xmm14[1] ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm8[0] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm14 
-; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: movapd %xmm13, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] ; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] @@ -346,9 +346,9 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] ; SSE-NEXT: movapd %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm14, 48(%rsi) -; SSE-NEXT: movapd %xmm15, (%rsi) -; SSE-NEXT: movapd %xmm12, 32(%rsi) +; SSE-NEXT: movapd %xmm12, 48(%rsi) +; SSE-NEXT: movapd %xmm14, (%rsi) +; SSE-NEXT: movapd %xmm15, 32(%rsi) ; SSE-NEXT: movapd %xmm13, 16(%rdx) ; SSE-NEXT: movapd %xmm11, 48(%rdx) ; SSE-NEXT: movapd %xmm9, (%rdx) @@ -398,36 +398,36 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[3],ymm14[2] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 +; 
AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[3],ymm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm12[0],ymm13[3],ymm12[2] ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm4[0],xmm15[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm4[0],xmm14[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm5[0],xmm0[1] @@ -437,9 +437,9 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 32(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 32(%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm8, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm1, (%r9) @@ -448,70 +448,70 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm15[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm10 -; 
AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm15[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = 
ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,3] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm13[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-ONLY-NEXT: vzeroupper @@ -652,54 +652,54 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movapd 224(%rdi), %xmm3 -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 64(%rdi), %xmm1 -; SSE-NEXT: movapd 176(%rdi), %xmm4 -; SSE-NEXT: movapd 96(%rdi), %xmm5 -; SSE-NEXT: movapd 208(%rdi), %xmm7 -; SSE-NEXT: movapd 128(%rdi), %xmm8 -; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm6 -; SSE-NEXT: movapd 32(%rdi), %xmm14 -; SSE-NEXT: movapd 48(%rdi), %xmm9 -; SSE-NEXT: movapd 160(%rdi), %xmm11 -; SSE-NEXT: movapd 192(%rdi), %xmm13 -; SSE-NEXT: movapd 80(%rdi), %xmm12 -; SSE-NEXT: movapd 112(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm14, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] +; SSE-NEXT: movapd 224(%rdi), %xmm5 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 64(%rdi), %xmm3 +; SSE-NEXT: movapd 176(%rdi), %xmm7 +; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: movapd 208(%rdi), %xmm10 +; SSE-NEXT: movapd 128(%rdi), %xmm9 +; SSE-NEXT: movapd 
(%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm8 +; SSE-NEXT: movapd 32(%rdi), %xmm0 +; SSE-NEXT: movapd 48(%rdi), %xmm11 +; SSE-NEXT: movapd 160(%rdi), %xmm14 +; SSE-NEXT: movapd 192(%rdi), %xmm1 +; SSE-NEXT: movapd 80(%rdi), %xmm13 +; SSE-NEXT: movapd 112(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm11[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm3[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm10[0] +; SSE-NEXT: movapd %xmm14, (%rsp) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -710,7 +710,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 256(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 304(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -731,12 +731,12 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 400(%rdi), %xmm11 +; SSE-NEXT: movapd 400(%rdi), %xmm10 ; SSE-NEXT: movapd 432(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] ; SSE-NEXT: movapd 448(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm12[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm12[0] ; SSE-NEXT: movapd 416(%rdi), %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm14[0],xmm12[1] ; SSE-NEXT: movapd 464(%rdi), %xmm1 @@ -745,49 +745,49 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 480(%rdi), %xmm4 ; SSE-NEXT: movapd 512(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd 528(%rdi), %xmm7 ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm7[0] ; SSE-NEXT: movapd 496(%rdi), %xmm9 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] -; SSE-NEXT: movapd 544(%rdi), %xmm10 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd 544(%rdi), %xmm11 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: movapd 560(%rdi), %xmm0 -; SSE-NEXT: movapd 592(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd 592(%rdi), %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd 608(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] -; SSE-NEXT: movapd 576(%rdi), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd 576(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd 624(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm8[0] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: movapd %xmm6, 96(%rsi) +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] +; SSE-NEXT: movapd %xmm5, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 32(%rsi) -; SSE-NEXT: movapd %xmm2, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 
64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movapd %xmm3, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movapd %xmm13, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movapd %xmm4, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rdx) ; SSE-NEXT: movapd %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movapd %xmm15, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movapd %xmm11, 80(%rdx) +; SSE-NEXT: movapd %xmm10, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movapd %xmm7, 96(%rcx) @@ -797,13 +797,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm12, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm3, 112(%r8) +; SSE-NEXT: movapd %xmm2, 112(%r8) ; SSE-NEXT: movapd %xmm9, 96(%r8) ; SSE-NEXT: movapd %xmm14, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -817,7 +817,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movapd %xmm8, 112(%r9) -; SSE-NEXT: movapd %xmm10, 96(%r9) +; SSE-NEXT: movapd %xmm11, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -838,166 +838,166 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168 ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = 
ymm3[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm11[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm5[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm11[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm13[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm14[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm0[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm13[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = 
ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm0[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1],ymm15[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm15[0],ymm2[3],ymm15[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm11[0],ymm1[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm5[0],ymm14[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[3],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm14[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), 
%xmm9 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm9[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2] ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm9[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[3],ymm7[2] ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm7[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 
# 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm7, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm6, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 32(%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm1, (%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%r9) @@ -1008,170 +1008,172 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: subq $392, %rsp # imm = 0x188 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; 
AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = 
xmm7[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 
256(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = 
mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = ymm9[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; 
AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 64(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r9) +; AVX2-ONLY-NEXT: addq $392, %rsp # imm = 0x188 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1183,86 +1185,86 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,1,6,0,12,1,6,0] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm11 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm11 ; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] ; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = 
[0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm17 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm16 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [7,12,0,2,7,12,0,2] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <11,0,5,u> ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm19 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,0,11,0,5,0,11] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,5,0,11,0,5,0,11] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <12,1,6,u> +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; 
AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm15, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm17, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%r8) @@ -1280,86 +1282,86 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,1,6,0,12,1,6,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm11 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: 
vpermt2q %zmm6, %zmm13, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm16 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <11,0,5,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <12,1,6,u> +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm1 +; AVX512BW-NEXT: 
vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) @@ -1391,17 +1393,17 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 64(%rdi), %xmm3 ; SSE-NEXT: movapd 176(%rdi), %xmm7 ; SSE-NEXT: movapd 96(%rdi), %xmm6 -; SSE-NEXT: movapd 208(%rdi), %xmm9 -; SSE-NEXT: movapd 128(%rdi), %xmm10 +; SSE-NEXT: movapd 208(%rdi), %xmm10 +; SSE-NEXT: movapd 128(%rdi), %xmm9 ; SSE-NEXT: movapd (%rdi), %xmm12 ; SSE-NEXT: movapd 16(%rdi), %xmm8 -; SSE-NEXT: movapd 32(%rdi), %xmm1 +; SSE-NEXT: movapd 32(%rdi), %xmm0 ; SSE-NEXT: movapd 48(%rdi), %xmm11 -; SSE-NEXT: movapd 160(%rdi), %xmm13 -; SSE-NEXT: movapd 192(%rdi), %xmm0 -; SSE-NEXT: movapd 80(%rdi), %xmm14 +; SSE-NEXT: movapd 160(%rdi), %xmm14 +; SSE-NEXT: movapd 192(%rdi), %xmm1 +; SSE-NEXT: movapd 80(%rdi), %xmm13 ; SSE-NEXT: movapd 112(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm1, %xmm15 +; SSE-NEXT: movapd %xmm0, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm11[0] @@ -1410,29 +1412,29 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm3[0] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm10[0] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm10[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 272(%rdi), %xmm0 @@ -1588,53 +1590,53 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] ; SSE-NEXT: movapd 1024(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1040(%rdi), %xmm8 +; SSE-NEXT: movapd 1040(%rdi), %xmm7 ; SSE-NEXT: movapd 1072(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] ; SSE-NEXT: movapd 1088(%rdi), %xmm11 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm11[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm11[0] ; SSE-NEXT: movapd 1056(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: movapd 1104(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1120(%rdi), %xmm5 -; SSE-NEXT: movapd 1152(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd 1152(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: movapd 1168(%rdi), %xmm6 ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm6[0] ; SSE-NEXT: movapd 1136(%rdi), %xmm12 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm12[0],xmm6[1] ; SSE-NEXT: movapd 1184(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1200(%rdi), %xmm0 -; SSE-NEXT: movapd 1232(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd 1232(%rdi), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd 1248(%rdi), %xmm4 ; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm4[0] -; SSE-NEXT: movapd 1216(%rdi), %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: movapd 1216(%rdi), %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm8[0],xmm4[1] ; SSE-NEXT: movapd 1264(%rdi), %xmm13 -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm13[0] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] -; SSE-NEXT: movapd %xmm3, 224(%rsi) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movapd %xmm1, 240(%rsi) +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm13[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm3[0],xmm13[1] +; SSE-NEXT: movapd %xmm1, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movapd %xmm2, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1658,7 +1660,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm5, 224(%rdx) ; SSE-NEXT: movapd %xmm0, 240(%rdx) ; SSE-NEXT: movapd %xmm10, 192(%rdx) -; SSE-NEXT: movapd %xmm8, 208(%rdx) +; SSE-NEXT: movapd %xmm7, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1711,11 +1713,11 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm7, 240(%r8) +; SSE-NEXT: movapd %xmm8, 240(%r8) ; SSE-NEXT: movapd %xmm12, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%r8) @@ -1744,7 +1746,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm13, 240(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r9) @@ -1790,8 +1792,8 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3] @@ -1804,8 +1806,8 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovapd 
832(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm0 @@ -1814,18 +1816,19 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm12[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 @@ -1835,8 +1838,8 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 @@ -1865,465 +1868,462 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm10[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm10 +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm11[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = 
mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm14[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm13[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm15[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 
+; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; 
AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[3],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm15[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[0],ymm14[0],ymm4[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm15[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm15[0],ymm4[3],ymm15[2] ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] ; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm9[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[0],ymm7[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = 
xmm5[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; 
AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 
32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 160(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 
+; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 224(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 128(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm13, (%r9) ; AVX1-ONLY-NEXT: addq $1368, %rsp # imm = 0x558 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm14 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm13[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; 
AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2384,106 +2384,109 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm12[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; 
AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; 
AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm1[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm0[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, (%rsp), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) @@ -2531,8 +2534,9 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, 64(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2545,212 +2549,213 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 128(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r9) -; AVX2-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 192(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 128(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r9) +; AVX2-ONLY-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride5_vf32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: 
# zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm21, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,5,10,15] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm25, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm18, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <2,7,12,u> ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm11, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm18, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm18, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [7,12,0,2,7,12,0,2] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm31 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm19 +; 
AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <11,0,5,u> +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm29 = <12,1,6,u> +; AVX512F-NEXT: vpermt2q %zmm14, %zmm29, %zmm13 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm7 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm16, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm20, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm28[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm28 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm28 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm25[0,1,2,3],zmm21[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] 
+; AVX512F-NEXT: vpermt2q %zmm21, %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm25, %zmm28 ; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm2 ; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, 
%zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r9) +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm7 {%k1} +; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm28, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm13, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 
64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2758,197 +2763,198 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride5_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, 
%zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm8, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <2,7,12,u> ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512BW-NEXT: vmovdqa64 
704(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <11,0,5,u> +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = <12,1,6,u> +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm29, %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm7 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; 
AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm20, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm28 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm28 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm25[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm28 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm2 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r9) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3388,9 +3394,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2080(%rdi), %xmm2 ; SSE-NEXT: movapd 2112(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] ; SSE-NEXT: movapd 2128(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3434,10 +3439,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 2352(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm6 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] -; SSE-NEXT: movapd 2368(%rdi), %xmm15 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm15[0] +; SSE-NEXT: movapd 2368(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm1[0] ; SSE-NEXT: movapd 2336(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2384(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3473,8 +3479,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm6, 464(%rsi) ; SSE-NEXT: movapd %xmm8, 448(%rsi) ; SSE-NEXT: movapd %xmm11, 432(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 
416(%rsi) +; SSE-NEXT: movapd %xmm15, 416(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3588,7 +3593,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movapd %xmm10, 496(%rcx) ; SSE-NEXT: movapd %xmm13, 480(%rcx) -; SSE-NEXT: movapd %xmm15, 464(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 464(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4217,34 +4223,34 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 1296(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 1296(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm0 +; 
AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm5[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload @@ -4279,73 +4285,73 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[3],ymm7[2] ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm8[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[3],ymm7[2] ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = 
ymm5[0],ymm6[0],ymm5[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 1408(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] ; AVX1-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm13 @@ -4480,32 +4486,32 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm9[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vpblendw $15, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm6[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4634,12 +4640,12 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 480(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 448(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 480(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 448(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm5, 416(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 384(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 352(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 320(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 352(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 320(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm13, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm15, 256(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4664,7 +4670,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3240, %rsp # imm = 0xCA8 +; AVX2-ONLY-NEXT: subq $3256, %rsp # imm = 0xCB8 ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 @@ -4765,10 +4771,10 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 ; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 @@ -4778,8 +4784,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 @@ -4806,8 +4812,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm4 @@ -4816,8 +4822,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm3 @@ -4830,8 +4836,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm0 @@ -4841,78 +4847,78 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte 
Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = 
ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] @@ -4924,31 +4930,31 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded 
Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] @@ -4966,7 +4972,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm15[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] @@ -4975,14 +4981,14 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4993,7 +4999,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -5110,8 +5116,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5127,26 +5133,27 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5217,1206 +5224,1210 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = 
mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; 
AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 
224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 448(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 416(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 384(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 352(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 320(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 288(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 128(%r9) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 480(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 448(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 416(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 384(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 352(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 320(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 288(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 256(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 192(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 160(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: addq $3240, %rsp # imm = 0xCA8 +; AVX2-ONLY-NEXT: addq $3256, %rsp # imm = 0xCB8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm26 -; 
AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: subq $3464, %rsp # imm = 0xD88 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,1,6,0,12,1,6,0] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm18, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm18, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm18, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm21, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm21, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm21, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm21, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm13, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,0,11,0,5,0,11] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm4 -; AVX512F-NEXT: 
vmovdqa64 %zmm4, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm16 ; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm5 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm18 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm21 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <11,0,5,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; 
AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm29 = <12,1,6,u> +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm20 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm23 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm28 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm23 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm3, %zmm26 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, 
%zmm20 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm30 ; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 ; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm12, 
%zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm11 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm18 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm12 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm25, %zmm17 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm13 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: movb $7, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm29, %zmm8 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm0 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm20[0,1,2,3],zmm14[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm20 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm20 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm15 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm16[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm11, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm29, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm10 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm11, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm29 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm7 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm15 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: movb $56, %al +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm9 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: movb 
$7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 ; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm1 +; 
AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm18 +; AVX512F-NEXT: movb $56, %al +; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm21 ; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 448(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte 
Reload -; AVX512F-NEXT: vmovaps %zmm1, 384(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 256(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 448(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 256(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 320(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm8 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rsi) +; 
AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 320(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 448(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 256(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm17, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r9) -; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-NEXT: vmovdqa64 %zmm30, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%r9) +; AVX512F-NEXT: vmovdqa64 
%zmm1, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%r9) +; AVX512F-NEXT: addq $3464, %rsp # imm = 0xD88 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: subq $3464, %rsp # imm = 0xD88 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,1,6,0,12,1,6,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: 
vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 
%zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm16 ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <11,0,5,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = <12,1,6,u> +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm23 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm23 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; 
AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte 
Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm12 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm8 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm0 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm20[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm7, %zmm7 # 64-byte Folded Reload +; 
AVX512BW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm20 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm20 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm15 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm18 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload 
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, 
%zmm21 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 448(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 448(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; 
AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 256(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r9) -; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%r9) +; AVX512BW-NEXT: addq $3464, %rsp # imm = 0xD88 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <320 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 6e00c62f404e5a..1cee31a4eaa82b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -84,47 +84,47 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i64_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps 80(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm2 ; SSE-NEXT: movaps 128(%rdi), %xmm0 -; SSE-NEXT: movaps 64(%rdi), %xmm6 -; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps 112(%rdi), %xmm1 -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm4 -; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 64(%rdi), %xmm3 +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rdi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm6 +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rdi), %xmm8 ; SSE-NEXT: movaps 48(%rdi), %xmm9 ; SSE-NEXT: movaps 144(%rdi), %xmm10 ; SSE-NEXT: movaps 96(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] -; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movaps %xmm6, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm7[0] -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = 
xmm0[1],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm13, (%rsi) ; SSE-NEXT: movaps %xmm12, 16(%rsi) -; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: movaps %xmm6, (%rdx) ; SSE-NEXT: movaps %xmm11, 16(%rdx) ; SSE-NEXT: movaps %xmm10, (%rcx) ; SSE-NEXT: movaps %xmm9, 16(%rcx) -; SSE-NEXT: movaps %xmm4, (%r8) -; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movaps %xmm7, (%r9) -; SSE-NEXT: movaps %xmm6, 16(%r9) -; SSE-NEXT: movaps %xmm2, (%rax) +; SSE-NEXT: movaps %xmm7, (%r8) +; SSE-NEXT: movaps %xmm5, 16(%r8) +; SSE-NEXT: movaps %xmm4, (%r9) +; SSE-NEXT: movaps %xmm3, 16(%r9) +; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: retq ; @@ -218,84 +218,84 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-SLOW-LABEL: load_i64_stride6_vf4: ; AVX512-SLOW: # %bb.0: ; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,6,12,u> -; AVX512-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512-SLOW-NEXT: vpbroadcastq 144(%rdi), %ymm1 -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <1,7,13,u> -; AVX512-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 +; AVX512-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u> +; AVX512-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-SLOW-NEXT: vpbroadcastq 144(%rdi), %ymm3 +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,7,13,u> +; AVX512-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <10,0,6,u> -; AVX512-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 +; AVX512-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512-SLOW-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512-SLOW-NEXT: vpbroadcastq %xmm6, %ymm7 ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <11,1,7,u> -; AVX512-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512-SLOW-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX512-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [4,10] -; AVX512-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512-SLOW-NEXT: 
vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512-SLOW-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] ; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] -; AVX512-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512-SLOW-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-SLOW-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512-SLOW-NEXT: vmovdqa %ymm3, (%rdx) ; AVX512-SLOW-NEXT: vmovdqa %ymm5, (%rcx) ; AVX512-SLOW-NEXT: vmovdqa %ymm6, (%r8) ; AVX512-SLOW-NEXT: vmovdqa %ymm4, (%r9) -; AVX512-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-SLOW-NEXT: vzeroupper ; AVX512-SLOW-NEXT: retq ; ; AVX512-FAST-LABEL: load_i64_stride6_vf4: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,6,12,u> -; AVX512-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm1 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <1,7,13,u> -; AVX512-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 +; AVX512-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u> +; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm3 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,7,13,u> +; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <10,0,6,u> -; AVX512-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 +; AVX512-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,4] ; AVX512-FAST-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512-FAST-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 ; AVX512-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <11,1,7,u> -; AVX512-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] ; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] ; AVX512-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512-FAST-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] -; AVX512-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FAST-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] -; AVX512-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FAST-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FAST-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512-FAST-NEXT: vmovdqa %ymm3, (%rdx) ; AVX512-FAST-NEXT: vmovdqa %ymm6, (%rcx) ; AVX512-FAST-NEXT: vmovdqa %ymm5, (%r8) ; AVX512-FAST-NEXT: vmovdqa %ymm4, (%r9) -; AVX512-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX512-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-FAST-NEXT: vzeroupper ; AVX512-FAST-NEXT: retq %wide.vec = load <24 x i64>, ptr %in.vec, align 64 @@ -319,64 +319,64 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp ; SSE-NEXT: movaps 160(%rdi), %xmm10 -; SSE-NEXT: movaps 256(%rdi), %xmm13 -; SSE-NEXT: movaps 208(%rdi), %xmm5 -; SSE-NEXT: movaps 352(%rdi), %xmm15 -; SSE-NEXT: movaps 304(%rdi), %xmm6 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movaps 48(%rdi), %xmm1 -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 96(%rdi), %xmm11 -; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps 192(%rdi), %xmm12 -; SSE-NEXT: movaps 336(%rdi), %xmm4 -; SSE-NEXT: movaps 288(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0] -; SSE-NEXT: movaps %xmm14, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 256(%rdi), %xmm0 +; SSE-NEXT: movaps 208(%rdi), %xmm3 +; SSE-NEXT: movaps 352(%rdi), %xmm1 +; SSE-NEXT: movaps 304(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rdi), %xmm2 +; SSE-NEXT: movaps (%rdi), %xmm6 +; SSE-NEXT: movaps 16(%rdi), %xmm5 +; SSE-NEXT: movaps 48(%rdi), %xmm8 +; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 240(%rdi), %xmm12 +; SSE-NEXT: movaps 192(%rdi), %xmm14 +; SSE-NEXT: movaps 336(%rdi), %xmm9 +; SSE-NEXT: movaps 288(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1] +; SSE-NEXT: movaps %xmm14, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm12[0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm12[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; 
SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] -; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] +; SSE-NEXT: movaps 80(%rdi), %xmm15 ; SSE-NEXT: movaps 32(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm15[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 320(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 272(%rdi), %xmm0 ; SSE-NEXT: movaps 224(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm4 @@ -387,39 +387,39 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps %xmm12, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps %xmm12, (%rsi) +; SSE-NEXT: movaps %xmm13, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rdx) +; SSE-NEXT: movaps %xmm11, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm7, 16(%rcx) -; SSE-NEXT: movaps %xmm15, 32(%rcx) -; SSE-NEXT: movaps %xmm14, 48(%rcx) +; SSE-NEXT: movaps %xmm8, 32(%rcx) +; SSE-NEXT: movaps %xmm9, 
48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps %xmm13, 16(%r8) +; SSE-NEXT: movaps %xmm6, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps %xmm2, 16(%r9) ; SSE-NEXT: movaps %xmm4, 32(%r9) -; SSE-NEXT: movaps %xmm6, 48(%r9) -; SSE-NEXT: movaps %xmm8, (%r9) +; SSE-NEXT: movaps %xmm5, 48(%r9) +; SSE-NEXT: movaps %xmm14, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps %xmm5, 48(%rax) +; SSE-NEXT: movaps %xmm15, 48(%rax) ; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq @@ -433,10 +433,10 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm11 @@ -447,36 +447,36 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm10[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] ; AVX1-ONLY-NEXT: 
vmovaps 256(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm15[0],ymm8[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm10[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm8[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] @@ -484,12 +484,12 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],ymm3[1],ymm13[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm15[1],ymm9[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm13[1],ymm3[1],ymm13[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -497,12 +497,12 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) @@ -598,170 +598,170 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-LABEL: load_i64_stride6_vf8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512F-NEXT: movb $56, %dil ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 ; AVX512F-NEXT: movb $-64, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm1 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <1,7,13,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <10,0,6,u> -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 ; AVX512F-NEXT: movb $24, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <11,1,7,u> -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512F-NEXT: 
vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <1,7,13,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm7 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <10,0,6,u> -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 ; AVX512BW-NEXT: movb $24, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <11,1,7,u> -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <48 x i64>, ptr %in.vec, align 64 @@ -865,7 +865,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; SSE-NEXT: movaps 304(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm0 @@ -874,26 +874,25 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 640(%rdi), %xmm0 -; SSE-NEXT: movaps 592(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 592(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps 688(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps 688(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -902,31 +901,32 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps 128(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 128(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 272(%rdi), %xmm0 -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = 
xmm9[1],xmm0[1] +; SSE-NEXT: movaps 320(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 464(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movaps 416(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 560(%rdi), %xmm0 -; SSE-NEXT: movaps 512(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movaps 512(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 656(%rdi), %xmm0 ; SSE-NEXT: movaps 608(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm4 @@ -979,17 +979,16 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm11, 112(%r8) -; SSE-NEXT: movaps %xmm15, 96(%r8) +; SSE-NEXT: movaps %xmm15, 112(%r8) +; SSE-NEXT: movaps %xmm10, 96(%r8) +; SSE-NEXT: movaps %xmm9, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) @@ -1003,8 +1002,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm4, 96(%r9) ; SSE-NEXT: movaps %xmm7, 80(%r9) ; SSE-NEXT: movaps %xmm8, 64(%r9) -; SSE-NEXT: movaps %xmm10, 48(%r9) -; SSE-NEXT: movaps %xmm14, 32(%r9) +; SSE-NEXT: movaps %xmm13, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1012,11 +1012,11 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps %xmm3, 96(%rax) -; SSE-NEXT: movaps %xmm6, 80(%rax) -; SSE-NEXT: movaps %xmm5, 64(%rax) -; SSE-NEXT: movaps %xmm9, 48(%rax) -; SSE-NEXT: movaps %xmm12, 32(%rax) -; SSE-NEXT: movaps %xmm13, 16(%rax) +; SSE-NEXT: movaps %xmm5, 80(%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) +; SSE-NEXT: movaps %xmm12, 48(%rax) +; SSE-NEXT: movaps %xmm11, 32(%rax) +; SSE-NEXT: movaps %xmm14, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $408, %rsp # imm = 0x198 @@ -1037,34 +1037,34 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm6[0] ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1072,513 +1072,511 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; 
AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm3[0],xmm4[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm4[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm5[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 ; 
AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm10[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 512(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 512(%rdi), 
%ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm11[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm15[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] +; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) ; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride6_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 +; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] -; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[2],ymm13[2] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm6[0],xmm12[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm15[0],ymm8[2],ymm15[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm8[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd 
{{.*#+}} ymm2 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm12[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm8[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm4[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm10[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm8[1],ymm14[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm15[0],ymm8[0],ymm15[2],ymm8[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm10[0],xmm11[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm7[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 +; 
AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm13[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps 
%ymm11, 32(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm2[1],ymm15[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm10[1],xmm11[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 
32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride6_vf16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <0,6,12,u> +; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm7 ; AVX512F-NEXT: movb $56, %dil ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] ; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm16, %zmm15 ; AVX512F-NEXT: movb $-64, %dil ; AVX512F-NEXT: kmovw %edi, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512F-NEXT: vpermt2q %zmm3, %zmm16, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm18, %zmm17 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <1,7,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] ; 
AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2q %zmm6, %zmm17, %zmm19 ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <10,0,6,u> +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm17, %zmm19 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm20 = <10,0,6,u> +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512F-NEXT: movb $24, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm21 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,6,12,0,0,6,12] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm22 ; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm19 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm2, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = <11,1,7,u> ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm20 = <11,1,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 +; AVX512F-NEXT: 
vmovdqa64 %zmm19, %zmm23 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,1,7,13,0,1,7,13] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm19 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm2, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,10,0,6,0,10,0,6] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm18, %zmm19 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] ; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm25 ; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm25 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm16, %zmm26 ; AVX512F-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm25 {%k1} +; AVX512F-NEXT: vpermi2q %zmm5, %zmm2, %zmm24 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512F-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,11,1,7,0,11,1,7] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm18, %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm9 -; AVX512F-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rsi) +; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512F-NEXT: vinserti32x4 $0, %xmm9, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, 
%zmm18, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1586,142 +1584,142 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,6,12,u> +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm7 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm15 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] ; AVX512BW-NEXT: # zmm18 = 
mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm17 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <10,0,6,u> +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <10,0,6,u> +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512BW-NEXT: movb $24, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,6,12,0,0,6,12] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm22 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = 
[11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <11,1,7,u> ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <11,1,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,1,7,13,0,1,7,13] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm22 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,10,0,6,0,10,0,6] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm26 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm25 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm24 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,11,1,7,0,11,1,7] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, 
%zmm9 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rsi) +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1747,48 +1745,48 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $1176, %rsp # imm = 0x498 ; SSE-NEXT: movaps 624(%rdi), %xmm0 ; SSE-NEXT: movaps 576(%rdi), %xmm9 -; SSE-NEXT: movaps 240(%rdi), %xmm3 +; SSE-NEXT: movaps 240(%rdi), %xmm1 ; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps 720(%rdi), %xmm1 +; SSE-NEXT: movaps 720(%rdi), %xmm2 ; SSE-NEXT: movaps 672(%rdi), %xmm11 -; SSE-NEXT: movaps 336(%rdi), %xmm5 +; SSE-NEXT: movaps 336(%rdi), %xmm3 ; SSE-NEXT: movaps 288(%rdi), %xmm10 ; SSE-NEXT: movaps 432(%rdi), %xmm4 -; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps 912(%rdi), %xmm2 +; SSE-NEXT: movaps 384(%rdi), %xmm13 +; SSE-NEXT: movaps 912(%rdi), %xmm5 ; SSE-NEXT: movaps 528(%rdi), %xmm6 ; SSE-NEXT: movaps 480(%rdi), %xmm14 ; SSE-NEXT: movaps 144(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movaps 96(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm7[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] 
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1797,9 +1795,9 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 864(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps 768(%rdi), %xmm1 @@ -1963,12 +1961,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1504(%rdi), %xmm0 -; SSE-NEXT: movaps 1456(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 1456(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2010,7 +2007,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 656(%rdi), %xmm0 ; SSE-NEXT: movaps 608(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2019,43 +2016,44 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 752(%rdi), %xmm0 -; SSE-NEXT: movaps 704(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 704(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 848(%rdi), %xmm0 -; SSE-NEXT: movaps 800(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 800(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 944(%rdi), %xmm0 -; SSE-NEXT: movaps 896(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 1040(%rdi), %xmm0 -; SSE-NEXT: movaps 992(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 992(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 1136(%rdi), %xmm0 -; SSE-NEXT: movaps 1088(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movaps 1088(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 1232(%rdi), %xmm0 -; SSE-NEXT: movaps 1184(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps 1184(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 1328(%rdi), %xmm0 -; SSE-NEXT: movaps 1280(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 1328(%rdi), %xmm0 +; SSE-NEXT: movaps 1280(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 1424(%rdi), %xmm0 ; 
SSE-NEXT: movaps 1376(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm4 @@ -2162,8 +2160,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%r8) +; SSE-NEXT: movaps %xmm15, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2196,10 +2193,10 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps %xmm2, 240(%r9) ; SSE-NEXT: movaps %xmm4, 224(%r9) -; SSE-NEXT: movaps %xmm6, 208(%r9) +; SSE-NEXT: movaps %xmm7, 208(%r9) ; SSE-NEXT: movaps %xmm8, 192(%r9) ; SSE-NEXT: movaps %xmm11, 176(%r9) -; SSE-NEXT: movaps %xmm12, 160(%r9) +; SSE-NEXT: movaps %xmm14, 160(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2223,16 +2220,17 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 240(%rax) ; SSE-NEXT: movaps %xmm3, 224(%rax) -; SSE-NEXT: movaps %xmm5, 208(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) -; SSE-NEXT: movaps %xmm9, 176(%rax) -; SSE-NEXT: movaps %xmm10, 160(%rax) -; SSE-NEXT: movaps %xmm13, 144(%rax) -; SSE-NEXT: movaps %xmm15, 128(%rax) -; SSE-NEXT: movaps %xmm14, 112(%rax) +; SSE-NEXT: movaps %xmm6, 208(%rax) +; SSE-NEXT: movaps %xmm5, 192(%rax) +; SSE-NEXT: movaps %xmm10, 176(%rax) +; SSE-NEXT: movaps %xmm9, 160(%rax) +; SSE-NEXT: movaps %xmm12, 144(%rax) +; SSE-NEXT: movaps %xmm13, 128(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -2304,29 +2302,29 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} 
ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] @@ -2367,18 +2365,18 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm3[1],ymm13[3],ymm3[3] ; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2443,14 +2441,14 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] @@ -2458,52 +2456,52 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] 
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm13[1],ymm3[3],ymm13[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm13[1],ymm3[3],ymm13[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2567,166 +2565,166 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 1232(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps 1232(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm14[0],ymm3[2],ymm14[2] ; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2739,41 +2737,38 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i64_stride6_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX2-ONLY-NEXT: subq $1400, %rsp # imm = 0x578 ; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm9 +; 
AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm9[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm7[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2784,306 +2779,301 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm5[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm1[0],xmm3[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm13[0],xmm11[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm2[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm12[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm6[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm4[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm4 
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm13[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm13[1] -; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm14[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] +; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm11[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm8 +; AVX2-ONLY-NEXT: 
vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm6[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm5[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm7[0],xmm3[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 
928(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm9[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm14 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm15[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovaps 
352(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm9[1] ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-ONLY-NEXT: 
vmovaps %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = 
xmm1[0],xmm5[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm13[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm11[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm9[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps 
{{.*#+}} xmm1 = xmm3[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm7[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 @@ -3091,46 +3081,45 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm14[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm12[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm12[1] -; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 904(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1096(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3191,9 +3180,9 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) @@ -3215,315 +3204,308 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-ONLY-NEXT: 
vmovaps %ymm12, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rax) +; AVX2-ONLY-NEXT: addq $1400, %rsp # imm = 0x578 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride6_vf32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <0,6,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,7,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, 
%zmm11 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,0,1,7,13,0,1,7] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm6, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q 
%zmm2, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <10,0,6,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u> -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,1,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, 
%zmm25 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm15 ; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm23 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm23, %zmm4, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm23, %zmm4, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm23, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-NEXT: 
vpermt2q %zmm14, %zmm11, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm23, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm23, %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm19 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm25 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm24, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm16 
+; AVX512F-NEXT: vpermi2q %zmm24, %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm27 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm21 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; 
AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 {%k2} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm20 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} @@ -3533,39 +3515,40 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm8, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm8, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm16, %zmm4 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 +; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm6, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r8) +; AVX512F-NEXT: vmovaps %zmm7, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) @@ -3573,7 +3556,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax 
; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512F-NEXT: vzeroupper @@ -3582,300 +3565,294 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <0,6,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,0,1,7,13,0,1,7] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <10,0,6,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512BW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u> -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,1,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, 
%zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm15 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm23 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm27, 
%zmm17 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm23, %zmm4, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm23, %zmm4, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm23, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm23, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm23, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm24, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 {%k1} +; AVX512BW-NEXT: 
vmovdqa64 %zmm17, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 {%k2} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} @@ -3885,39 +3862,40 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm8, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm8, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm16, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm14, %zmm6, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512BW-NEXT: 
vmovdqa64 %zmm14, 128(%r8) +; AVX512BW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) @@ -3925,7 +3903,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper @@ -4521,7 +4499,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 1664(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1808(%rdi), %xmm0 @@ -4558,7 +4536,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2288(%rdi), %xmm0 ; SSE-NEXT: movaps 2240(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm15, %xmm1 @@ -4566,31 +4544,31 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 2384(%rdi), %xmm0 -; SSE-NEXT: movaps 2336(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 2336(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 2480(%rdi), %xmm0 -; SSE-NEXT: movaps 2432(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 2432(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 2576(%rdi), %xmm0 -; SSE-NEXT: movaps 2528(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movaps 2528(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 
= xmm11[1],xmm0[1] ; SSE-NEXT: movaps 2672(%rdi), %xmm0 -; SSE-NEXT: movaps 2624(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps 2624(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 2768(%rdi), %xmm0 ; SSE-NEXT: movaps 2720(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 2864(%rdi), %xmm0 ; SSE-NEXT: movaps 2816(%rdi), %xmm6 @@ -4866,8 +4844,8 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, 496(%r9) ; SSE-NEXT: movaps %xmm4, 480(%r9) ; SSE-NEXT: movaps %xmm7, 464(%r9) -; SSE-NEXT: movaps %xmm8, 448(%r9) -; SSE-NEXT: movaps %xmm11, 432(%r9) +; SSE-NEXT: movaps %xmm9, 448(%r9) +; SSE-NEXT: movaps %xmm10, 432(%r9) ; SSE-NEXT: movaps %xmm13, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%r9) @@ -4885,7 +4863,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 304(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r9) @@ -4926,12 +4904,12 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, 480(%rax) ; SSE-NEXT: movaps %xmm6, 464(%rax) ; SSE-NEXT: movaps %xmm5, 448(%rax) -; SSE-NEXT: movaps %xmm9, 432(%rax) -; SSE-NEXT: movaps %xmm10, 416(%rax) -; SSE-NEXT: movaps %xmm14, 400(%rax) -; SSE-NEXT: movaps %xmm12, 384(%rax) +; SSE-NEXT: movaps %xmm8, 432(%rax) +; SSE-NEXT: movaps %xmm11, 416(%rax) +; SSE-NEXT: movaps %xmm12, 400(%rax) +; SSE-NEXT: movaps %xmm14, 384(%rax) ; SSE-NEXT: movaps %xmm15, 368(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) @@ -5384,14 +5362,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 2464(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps 
{{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5424,37 +5402,37 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm10[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm14[0],ymm6[2],ymm14[2] ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm13[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5465,7 +5443,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm14[1],ymm6[3],ymm14[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5477,7 +5455,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm15[1],ymm9[3],ymm15[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5492,7 +5470,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm11[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5535,10 +5513,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5844,16 +5822,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -6015,10 +5993,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) @@ -6446,7 +6424,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 2704(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6473,9 +6451,9 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vbroadcastsd 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 1552(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm13[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm0 @@ -6491,15 +6469,15 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; 
AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm6[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm3[0],xmm13[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm3[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 @@ -6507,13 +6485,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm0[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6528,7 +6506,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm14[1] ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm3 @@ -6585,7 +6563,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm13[1] ; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] @@ -6629,10 +6607,9 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6646,7 +6623,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] ; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm2 @@ -6667,7 +6644,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6680,7 +6657,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps %xmm12, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] @@ -6785,7 +6762,6 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 2384(%rdi), %xmm12 @@ -6831,7 +6807,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload +; 
AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6919,175 +6895,175 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vbroadcastsd 2824(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 3016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 
192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 448(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 352(%rax) @@ -7117,166 +7093,169 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i64_stride6_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = 
[0,6,0,10,0,6,0,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,6,0,10,0,6,0,10] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 +; 
AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q 
%zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [13,0,1,7,13,0,1,7] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-NEXT: vpermt2q 
%zmm0, %zmm11, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> @@ -7287,13 +7266,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm30 = <10,0,6,u> ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm26 = <11,1,7,u> ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -7313,10 +7292,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 @@ -7332,10 +7311,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm1, 
%zmm4, %zmm3 @@ -7351,10 +7330,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 @@ -7370,10 +7349,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 @@ -7381,185 +7360,185 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 320(%rdi), 
%zmm26 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm25 +; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm13 ; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm31 ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm12 ; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm29 ; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm30 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm26, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm19 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; 
AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm22, 
%zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm23 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm13, %zmm31, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm13, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7572,10 +7551,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte 
Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} @@ -7589,11 +7568,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} @@ -7601,25 +7580,23 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} @@ -7678,31 +7655,30 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} @@ -7722,81 +7698,82 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded 
Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm20, 
%zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm25, 384(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, 256(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, 192(%rsi) @@ -7819,11 +7796,9 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovaps %zmm8, (%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm30, 448(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 256(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, 320(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -7836,36 +7811,35 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovaps %zmm8, 64(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 128(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, 192(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, (%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512F-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm8, 384(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 384(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512F-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 384(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -7873,166 +7847,169 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; 
AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # 
zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [13,0,1,7,13,0,1,7] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q 
%zmm5, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> @@ -8043,13 +8020,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm30 = <10,0,6,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u> +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} ymm26 = <11,1,7,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -8069,10 +8046,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 @@ -8088,10 +8065,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 @@ -8107,10 +8084,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 @@ -8126,10 +8103,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 @@ -8137,185 +8114,185 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512BW-NEXT: 
vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm25 +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm12 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm29 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm19 ; AVX512BW-NEXT: vmovdqa64 
%zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 
+; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8328,10 +8305,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} @@ -8345,11 +8322,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} @@ -8357,25 +8334,23 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} @@ -8434,31 +8409,30 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} @@ -8478,81 +8452,82 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, 
%zmm10 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 384(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 256(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) @@ -8575,11 +8550,9 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovaps %zmm8, (%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 
64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 256(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 320(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -8592,36 +8565,35 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovaps %zmm8, 64(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 192(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, (%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 384(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 864597f14e320a..a54b5fd7b1f8f3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -152,42 +152,42 @@ 
define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd 192(%rdi), %xmm4 ; SSE-NEXT: movapd 80(%rdi), %xmm3 ; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 176(%rdi), %xmm8 +; SSE-NEXT: movapd 176(%rdi), %xmm6 ; SSE-NEXT: movapd 64(%rdi), %xmm7 -; SSE-NEXT: movapd (%rdi), %xmm10 +; SSE-NEXT: movapd (%rdi), %xmm8 ; SSE-NEXT: movapd 16(%rdi), %xmm9 -; SSE-NEXT: movapd 32(%rdi), %xmm6 +; SSE-NEXT: movapd 32(%rdi), %xmm10 ; SSE-NEXT: movapd 48(%rdi), %xmm11 ; SSE-NEXT: movapd 112(%rdi), %xmm12 ; SSE-NEXT: movapd 160(%rdi), %xmm13 ; SSE-NEXT: movapd %xmm13, %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm12[0],xmm14[1] ; SSE-NEXT: movapd %xmm11, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0] -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm8[0],xmm15[1] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm7[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm6[0] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm3[0] ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm10[0],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm0[0] ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; SSE-NEXT: movapd %xmm14, 16(%rsi) ; SSE-NEXT: movapd %xmm15, (%rsi) ; SSE-NEXT: movapd %xmm12, 16(%rdx) -; SSE-NEXT: movapd %xmm10, (%rdx) -; SSE-NEXT: movapd %xmm8, 16(%rcx) +; SSE-NEXT: movapd %xmm8, (%rdx) +; SSE-NEXT: movapd %xmm6, 16(%rcx) ; SSE-NEXT: movapd %xmm7, (%rcx) ; SSE-NEXT: movapd %xmm5, 16(%r8) ; SSE-NEXT: movapd %xmm9, (%r8) ; SSE-NEXT: movapd %xmm4, 16(%r9) ; SSE-NEXT: movapd %xmm3, (%r9) ; SSE-NEXT: movapd %xmm2, 16(%r10) -; SSE-NEXT: movapd %xmm6, (%r10) +; SSE-NEXT: movapd %xmm10, (%r10) ; SSE-NEXT: movapd %xmm1, 16(%rax) ; SSE-NEXT: movapd %xmm0, (%rax) ; SSE-NEXT: retq @@ -197,47 +197,47 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm7[1] 
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[3],ymm2[2] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm1[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm5, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm7, (%r8) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm8, (%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, (%r10) +; AVX1-ONLY-NEXT: vmovapd %ymm1, (%r10) ; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -247,49 +247,49 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 
48(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = mem[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] ; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm9[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa 
%ymm7, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%r8) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r10) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r10) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -298,53 +298,53 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-SLOW: # %bb.0: ; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512-SLOW-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,7,14,u> -; AVX512-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <9,0,7,u> -; AVX512-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 -; AVX512-SLOW-NEXT: vpbroadcastq 176(%rdi), %ymm2 -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX512-SLOW-NEXT: vpbroadcastq %xmm6, %ymm2 -; AVX512-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,7,14,u> +; AVX512-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <9,0,7,u> +; AVX512-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512-SLOW-NEXT: vpbroadcastq 176(%rdi), %ymm5 +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX512-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512-SLOW-NEXT: vpbroadcastq %xmm5, %ymm6 +; AVX512-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX512-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX512-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512-SLOW-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] +; AVX512-SLOW-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX512-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512-SLOW-NEXT: vmovdqa 
128(%rdi), %ymm8 ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] -; AVX512-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 +; AVX512-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm9 ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] -; AVX512-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 +; AVX512-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm9 ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] ; AVX512-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-SLOW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] -; AVX512-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512-SLOW-NEXT: vpermi2q 192(%rdi), %zmm1, %zmm9 +; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512-SLOW-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-SLOW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512-SLOW-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512-SLOW-NEXT: vmovdqa %ymm6, (%r8) +; AVX512-SLOW-NEXT: vmovdqa %ymm4, (%rdx) +; AVX512-SLOW-NEXT: vmovdqa %ymm6, (%rcx) +; AVX512-SLOW-NEXT: vmovdqa %ymm5, (%r8) ; AVX512-SLOW-NEXT: vmovdqa %ymm7, (%r9) ; AVX512-SLOW-NEXT: vmovdqa %ymm8, (%r10) -; AVX512-SLOW-NEXT: vmovdqa %ymm3, (%rax) +; AVX512-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-SLOW-NEXT: vzeroupper ; AVX512-SLOW-NEXT: retq ; @@ -352,24 +352,24 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,7,14,u> -; AVX512-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <9,0,7,u> -; AVX512-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 -; AVX512-FAST-NEXT: vpbroadcastq 176(%rdi), %ymm2 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] -; AVX512-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm3 +; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,7,14,u> +; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <9,0,7,u> +; AVX512-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 +; AVX512-FAST-NEXT: vpbroadcastq 176(%rdi), %ymm5 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,7,0,7] +; AVX512-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512-FAST-NEXT: 
vpermi2q 160(%rdi), %ymm6, %ymm2 +; AVX512-FAST-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm5 ; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX512-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX512-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 @@ -379,26 +379,26 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] -; AVX512-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 +; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512-FAST-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] -; AVX512-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 +; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] ; AVX512-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-FAST-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] -; AVX512-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FAST-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-FAST-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512-FAST-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512-FAST-NEXT: vpermi2q 192(%rdi), %zmm0, %zmm9 +; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FAST-NEXT: vmovdqa %ymm3, (%rsi) +; AVX512-FAST-NEXT: vmovdqa %ymm4, (%rdx) +; AVX512-FAST-NEXT: vmovdqa %ymm5, (%rcx) ; AVX512-FAST-NEXT: vmovdqa %ymm7, (%r8) ; AVX512-FAST-NEXT: vmovdqa %ymm8, (%r9) ; AVX512-FAST-NEXT: vmovdqa %ymm6, (%r10) -; AVX512-FAST-NEXT: vmovdqa %ymm3, (%rax) +; AVX512-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-FAST-NEXT: vzeroupper ; AVX512-FAST-NEXT: retq %wide.vec = load <28 x i64>, ptr %in.vec, align 64 @@ -423,55 +423,55 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i64_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $88, %rsp -; SSE-NEXT: movapd 320(%rdi), %xmm1 -; SSE-NEXT: movapd 208(%rdi), %xmm0 -; SSE-NEXT: movapd 256(%rdi), %xmm3 -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 304(%rdi), %xmm5 -; SSE-NEXT: movapd 192(%rdi), %xmm4 -; SSE-NEXT: movapd 240(%rdi), %xmm7 -; SSE-NEXT: movapd 128(%rdi), %xmm6 -; SSE-NEXT: movapd 288(%rdi), %xmm9 -; SSE-NEXT: movapd 176(%rdi), %xmm8 -; SSE-NEXT: movapd 336(%rdi), %xmm10 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 272(%rdi), %xmm14 -; SSE-NEXT: movapd 112(%rdi), %xmm13 -; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm13 = 
xmm13[1],xmm8[0] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] +; SSE-NEXT: movapd 320(%rdi), %xmm3 +; SSE-NEXT: movapd 208(%rdi), %xmm2 +; SSE-NEXT: movapd 256(%rdi), %xmm5 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 304(%rdi), %xmm7 +; SSE-NEXT: movapd 192(%rdi), %xmm6 +; SSE-NEXT: movapd 240(%rdi), %xmm9 +; SSE-NEXT: movapd 128(%rdi), %xmm8 +; SSE-NEXT: movapd 288(%rdi), %xmm11 +; SSE-NEXT: movapd 176(%rdi), %xmm10 +; SSE-NEXT: movapd 336(%rdi), %xmm15 +; SSE-NEXT: movapd 224(%rdi), %xmm14 +; SSE-NEXT: movapd 272(%rdi), %xmm0 +; SSE-NEXT: movapd 112(%rdi), %xmm12 +; SSE-NEXT: movapd 160(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm9[0] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm14[0],xmm13[1] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm7[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 384(%rdi), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm15[0],xmm11[1] ; SSE-NEXT: movapd 400(%rdi), %xmm7 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm7[0] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: movapd 352(%rdi), %xmm8 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm8[0],xmm7[1] ; SSE-NEXT: movapd 416(%rdi), %xmm10 @@ -498,7 +498,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] ; SSE-NEXT: movapd %xmm3, (%rsi) ; SSE-NEXT: movapd %xmm11, 48(%rsi) -; SSE-NEXT: movapd %xmm12, 32(%rsi) +; SSE-NEXT: movapd %xmm13, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movapd %xmm2, (%rdx) @@ -506,7 +506,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm2, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movapd %xmm13, 16(%rdx) +; SSE-NEXT: movapd %xmm12, 16(%rdx) ; SSE-NEXT: movapd %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm7, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -517,7 +517,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd %xmm8, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movapd %xmm4, (%r9) ; SSE-NEXT: movapd %xmm10, 48(%r9) @@ -530,7 +530,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd %xmm15, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm6, (%rax) @@ -550,14 +550,14 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm3[3] @@ -647,7 +647,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm10 @@ -659,11 +659,11 @@ define 
void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = mem[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm5 @@ -699,7 +699,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] @@ -710,10 +710,10 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 @@ -723,17 +723,17 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-ONLY-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte 
Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-ONLY-NEXT: vmovdqa %ymm8, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm12, 32(%r8) @@ -742,7 +742,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) @@ -751,112 +751,112 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-LABEL: load_i64_stride7_vf8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <0,7,14,u> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <0,7,14,u> +; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm1 ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm9[4,5,4,5],zmm8[4,5,4,5] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,9,0,5,6,9] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: 
vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm8, %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm7, %zmm6, %zmm12 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm14 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [4,11] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,4,11] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm8, %zmm9, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <9,0,7,u> +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm8, %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm12 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm13 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] -; 
AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm11, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,12,0,5,6,12] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm11, %zmm9 +; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm13, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,14,0,0,7,14,0] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,13,4,5,6,13] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm10 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm11, %zmm9 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm11, %zmm13 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, 
%zmm5, %zmm11 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%r10) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -864,110 +864,110 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,7,14,u> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,7,14,u> +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm1 ; AVX512BW-NEXT: movb $24, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm9[4,5,4,5],zmm8[4,5,4,5] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,9,0,5,6,9] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 ; AVX512BW-NEXT: movb $-32, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm8, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm13 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm6, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,4,11] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <9,0,7,u> +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm8 +; AVX512BW-NEXT: 
vmovdqa64 %zmm9, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm13, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,13,4,5,6,13] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm10 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm11, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm11, %zmm13 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = 
ymm7[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <56 x i64>, ptr %in.vec, align 64 @@ -995,58 +995,58 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 208(%rdi), %xmm3 ; SSE-NEXT: movapd 96(%rdi), %xmm2 ; SSE-NEXT: movapd 144(%rdi), %xmm4 -; SSE-NEXT: movapd 192(%rdi), %xmm6 -; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 192(%rdi), %xmm7 +; SSE-NEXT: movapd 80(%rdi), %xmm6 ; SSE-NEXT: movapd 128(%rdi), %xmm8 -; SSE-NEXT: movapd 64(%rdi), %xmm10 -; SSE-NEXT: movapd 176(%rdi), %xmm11 +; SSE-NEXT: movapd 64(%rdi), %xmm11 +; SSE-NEXT: movapd 176(%rdi), %xmm10 ; SSE-NEXT: movapd (%rdi), %xmm12 ; SSE-NEXT: movapd 16(%rdi), %xmm9 -; SSE-NEXT: movapd 32(%rdi), %xmm7 +; SSE-NEXT: movapd 32(%rdi), %xmm5 ; SSE-NEXT: movapd 48(%rdi), %xmm0 -; SSE-NEXT: movapd 224(%rdi), %xmm13 -; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 224(%rdi), %xmm14 +; SSE-NEXT: movapd 112(%rdi), %xmm13 ; SSE-NEXT: movapd 160(%rdi), %xmm1 ; SSE-NEXT: movapd %xmm0, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm11[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm6[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm10[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm7[0] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 288(%rdi), %xmm3 +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm3[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 304(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1122,12 +1122,12 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 672(%rdi), %xmm6 +; SSE-NEXT: movapd 672(%rdi), %xmm5 ; SSE-NEXT: movapd 720(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] ; SSE-NEXT: movapd 736(%rdi), %xmm8 -; SSE-NEXT: 
shufpd {{.*#+}} xmm6 = xmm6[1],xmm8[0] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm8[0] ; SSE-NEXT: movapd 688(%rdi), %xmm10 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] ; SSE-NEXT: movapd 752(%rdi), %xmm12 @@ -1146,10 +1146,10 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] ; SSE-NEXT: movapd 800(%rdi), %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd 864(%rdi), %xmm5 -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm5[0] +; SSE-NEXT: movapd 864(%rdi), %xmm6 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm6[0] ; SSE-NEXT: movapd 816(%rdi), %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm9[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] ; SSE-NEXT: movapd 880(%rdi), %xmm11 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm11[0] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1] @@ -1167,7 +1167,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm6, 96(%rdx) +; SSE-NEXT: movapd %xmm5, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movapd %xmm0, 112(%rdx) @@ -1207,7 +1207,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm5, 112(%r9) +; SSE-NEXT: movapd %xmm6, 112(%r9) ; SSE-NEXT: movapd %xmm12, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) @@ -1257,7 +1257,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $552, %rsp # imm = 0x228 +; AVX1-ONLY-NEXT: subq $568, %rsp # imm = 0x238 ; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 @@ -1266,9 +1266,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm4 @@ -1311,186 +1310,187 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: 
vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = 
ymm5[1],ymm12[0],ymm5[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm13[0],ymm4[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1],ymm4[0],ymm5[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm11[0],ymm2[2],ymm11[2] ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm8[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],xmm12[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm2[0],ymm6[2],ymm2[2] 
+; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm6[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm3[0],xmm13[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[0],ymm6[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm11[0],ymm6[3],ymm11[2] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[0],ymm7[3],ymm3[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1],ymm0[2,3] +; 
AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm1[0],ymm7[3],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm13[0],ymm0[0],ymm13[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) 
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r9) +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm10, (%r9) ; 
AVX1-ONLY-NEXT: vmovapd %ymm9, 64(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm12, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm10, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 +; AVX1-ONLY-NEXT: vmovapd %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rax) +; AVX1-ONLY-NEXT: addq $568, %rsp # imm = 0x238 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 @@ -1518,8 +1518,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm4 @@ -1594,8 +1595,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1604,8 +1605,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm0[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] @@ -1618,16 +1618,16 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] @@ -1647,76 +1647,76 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; 
AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm7, 96(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 32(%rax) -; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 32(%rax) +; AVX2-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1724,183 +1724,183 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm24 ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm25 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm26 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm27 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,7,14,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 -; 
AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm29 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,7,14,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm28 ; AVX512F-NEXT: movb $24, %r11b ; AVX512F-NEXT: kmovw %r11d, %k2 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm27[4,5,4,5] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k2} = zmm10[4,5,4,5],zmm26[4,5,4,5] ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] ; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 ; AVX512F-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,5,6,9,0,5,6,9] ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm18, %zmm17 ; AVX512F-NEXT: movb $-32, %r11b ; AVX512F-NEXT: kmovw %r11d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} -; AVX512F-NEXT: vpermi2q %zmm24, %zmm6, %zmm7 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm25[4,5,4,5] -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} +; AVX512F-NEXT: vpermi2q %zmm25, %zmm6, %zmm8 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm24[4,5,4,5] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm18, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,4,11,4,11,4,11,4] ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512F-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm22, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <9,0,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm20, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm21, %zmm22 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm17 = <9,0,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm17, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm20, %zmm18 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm22, %zmm19 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm24, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512F-NEXT: vpermt2q %zmm24, %zmm21, %zmm19 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm25, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} ; AVX512F-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512F-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,11,0,5,6,11] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,5,6,11,0,5,6,11] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm22, %zmm19 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [14,0,0,7,14,0,0,7] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm23, %zmm18 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm21, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm25, %zmm2, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm22, %zmm20 +; AVX512F-NEXT: vpermi2q %zmm24, %zmm2, %zmm23 ; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm19 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm22, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,12,0,5,6,12] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm21, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,5,6,12,0,5,6,12] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm22, %zmm4 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [7,0,9,0,7,0,9,0] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm20 ; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm20, %zmm20 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm21, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm25, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm22, %zmm21 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm24, %zmm23 ; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm4 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm5 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm5 +; AVX512F-NEXT: 
vinserti32x4 $0, %xmm4, %zmm23, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,7,14,0,0,7,14,0] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm22, %zmm21 ; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm4 -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-NEXT: vpermi2q %zmm24, %zmm6, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm23 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [9,0,7,0,9,0,7,0] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [4,11] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm21, %zmm21 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm22, %zmm5 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm14 +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-NEXT: vpermi2q %zmm25, %zmm6, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm29 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [9,0,7,0,9,0,7,0] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm12 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm22, %zmm14 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = ymm13[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm31 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm30, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm5 -; AVX512F-NEXT: vpalignr 
{{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpermi2q %zmm24, %zmm6, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm22, %zmm5 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm7 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm7 = ymm12[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermi2q %zmm25, %zmm6, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm15, %zmm7, %zmm9 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = [6,13] +; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm12, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm13, %zmm6 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm28, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%r10) -; AVX512F-NEXT: vmovdqa64 %zmm31, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512F-NEXT: vmovdqa64 
%zmm4, (%r10) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-NEXT: vzeroupper @@ -1910,183 +1910,183 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,7,14,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,7,14,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm7, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm29 ; AVX512BW-NEXT: movb $24, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm28[4,5,4,5] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm27[4,5,4,5] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm17 ; AVX512BW-NEXT: movb $-32, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm6, %zmm7 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm26[4,5,4,5] -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm6, %zmm8 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm24[4,5,4,5] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <9,0,7,u> -; 
AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm20, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = <9,0,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm24, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm26, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm22, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [14,0,0,7,14,0,0,7] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm18 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm2, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm22, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm23 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm22, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm22, %zmm4 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [7,0,9,0,7,0,9,0] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm27, 
%zmm20 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm25, %xmm25 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm20, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm26, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm22, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm23 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm4 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm22 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,5,6,13,4,5,6,13] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm23, %zmm22 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm5 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm23, %zmm4 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm6, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm13, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm23, %zmm21 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm14 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4,5],ymm14[6,7] +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm6, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm21, %zmm30 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm23, %zmm21 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm25 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [5,12] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm31, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm6, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm23, %zmm14 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm12 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm6, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, 
%zmm12, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -2116,17 +2116,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 208(%rdi), %xmm3 ; SSE-NEXT: movapd 96(%rdi), %xmm2 ; SSE-NEXT: movapd 144(%rdi), %xmm4 -; SSE-NEXT: movapd 192(%rdi), %xmm6 -; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 192(%rdi), %xmm7 +; SSE-NEXT: movapd 80(%rdi), %xmm6 ; SSE-NEXT: movapd 128(%rdi), %xmm8 ; SSE-NEXT: movapd 64(%rdi), %xmm10 ; SSE-NEXT: movapd 176(%rdi), %xmm11 ; SSE-NEXT: movapd (%rdi), %xmm12 ; SSE-NEXT: movapd 16(%rdi), %xmm9 -; SSE-NEXT: movapd 32(%rdi), %xmm7 +; SSE-NEXT: movapd 32(%rdi), %xmm5 ; SSE-NEXT: movapd 48(%rdi), %xmm0 -; SSE-NEXT: movapd 224(%rdi), %xmm13 -; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 224(%rdi), %xmm14 +; SSE-NEXT: movapd 112(%rdi), %xmm13 ; SSE-NEXT: movapd 160(%rdi), %xmm1 ; SSE-NEXT: movapd %xmm0, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] @@ -2135,39 +2135,39 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm6[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm11[0] 
+; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm7[0] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 288(%rdi), %xmm3 +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm3[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 304(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2668,519 +2668,520 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1736, %rsp # imm = 0x6C8 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $1752, %rsp # imm = 0x6D8 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm3 
+; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm10[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 944(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1344(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm10[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3] +; 
AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovapd 944(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm15[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 1344(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd 1392(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm3[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm2[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[3],ymm7[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[3],ymm11[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} 
ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[0],ymm9[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[0],ymm15[3],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm13[0],ymm1[0],ymm13[3],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm12[0],ymm5[0],ymm12[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm14[0],ymm9[0],ymm14[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm15[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm3 -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm14[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm15[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1696(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), 
%xmm3 +; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm14[1],ymm6[0],ymm14[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm15[1],ymm4[0],ymm15[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[1],ymm15[0],ymm13[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm3[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd 
%ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm14[0],xmm5[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm9[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm15[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovupd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm3[0],ymm12[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[3],ymm3[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[0],ymm1[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovupd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[3],ymm8[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm8[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm11[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm10[0],xmm4[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm6[0],xmm15[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, 
%xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm14[0],ymm7[0],ymm14[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm12[0],ymm11[0],ymm12[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm10[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm6[0],ymm8[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm0[0],ymm13[0],ymm0[3],ymm13[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm5[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm5[0],ymm9[3],ymm5[2] ; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = 
xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm15 = xmm15[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm11 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm13[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r9) +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm12, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 160(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm14, (%rax) -; AVX1-ONLY-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, (%rax) +; AVX1-ONLY-NEXT: addq $1752, %rsp # imm = 0x6D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3189,170 +3190,170 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 ; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm2 ; 
AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm8 -; 
AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm10[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: 
vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm11[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # 
ymm1 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = 
xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq 576(%rdi), %ymm0 @@ -3361,13 +3362,13 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq 1024(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 @@ -3378,3869 +3379,1184 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = 
mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: 
vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm0[6,7] 
+; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm8[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = ymm15[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 
16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm7[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm7 +; AVX2-ONLY-NEXT: 
vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq (%rsp), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps 
%ymm7, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 192(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 160(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 128(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-ONLY-SLOW-LABEL: load_i64_stride7_vf32: -; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovaps 1024(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm30 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: movb $24, %al -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11] -; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: movb $-32, %al -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 
64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 128(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) -; 
AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm15, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2216, %rsp # imm = 0x8A8 -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq -; -; AVX512F-ONLY-FAST-LABEL: load_i64_stride7_vf32: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovaps 1024(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, 
%zmm28, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, 
%zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: movb $24, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11] -; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
64(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm15 = 
mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 -; AVX512F-ONLY-FAST-NEXT: movb $-32, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = 
mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 
$0, %ymm8, %zmm16, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm15, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $2216, %rsp # imm = 0x8A8 -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: load_i64_stride7_vf32: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovaps 1024(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: movb $24, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] -; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,11,4,11] -; AVX512DQ-SLOW-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 912(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm15, 
%xmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: movb $-32, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQ-SLOW-NEXT: 
vinserti32x4 $0, %xmm9, %zmm25, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 128(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-SLOW-NEXT: vmovaps %zmm15, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $2216, %rsp # imm = 0x8A8 -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: load_i64_stride7_vf32: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovaps 1024(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] -; AVX512DQ-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q 
%zmm15, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 -; AVX512DQ-FAST-NEXT: 
vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: movb $24, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,11,4,11] -; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 912(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 
576(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm6, %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: movb $-32, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # 
ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 192(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, (%rcx) -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 128(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-FAST-NEXT: vmovaps %zmm15, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $2216, %rsp # imm = 0x8A8 -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq -; -; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride7_vf32: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: movb $24, %al -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 -; 
AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 -; 
AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: movb $-32, %al -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 
$1, %ymm6, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, 
%ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq -; -; AVX512BW-ONLY-FAST-LABEL: load_i64_stride7_vf32: -; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
1024(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm1, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: 
vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: movb $24, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q 
%zmm4, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: movb $-32, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 
%zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = 
mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 128(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%r8) -; 
AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 -; AVX512BW-ONLY-FAST-NEXT: vzeroupper -; AVX512BW-ONLY-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: load_i64_stride7_vf32: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; 
AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm26 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, 
%zmm4, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: movb $24, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [4,11,4,11] -; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), 
%ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 -; AVX512DQBW-SLOW-NEXT: movb $-32, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm20 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm20 = 
mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512DQBW-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq +; AVX512F-LABEL: load_i64_stride7_vf32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: subq $2248, %rsp # imm = 0x8C8 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm29 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm25 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm5, 
%zmm8, %zmm16 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,7,14,0,0,7,14,0] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm26, %zmm8 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm26, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm26, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm17, %zmm31 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm26, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm26, %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 +; AVX512F-NEXT: vmovdqu64 %zmm27, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm27 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm17 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: movb $24, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,7,14,u> +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,11,4,11] +; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm0[4,5,4,5],zmm13[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm16 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm12, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm10[4,5,4,5],zmm24[4,5,4,5] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm24, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm10, %zmm24, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm0[4,5,4,5],zmm19[4,5,4,5] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm15 = 
xmm15[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm4, %zmm26 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm28 +; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm24 +; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm12, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm0 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [6,13] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti32x4 $1, %ymm6, %xmm22 +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm6 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm15, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm13 +; AVX512F-NEXT: movb $-32, %al +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm22, %zmm4, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm10 = 
mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm23 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm8 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm18, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqa64 %ymm31, %ymm9 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm30, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm29, %zmm15 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm12 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; 
AVX512F-NEXT: vmovdqa64 %zmm25, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 128(%rax) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512F-NEXT: addq $2248, %rsp # imm = 0x8C8 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512DQBW-FAST-LABEL: load_i64_stride7_vf32: -; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm2, %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: movb $24, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [4,11,4,11] -; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: 
vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 -; AVX512DQBW-FAST-NEXT: movb $-32, 
%al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm20 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 128(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rsi) -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 128(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512DQBW-FAST-NEXT: addq $2152, %rsp # imm = 0x868 -; AVX512DQBW-FAST-NEXT: vzeroupper -; AVX512DQBW-FAST-NEXT: retq +; AVX512BW-LABEL: load_i64_stride7_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: subq $2344, %rsp # imm = 0x928 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,3,10,3,10,3,10,3] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm13 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm20 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm30 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm28 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm28 +; AVX512BW-NEXT: movb $24, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,7,14,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm0[4,5,4,5],zmm25[4,5,4,5] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,11,4,11] +; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm0[4,5,4,5],zmm11[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm16 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm0[4,5,4,5],zmm24[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 
%zmm9, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm24, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm9[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm15, %zmm2, %zmm25 +; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm5, %zmm22 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm4, %zmm24 +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm11, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm21 +; 
AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm3 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm3 = ymm18[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm18[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: movb $-32, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm5 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm6 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm15, %zmm7, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm27, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm10 +; AVX512BW-NEXT: vpalignr $8, (%rsp), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm13 = ymm30[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm30[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm13 = ymm19[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm13 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm23, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 
32-byte Folded Reload +; AVX512BW-NEXT: # ymm14 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm26, %zmm14 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-NEXT: addq $2344, %rsp # imm = 0x928 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %wide.vec = load <224 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> %strided.vec1 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> @@ -7266,17 +4582,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 208(%rdi), %xmm3 ; SSE-NEXT: movapd 96(%rdi), %xmm2 ; SSE-NEXT: movapd 144(%rdi), %xmm4 -; SSE-NEXT: movapd 192(%rdi), %xmm6 -; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 192(%rdi), %xmm7 +; SSE-NEXT: movapd 80(%rdi), %xmm6 ; SSE-NEXT: movapd 128(%rdi), %xmm8 ; SSE-NEXT: movapd 176(%rdi), %xmm11 ; SSE-NEXT: movapd 64(%rdi), %xmm10 ; SSE-NEXT: movapd (%rdi), %xmm12 ; SSE-NEXT: movapd 16(%rdi), %xmm9 -; SSE-NEXT: movapd 32(%rdi), %xmm7 +; SSE-NEXT: movapd 32(%rdi), %xmm5 ; SSE-NEXT: movapd 48(%rdi), %xmm0 -; SSE-NEXT: movapd 224(%rdi), %xmm13 -; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 224(%rdi), %xmm14 +; SSE-NEXT: movapd 112(%rdi), %xmm13 ; SSE-NEXT: movapd 160(%rdi), %xmm1 ; SSE-NEXT: movapd %xmm0, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = 
xmm12[0],xmm15[1] @@ -7285,36 +4601,36 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm6[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm11[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm7[0] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7859,8 +5175,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 3024(%rdi), %xmm2 ; SSE-NEXT: movapd 3072(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd %xmm0, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] ; SSE-NEXT: movapd 3088(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7900,8 +5216,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 3248(%rdi), %xmm9 ; SSE-NEXT: movapd 3296(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] ; SSE-NEXT: movapd 3312(%rdi), %xmm15 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm15[0] ; SSE-NEXT: movapd 3264(%rdi), %xmm2 @@ -7917,12 +5233,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3360(%rdi), %xmm6 +; SSE-NEXT: movapd 3360(%rdi), %xmm7 ; SSE-NEXT: movapd 3408(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] ; SSE-NEXT: movapd 3424(%rdi), %xmm11 -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm11[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm11[0] ; SSE-NEXT: movapd 3376(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: movapd 3440(%rdi), %xmm1 @@ -7942,10 +5258,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: movapd 3536(%rdi), %xmm8 ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm8[0] -; SSE-NEXT: movapd 3488(%rdi), %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] +; SSE-NEXT: movapd 3488(%rdi), %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] ; SSE-NEXT: movapd 3552(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm0[0] ; SSE-NEXT: movapd 3504(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7956,9 +5272,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; SSE-NEXT: movapd %xmm2, 496(%rsi) ; SSE-NEXT: movapd %xmm4, 480(%rsi) -; SSE-NEXT: movapd %xmm7, 464(%rsi) +; SSE-NEXT: movapd %xmm6, 464(%rsi) ; SSE-NEXT: movapd %xmm10, 448(%rsi) -; SSE-NEXT: movapd %xmm14, 432(%rsi) +; SSE-NEXT: movapd %xmm13, 432(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8014,7 +5330,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movapd %xmm5, 496(%rdx) -; SSE-NEXT: movapd %xmm6, 480(%rdx) +; SSE-NEXT: movapd %xmm7, 480(%rdx) ; SSE-NEXT: movapd %xmm9, 464(%rdx) ; SSE-NEXT: movapd %xmm12, 448(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8134,7 +5450,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm13, 496(%r8) +; SSE-NEXT: movapd %xmm14, 496(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%r8) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -8395,235 +5711,239 @@ define void @load_i64_stride7_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $4232, %rsp # imm = 0x1088 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX1-ONLY-NEXT: subq $4264, %rsp # imm = 0x10A8 +; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 672(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 720(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2112(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2016(%rdi), %xmm2 +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2064(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 1168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2560(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2464(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2912(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2960(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 3360(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2912(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2960(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2288(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 3360(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 3136(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[3],ymm2[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[0],ymm4[3],ymm14[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = 
xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[3],ymm13[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[0],ymm6[3],ymm12[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[3],ymm11[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[0],ymm8[3],ymm10[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2336(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2240(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovapd 2288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} xmm3 = xmm10[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2784(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2688(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovapd 2736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm8[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3232(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 3136(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd 3184(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm5[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[3],ymm11[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[0],ymm7[3],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm4[0],ymm13[3],ymm4[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm7[0],ymm15[3],ymm7[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: 
vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm0[0],ymm13[0],ymm0[3],ymm13[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 3520(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[3],ymm15[2] @@ -8632,11 +5952,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8664,491 +5984,483 @@ define 
void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] ; AVX1-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3296(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 
32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovapd 2032(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovapd 3376(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovdqa 2704(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2368(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; 
AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3296(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm12[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm12 -; 
AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 2032(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: 
vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovupd %ymm9, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd 2928(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd 3376(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovdqa 2704(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 2368(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm3 -; AVX1-ONLY-NEXT: 
vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm9[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm15[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm15[1],ymm2[0],ymm15[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), 
%xmm14 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm5[0],ymm3[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[1],ymm3[0],ymm9[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3440(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm13[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm13[1],ymm5[0],ymm13[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm6[1],ymm12[0],ymm6[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm11[0],ymm7[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm14[1],ymm7[0],ymm14[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm10[0],ymm7[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = 
mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3440(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],xmm11[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm9 +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm12[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 2048(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: 
vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 2272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2592(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 2496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm14[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 2816(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2720(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; 
AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm5[0],xmm12[1] +; AVX1-ONLY-NEXT: vmovapd %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1920(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm14[0],xmm11[1] +; AVX1-ONLY-NEXT: vmovapd %xmm14, %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 3264(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 3168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd $2, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 2592(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm10[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2720(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 3168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 3488(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = 
ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 3392(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -9160,12 +6472,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[3],ymm1[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -9177,7 +6489,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[3],ymm1[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm1 @@ -9191,28 +6503,27 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), 
%ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[3],ymm15[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[3],ymm15[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 1984(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[0],ymm7[3],ymm11[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm11[0],ymm5[3],ymm11[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 2208(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] @@ -9222,7 +6533,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 2432(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[0],ymm3[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm8[0],ymm9[0],ymm8[3],ymm9[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 2656(%rdi), %ymm8 @@ -9237,7 +6548,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 2880(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: 
vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 3104(%rdi), %ymm6 @@ -9252,14 +6564,14 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 3328(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[0],ymm14[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 3552(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[0],ymm2[0],ymm14[3],ymm2[2] ; AVX1-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpalignr $8, (%rsp), %xmm1, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9509,7 +6821,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) @@ -9575,20 +6887,20 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $4232, %rsp # imm = 0x1088 +; AVX1-ONLY-NEXT: addq $4264, %rsp # imm = 0x10A8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3928, %rsp # imm = 0xF58 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-ONLY-NEXT: subq $4008, %rsp # imm = 0xFA8 +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9598,83 +6910,86 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2064(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; 
AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2912(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2960(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2064(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2960(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3360(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovdqa 3408(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 @@ -9698,173 +7013,174 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1840(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm1 +; 
AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1840(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2688(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3136(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2688(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3136(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm12[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm10[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm8[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %ymm8 @@ -9895,61 +7211,60 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 1136(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2144(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), 
%xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2592(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 2144(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 2592(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovdqa 2928(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 3488(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovdqa 3376(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 3488(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 3376(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq 3264(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] @@ -9970,12 +7285,13 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1024(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -9998,225 +7314,227 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm0 -; 
AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: 
vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, 
%ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: 
vmovdqa 1920(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 
1632(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = 
mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 2592(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2496(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2720(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqa 3040(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -10227,7 +7545,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 3168(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10245,14 +7563,14 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr 
{{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -10264,12 +7582,13 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -10281,12 +7600,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm6[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -10297,26 +7616,26 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpalignr 
{{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm8[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] @@ -10326,14 +7645,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: 
vmovdqa 2656(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %xmm7, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10341,7 +7661,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %ymm6 @@ -10356,10 +7676,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] @@ -10367,7 +7687,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -10411,8 +7731,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -10424,7 +7743,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload @@ -10434,22 +7753,22 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload @@ -10459,12 +7778,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; 
AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10626,9 +7945,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rax) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rax) @@ -10659,19 +7978,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 480(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 448(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 448(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm4, 416(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, 384(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 352(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 320(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 288(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 384(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 352(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 320(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 288(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, 256(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 224(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10682,51 +8000,51 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq 
$3928, %rsp # imm = 0xF58 +; AVX2-ONLY-NEXT: addq $4008, %rsp # imm = 0xFA8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm22 +; AVX512F-NEXT: subq $6600, %rsp # imm = 0x19C8 +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm17 +; AVX512F-NEXT: vmovaps 2688(%rdi), %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512F-NEXT: 
vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -10734,877 +8052,864 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm2 +; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte 
Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm25 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm26 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm8 -; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm6 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm31 -; AVX512F-NEXT: 
vpermi2q %zmm5, %zmm31, %zmm4 -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm7 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 
64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm13 +; AVX512F-NEXT: vpermi2q %zmm19, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm15 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm15, %zmm10 +; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa64 %ymm29, %ymm3 -; 
AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm10 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm17 # 64-byte Folded Reload +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm17 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm12 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm14 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm2 ; AVX512F-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm11, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm18, %zmm14 +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm12 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm10, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm11, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm18, %zmm14 +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm12 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm14 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte 
Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm12 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm18, %zmm12 +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm9 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm10, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm11, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm18, %zmm9 +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm8 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermi2q %zmm19, %zmm13, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm15, %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm18, %zmm11 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm7, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm19, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vpermt2q %zmm14, %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm7, %zmm25 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm17, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm26, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm17, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm26, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 
+; AVX512F-NEXT: vpermt2q %zmm23, %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm26, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm17, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm26, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm19, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm28 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm17, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm26, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm15, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm15, %zmm26 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm7, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm19, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm15, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm15, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm13 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm17 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 ; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm19 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q 
%zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm12 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm27 ; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,7,14,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm13[4,5,4,5],zmm29[4,5,4,5] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,4,11] -; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,u> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm21[4,5,4,5] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,11,4,11] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm17[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm24 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm29 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 
64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm13, %zmm10, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm14, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k1} = zmm3[4,5,4,5],zmm4[4,5,4,5] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm6[4,5,4,5] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <9,0,7,u> -; AVX512F-NEXT: 
vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm25 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: 
vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm1 +; 
AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 
{%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm22, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm13 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = 
mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm27, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm13, %zmm1, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm31, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm13 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm13, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm15 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm13, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm14 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm13, %zmm17, %zmm17 +; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm13 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm29, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm23, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm11, 384(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 320(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm12, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm16, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm29, 320(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm30, 256(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm24, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm0, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 256(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 64(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm12, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm19, 384(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512F-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11639,13 +8944,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovaps %zmm17, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) @@ -11655,51 +8959,51 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512F-NEXT: addq $6600, %rsp # imm = 0x19C8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm4 +; AVX512BW-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm17 +; AVX512BW-NEXT: vmovaps 2688(%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm10 ; 
AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -11707,19 +9011,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 @@ -11730,844 +9033,845 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; 
AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] ; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd 
{{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm4 +; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm17 # 64-byte Folded Reload +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm17 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, 
%zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm7 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm13 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm13 +; AVX512BW-NEXT: 
vpermt2q %zmm9, %zmm18, %zmm13 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm13 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm10[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm18, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; 
AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm18, %zmm8 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm7 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm12 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm20, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = 
[12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm18, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: 
vpermt2q %zmm6, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm20, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm19, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm19 -; AVX512BW-NEXT: 
vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm27, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 
%zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm30 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm26 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, 
%zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm27 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm24 
+; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,7,14,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,7,14,u> ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm31[4,5,4,5] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,11,4,11] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; 
AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm1[4,5,4,5],zmm19[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm23[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm28[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm9[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm23 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm23 # 64-byte Reload +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm29 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm11, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm28 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm2[4,5,4,5],zmm6[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <9,0,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [6,13] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm16, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm14 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpblendd 
$240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm21, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm14, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa 1408(%rdi), %ymm15 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm19 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm19, %xmm19 -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm20, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm20 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm22 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %ymm24 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm21, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm24 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm26, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm21 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %ymm28 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm28 = mem[8,9,10,11,12,13,14,15],ymm28[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm28[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm28, %xmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm28, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: 
vmovdqa64 3200(%rdi), %ymm30 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm30 = mem[8,9,10,11,12,13,14,15],ymm30[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm30[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm30, %xmm30 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm4, 448(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm10, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm10, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12604,11 
+9908,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) @@ -12618,7 +9922,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512BW-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <448 x i64>, ptr %in.vec, align 64 @@ -12646,8 +9950,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} +; AVX512BW-ONLY-FAST: {{.*}} +; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512BW-SLOW: {{.*}} +; AVX512DQ-FAST: {{.*}} +; AVX512DQ-SLOW: {{.*}} +; AVX512DQBW-FAST: {{.*}} +; AVX512DQBW-SLOW: {{.*}} ; AVX512F-FAST: {{.*}} +; AVX512F-ONLY-FAST: {{.*}} +; AVX512F-ONLY-SLOW: {{.*}} ; AVX512F-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll index 884a4abfe646c3..2ee2235bf18f00 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -170,11 +170,11 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps 240(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm9 -; SSE-NEXT: movaps 224(%rdi), %xmm11 +; SSE-NEXT: movaps 96(%rdi), %xmm8 +; SSE-NEXT: movaps 224(%rdi), %xmm9 ; SSE-NEXT: movaps 160(%rdi), %xmm0 -; SSE-NEXT: movaps 80(%rdi), %xmm14 -; SSE-NEXT: movaps 208(%rdi), %xmm15 +; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps 208(%rdi), %xmm11 ; SSE-NEXT: movaps 144(%rdi), %xmm2 ; SSE-NEXT: movaps 64(%rdi), %xmm12 ; SSE-NEXT: movaps (%rdi), %xmm7 @@ -182,28 +182,28 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm4 ; SSE-NEXT: movaps 192(%rdi), %xmm13 -; SSE-NEXT: movaps 128(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] +; SSE-NEXT: movaps 128(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm13[1] ; SSE-NEXT: movaps %xmm7, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] ; SSE-NEXT: movaps %xmm2, %xmm12 
-; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; SSE-NEXT: movaps %xmm5, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] -; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm5, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] ; SSE-NEXT: movaps 176(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, %xmm1 @@ -211,21 +211,21 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps %xmm13, (%rsi) -; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movaps %xmm15, 16(%rsi) ; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm8, 16(%rdx) -; SSE-NEXT: movaps %xmm15, (%rcx) +; SSE-NEXT: movaps %xmm14, 16(%rdx) +; SSE-NEXT: movaps %xmm11, (%rcx) ; SSE-NEXT: movaps %xmm12, 16(%rcx) ; SSE-NEXT: movaps %xmm5, (%r8) ; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movaps %xmm11, (%r9) -; SSE-NEXT: movaps %xmm14, 16(%r9) +; SSE-NEXT: movaps %xmm9, (%r9) +; SSE-NEXT: movaps %xmm10, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, (%rax) +; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm6, 16(%rax) @@ -239,31 +239,31 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm7[1] -; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm10[1],xmm8[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm10[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm3[0] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm12 @@ -274,15 +274,15 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm9, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm13, (%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm10, 16(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r11) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r11) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%r11) ; AVX1-ONLY-NEXT: vmovaps %ymm11, (%r10) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vzeroupper @@ -341,58 +341,58 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512-NEXT: vmovaps (%rdi), %xmm0 -; AVX512-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX512-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 -; AVX512-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX512-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 
(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512-NEXT: vmovaps (%rdi), %xmm4 +; AVX512-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX512-NEXT: vinsertf128 $1, 192(%rdi), %ymm5, %ymm5 +; AVX512-NEXT: vinsertf128 $1, 128(%rdi), %ymm4, %ymm4 +; AVX512-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX512-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX512-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX512-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] ; AVX512-NEXT: vmovaps 64(%rdi), %ymm9 ; AVX512-NEXT: vmovaps (%rdi), %ymm10 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] -; AVX512-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 +; AVX512-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] +; AVX512-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX512-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] -; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] ; AVX512-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm9 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] -; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] ; AVX512-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] -; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] ; AVX512-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] -; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vmovaps %ymm7, (%rsi) -; AVX512-NEXT: vmovaps %ymm8, (%rdx) -; AVX512-NEXT: vmovaps %ymm0, (%rcx) -; AVX512-NEXT: vmovaps %ymm1, (%r8) -; AVX512-NEXT: vmovdqa %ymm2, (%r9) +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm11 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15] +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vmovaps %ymm6, (%rsi) +; AVX512-NEXT: vmovaps %ymm4, (%rdx) +; AVX512-NEXT: vmovaps %ymm8, (%rcx) +; AVX512-NEXT: vmovaps %ymm5, (%r8) +; AVX512-NEXT: vmovdqa %ymm7, (%r9) ; AVX512-NEXT: 
vmovdqa %ymm9, (%r11) ; AVX512-NEXT: vmovdqa %ymm10, (%r10) -; AVX512-NEXT: vmovdqa %ymm3, (%rax) +; AVX512-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <32 x i64>, ptr %in.vec, align 64 @@ -475,41 +475,41 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 224(%rdi), %xmm0 -; SSE-NEXT: movaps 160(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps 160(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps 288(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 48(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 176(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps 304(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 432(%rdi), %xmm1 @@ -547,27 +547,27 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm12, 32(%r9) -; SSE-NEXT: movaps %xmm11, 48(%r9) +; SSE-NEXT: movaps %xmm10, 32(%r9) +; 
SSE-NEXT: movaps %xmm7, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm8, 48(%rax) -; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps %xmm9, 16(%rax) -; SSE-NEXT: movaps %xmm14, (%rax) +; SSE-NEXT: movaps %xmm6, 48(%rax) +; SSE-NEXT: movaps %xmm8, 32(%rax) +; SSE-NEXT: movaps %xmm13, 16(%rax) +; SSE-NEXT: movaps %xmm12, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 48(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm6, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movaps %xmm5, 32(%rax) +; SSE-NEXT: movaps %xmm9, 16(%rax) +; SSE-NEXT: movaps %xmm14, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 48(%rax) ; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: movaps %xmm4, 16(%rax) +; SSE-NEXT: movaps %xmm11, (%rax) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; @@ -603,19 +603,19 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm9[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm13[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] @@ -713,13 +713,13 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm12 +; 
AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 @@ -736,39 +736,39 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = 
ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm9, %ymm10 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm9, %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm3[0],ymm15[2],ymm3[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 @@ -777,9 +777,9 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm3[1],ymm15[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] @@ -798,11 +798,11 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovaps %ymm8, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm14, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax) @@ -819,28 +819,28 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm7 -; 
AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm14 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] @@ -848,81 +848,81 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm17 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm9 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 = 
zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm3[0],zmm6[0],zmm3[2],zmm6[2],zmm3[4],zmm6[4],zmm3[6],zmm6[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm6[1],zmm3[3],zmm6[3],zmm3[5],zmm6[5],zmm3[7],zmm6[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm17 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7] -; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 {%k1} +; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 ; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] ; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm11, %ymm11 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] +; 
AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm14, %ymm14 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm10, %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm15 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm17, %zmm8 +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%r10) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdi) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rdi) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -933,28 +933,28 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm11 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; 
AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] @@ -962,81 +962,81 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm17 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm3[0],zmm6[0],zmm3[2],zmm6[2],zmm3[4],zmm6[4],zmm3[6],zmm6[6] +; AVX512BW-NEXT: vpunpckhqdq 
{{.*#+}} zmm16 = zmm3[1],zmm6[1],zmm3[3],zmm6[3],zmm3[5],zmm6[5],zmm3[7],zmm6[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm17 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm14, %ymm14 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm10, %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] 
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm15 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm17, %zmm8 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm2 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdi) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1173,7 +1173,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 912(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm0 @@ -1201,7 +1201,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 416(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 608(%rdi), %xmm0 @@ -1212,24 +1212,22 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps 672(%rdi), %xmm1 -; SSE-NEXT: 
movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 672(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 864(%rdi), %xmm0 -; SSE-NEXT: movaps 800(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 800(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 992(%rdi), %xmm0 -; SSE-NEXT: movaps 928(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps 928(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1238,31 +1236,33 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 176(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 304(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 304(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm0 -; SSE-NEXT: movaps 432(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 432(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 624(%rdi), %xmm0 -; SSE-NEXT: movaps 560(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 752(%rdi), %xmm0 -; SSE-NEXT: movaps 688(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: movaps 560(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 752(%rdi), 
%xmm0 +; SSE-NEXT: movaps 688(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm4 @@ -1309,7 +1309,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) @@ -1337,15 +1337,14 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%r9) +; SSE-NEXT: movaps %xmm14, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) @@ -1354,10 +1353,9 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, 112(%rax) -; SSE-NEXT: movaps %xmm12, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps %xmm5, 112(%rax) +; SSE-NEXT: movaps %xmm15, 96(%rax) +; SSE-NEXT: movaps %xmm11, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1371,10 +1369,12 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 112(%rax) ; SSE-NEXT: movaps %xmm4, 96(%rax) -; SSE-NEXT: movaps %xmm7, 80(%rax) -; SSE-NEXT: movaps %xmm8, 64(%rax) -; SSE-NEXT: movaps %xmm11, 48(%rax) -; SSE-NEXT: movaps %xmm15, 32(%rax) +; SSE-NEXT: movaps %xmm8, 80(%rax) +; SSE-NEXT: movaps %xmm10, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1382,11 +1382,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm2, 112(%rax) ; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm6, 80(%rax) -; SSE-NEXT: movaps %xmm5, 64(%rax) -; SSE-NEXT: 
movaps %xmm10, 48(%rax) -; SSE-NEXT: movaps %xmm13, 32(%rax) -; SSE-NEXT: movaps %xmm14, 16(%rax) +; SSE-NEXT: movaps %xmm7, 80(%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) +; SSE-NEXT: movaps %xmm12, 48(%rax) +; SSE-NEXT: movaps %xmm9, 32(%rax) +; SSE-NEXT: movaps %xmm13, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 @@ -1394,7 +1394,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 +; AVX1-ONLY-NEXT: subq $824, %rsp # imm = 0x338 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -1545,146 +1545,146 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm6[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm10[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm9[0],ymm4[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 
560(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm10[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 
# 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) -; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) -; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 +; AVX1-ONLY-NEXT: addq $824, %rsp # imm = 0x338 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1692,10 +1692,10 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $808, %rsp # imm = 0x328 ; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 ; 
AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1732,16 +1732,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm2 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 @@ -1755,26 +1755,26 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1925,445 +1925,425 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride8_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 -; AVX512F-NEXT: vmovaps 640(%rdi), %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 +; AVX512F-NEXT: subq $72, %rsp +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512F-NEXT: 
vmovdqa64 %zmm18, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm9, %zmm12 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa (%rdi), %xmm11 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm16 -; AVX512F-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21 -; AVX512F-NEXT: vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2] +; AVX512F-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm29 +; AVX512F-NEXT: vinserti32x4 $1, 128(%rdi), %ymm11, %ymm30 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm9, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm19[0],ymm8[0],ymm19[2],ymm8[2] ; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 
{{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3] -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm11[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm5 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm26[0],ymm16[2],ymm26[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm8[1],ymm19[3],ymm8[3] +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm29 -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm30, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm10[2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm24 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm16[1],ymm26[1],ymm16[3],ymm26[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm5[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm22 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpermi2q %zmm31, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm18, %zmm8, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm30 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm23, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm10 -; AVX512F-NEXT: vmovdqa64 
%zmm10, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm31, %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm8, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm15 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm31, %zmm10, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm20[0],zmm4[2],zmm20[2],zmm4[4],zmm20[4],zmm4[6],zmm20[6] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [4,12,4,12] ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm11 -; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm11, %ymm11 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm29 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm16[1],ymm21[1],ymm16[3],ymm21[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm29, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm10, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = 
zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm6, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm26 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm10, %zmm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,5,13] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm7 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm4[1],zmm20[1],zmm4[3],zmm20[3],zmm4[5],zmm20[5],zmm4[7],zmm20[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm17 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm16, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm13[1],zmm15[3],zmm13[3],zmm15[5],zmm13[5],zmm15[7],zmm13[7] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm10, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm18 {%k1} +; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm6, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm12, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm16, %zmm8 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm13, %zmm14 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15] +; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm6, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm15, 
%zmm6 +; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm15, %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm15[0],ymm16[0],ymm15[2],ymm16[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm9, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm24, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm16[1],ymm15[3],ymm16[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm23, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm7, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm17, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%rsi) +; AVX512F-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm25, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-NEXT: addq $72, %rsp ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 -; AVX512BW-NEXT: vmovaps 640(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 
384(%rdi), %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 +; AVX512BW-NEXT: subq $72, %rsp +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm12 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm11 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21 -; AVX512BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2] +; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm29 +; AVX512BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm11, %ymm30 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm9, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q 
%zmm17, %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm19[0],ymm8[0],ymm19[2],ymm8[2] ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3] -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm11[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm5 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm26[0],ymm16[2],ymm26[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm8[1],ymm19[3],ymm8[3] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] 
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm29 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm30, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm10[2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm24 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm16[1],ymm26[1],ymm16[3],ymm26[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm22 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm8, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 -; 
AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm30 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm8, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm15 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm20[0],zmm4[2],zmm20[2],zmm4[4],zmm20[4],zmm4[6],zmm20[6] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [4,12,4,12] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm11 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = 
ymm16[1],ymm21[1],ymm16[3],ymm21[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm29, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm6, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm26 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,5,13] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm7 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm4[1],zmm20[1],zmm4[3],zmm20[3],zmm4[5],zmm20[5],zmm4[7],zmm20[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm11 
+; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm17 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm13[1],zmm15[3],zmm13[3],zmm15[5],zmm13[5],zmm15[7],zmm13[7] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm6, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm8 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm14 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15] +; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm15, %zmm6 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm15, %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm15[0],ymm16[0],ymm15[2],ymm16[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm24, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm16[1],ymm15[3],ymm16[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm23, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm7, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm17, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r9) 
; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: addq $72, %rsp ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 @@ -2391,17 +2371,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE: # %bb.0: ; SSE-NEXT: subq $1688, %rsp # imm = 0x698 ; SSE-NEXT: movaps 832(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm2 +; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps 256(%rdi), %xmm8 -; SSE-NEXT: movaps 960(%rdi), %xmm1 +; SSE-NEXT: movaps 960(%rdi), %xmm2 ; SSE-NEXT: movaps 896(%rdi), %xmm10 -; SSE-NEXT: movaps 448(%rdi), %xmm4 +; SSE-NEXT: movaps 448(%rdi), %xmm3 ; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 576(%rdi), %xmm3 +; SSE-NEXT: movaps 576(%rdi), %xmm4 ; SSE-NEXT: movaps 512(%rdi), %xmm12 -; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm5 ; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 704(%rdi), %xmm5 +; SSE-NEXT: movaps 704(%rdi), %xmm6 ; SSE-NEXT: movaps 640(%rdi), %xmm14 ; SSE-NEXT: movaps 192(%rdi), %xmm7 ; SSE-NEXT: movaps 128(%rdi), %xmm13 @@ -2411,34 +2391,34 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 768(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2720,12 +2700,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2016(%rdi), %xmm0 -; SSE-NEXT: movaps 1952(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 1952(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2765,9 +2744,9 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 688(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2776,11 +2755,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm0 -; SSE-NEXT: movaps 944(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 944(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 1136(%rdi), %xmm0 ; SSE-NEXT: movaps 
1072(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, %xmm1 @@ -2788,41 +2767,42 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 1264(%rdi), %xmm0 -; SSE-NEXT: movaps 1200(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 1200(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 1392(%rdi), %xmm0 -; SSE-NEXT: movaps 1328(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 1328(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 1520(%rdi), %xmm0 -; SSE-NEXT: movaps 1456(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 1456(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 1648(%rdi), %xmm0 ; SSE-NEXT: movaps 1584(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 1776(%rdi), %xmm0 -; SSE-NEXT: movaps 1712(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps 1712(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps 1904(%rdi), %xmm0 ; SSE-NEXT: movaps 1840(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps 2032(%rdi), %xmm0 -; SSE-NEXT: movaps 1968(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 1968(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2984,8 +2964,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movaps %xmm14, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3017,12 +2996,13 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 240(%rax) +; SSE-NEXT: movaps %xmm4, 240(%rax) ; SSE-NEXT: movaps %xmm6, 224(%rax) -; SSE-NEXT: movaps %xmm8, 208(%rax) -; SSE-NEXT: movaps %xmm9, 192(%rax) -; SSE-NEXT: movaps %xmm10, 176(%rax) -; SSE-NEXT: movaps %xmm15, 160(%rax) +; SSE-NEXT: movaps %xmm7, 208(%rax) +; SSE-NEXT: movaps %xmm10, 192(%rax) +; SSE-NEXT: movaps %xmm13, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3031,7 +3011,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -3044,18 +3024,18 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 240(%rax) +; SSE-NEXT: movaps %xmm3, 240(%rax) ; SSE-NEXT: movaps %xmm1, 224(%rax) -; SSE-NEXT: movaps %xmm4, 208(%rax) +; SSE-NEXT: movaps %xmm2, 208(%rax) ; SSE-NEXT: movaps %xmm5, 192(%rax) -; SSE-NEXT: movaps %xmm7, 176(%rax) -; SSE-NEXT: movaps %xmm13, 160(%rax) -; SSE-NEXT: movaps %xmm14, 144(%rax) +; SSE-NEXT: movaps %xmm9, 176(%rax) +; SSE-NEXT: movaps %xmm12, 160(%rax) +; SSE-NEXT: movaps %xmm8, 144(%rax) ; SSE-NEXT: movaps %xmm11, 128(%rax) -; SSE-NEXT: movaps %xmm12, 112(%rax) +; SSE-NEXT: movaps %xmm15, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -3072,54 +3052,54 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: subq $2248, %rsp # imm = 0x8C8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 
640(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm7[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 @@ -3230,14 +3210,14 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] @@ -3245,52 +3225,52 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = 
ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm3[1],ymm13[3],ymm3[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm13[1],ymm3[1],ymm13[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3415,7 +3395,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -3429,7 +3409,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3450,78 +3430,78 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm7 -; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm3[1],ymm14[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3704,17 +3684,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $2216, %rsp # imm = 0x8A8 +; AVX1-ONLY-NEXT: addq $2248, %rsp # imm = 0x8C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3745,11 +3725,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm10, %ymm10 @@ -3769,7 +3749,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4240,1031 +4220,1005 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride8_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm10 
-; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512F-NEXT: subq $2408, %rsp # imm = 0x968 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm18 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm29 -; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm6 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm8 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512F-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm20 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm20[0],ymm17[2],ymm20[2] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm16 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm10 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %ymm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; 
AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm6[0],ymm10[2],ymm6[2] +; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm12[0],ymm28[0],ymm12[2],ymm28[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm22[0],ymm15[0],ymm22[2],ymm15[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm29 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm5 +; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm27 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm27[0],ymm3[2],ymm27[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm20[1],ymm17[3],ymm20[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} 
ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm9[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm12[1],ymm28[1],ymm12[3],ymm28[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm10 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm22[1],ymm15[1],ymm22[3],ymm15[3] +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm8 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm14 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm27[1],ymm3[3],ymm27[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm28 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm26[0],zmm4[2],zmm26[2],zmm4[4],zmm26[4],zmm4[6],zmm26[6] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm24[0],zmm7[0],zmm24[2],zmm7[2],zmm24[4],zmm7[4],zmm24[6],zmm7[6] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm27, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm1[1],zmm0[1],zmm1[3],zmm0[3],zmm1[5],zmm0[5],zmm1[7],zmm0[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm30[0],zmm25[0],zmm30[2],zmm25[2],zmm30[4],zmm25[4],zmm30[6],zmm25[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm25[1],zmm30[3],zmm25[3],zmm30[5],zmm25[5],zmm30[7],zmm25[7] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,5,13,5,13,5,13] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = 
zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm14 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] ; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm9, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload 
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm14 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm19[0],zmm23[0],zmm19[2],zmm23[2],zmm19[4],zmm23[4],zmm19[6],zmm23[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm19[1],zmm23[1],zmm19[3],zmm23[3],zmm19[5],zmm23[5],zmm19[7],zmm23[7] +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm7, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm23 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = 
zmm1[1],zmm17[1],zmm1[3],zmm17[3],zmm1[5],zmm17[5],zmm1[7],zmm17[7] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm18[0],zmm3[0],zmm18[2],zmm3[2],zmm18[4],zmm3[4],zmm18[6],zmm3[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm3[1],zmm18[3],zmm3[3],zmm18[5],zmm3[5],zmm18[7],zmm3[7] ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm26 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; 
AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm3, %zmm24, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm3, %zmm24, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm24 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm29[0],zmm1[0],zmm29[2],zmm1[2],zmm29[4],zmm1[4],zmm29[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm1[1],zmm29[3],zmm1[3],zmm29[5],zmm1[5],zmm29[7],zmm1[7] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm27 # 32-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512F-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm8 -; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 -; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm16 -; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm25 -; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm26 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm1 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm13, %zmm1, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm29 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: 
vmovaps %zmm4, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%r9) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 128(%rax) +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [7,15,7,15] +; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm13, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm16 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm3, %ymm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm10, %ymm10 +; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm12, %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm20, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm14, %ymm14 +; AVX512F-NEXT: vmovdqa 
1024(%rdi), %xmm15 +; AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm15, %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm19, %zmm6, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm20 +; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm20, %ymm20 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm21 +; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm21, %ymm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512F-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm12 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm24, %zmm12 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm8, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm9, 
(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512F-NEXT: vmovaps %zmm10, 64(%rax) -; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512F-NEXT: vmovaps %zmm11, (%rax) +; AVX512F-NEXT: vmovaps %zmm7, 64(%rax) +; AVX512F-NEXT: addq $2408, %rsp # imm = 0x968 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512BW-NEXT: subq $2408, %rsp # imm = 0x968 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm18 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm29 -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm6 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm8 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 
%zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm20[0],ymm17[2],ymm20[2] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm16 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm10 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm31 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 640(%rdi), 
%ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm6[0],ymm10[2],ymm6[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm12[0],ymm28[0],ymm12[2],ymm28[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm22[0],ymm15[0],ymm22[2],ymm15[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm29 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm5 +; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm27 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm27[0],ymm3[2],ymm27[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm8 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm20[1],ymm17[3],ymm20[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm9[2,3],ymm2[2,3] +; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm12[1],ymm28[1],ymm12[3],ymm28[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm10 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm22[1],ymm15[1],ymm22[3],ymm15[3] +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm8 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm27[1],ymm3[3],ymm27[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: 
vmovdqa64 %zmm22, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm28 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm26[0],zmm4[2],zmm26[2],zmm4[4],zmm26[4],zmm4[6],zmm26[6] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm24[0],zmm7[0],zmm24[2],zmm7[2],zmm24[4],zmm7[4],zmm24[6],zmm7[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; 
AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm1[1],zmm0[1],zmm1[3],zmm0[3],zmm1[5],zmm0[5],zmm1[7],zmm0[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm30[0],zmm25[0],zmm30[2],zmm25[2],zmm30[4],zmm25[4],zmm30[6],zmm25[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm25[1],zmm30[3],zmm25[3],zmm30[5],zmm25[5],zmm30[7],zmm25[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; 
AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm14 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm9, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm19[0],zmm23[0],zmm19[2],zmm23[2],zmm19[4],zmm23[4],zmm19[6],zmm23[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm19[1],zmm23[1],zmm19[3],zmm23[3],zmm19[5],zmm23[5],zmm19[7],zmm23[7] +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-NEXT: 
vpermt2q %zmm17, %zmm11, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm23 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm1[1],zmm17[1],zmm1[3],zmm17[3],zmm1[5],zmm17[5],zmm1[7],zmm17[7] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm18[0],zmm3[0],zmm18[2],zmm3[2],zmm18[4],zmm3[4],zmm18[6],zmm3[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm3[1],zmm18[3],zmm3[3],zmm18[5],zmm3[5],zmm18[7],zmm3[7] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm26 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, 
%zmm15 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm24, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm29[0],zmm1[0],zmm29[2],zmm1[2],zmm29[4],zmm1[4],zmm29[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm1[1],zmm29[3],zmm1[3],zmm29[5],zmm1[5],zmm29[7],zmm1[7] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm27 # 
32-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm8 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm25 -; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm26 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm1 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm1, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm29 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [7,15,7,15] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%r9) +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm16 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm3, %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm10, %ymm10 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm12, %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm20, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm14, %ymm14 +; AVX512BW-NEXT: 
vmovdqa 1024(%rdi), %xmm15 +; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm15, %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm6, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm20 +; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm20, %ymm20 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm21 +; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm21, %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm12 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm24, %zmm12 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm15 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm15 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512BW-NEXT: 
vmovdqa64 %zmm13, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512BW-NEXT: vmovaps %zmm10, 64(%rax) -; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-NEXT: vmovaps %zmm11, (%rax) +; AVX512BW-NEXT: vmovaps %zmm7, 64(%rax) +; AVX512BW-NEXT: addq $2408, %rsp # imm = 0x968 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 @@ -6129,17 +6083,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 3184(%rdi), %xmm0 -; SSE-NEXT: movaps 3120(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 
3120(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 3312(%rdi), %xmm0 -; SSE-NEXT: movaps 3248(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps 3248(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 3440(%rdi), %xmm0 ; SSE-NEXT: movaps 3376(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm1 @@ -6147,16 +6101,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 3568(%rdi), %xmm0 -; SSE-NEXT: movaps 3504(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movaps 3504(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 4016(%rdi), %xmm4 -; SSE-NEXT: movaps 3952(%rdi), %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 4016(%rdi), %xmm3 +; SSE-NEXT: movaps 3952(%rdi), %xmm4 ; SSE-NEXT: movaps 3696(%rdi), %xmm0 ; SSE-NEXT: movaps 3632(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movaps 4080(%rdi), %xmm1 ; SSE-NEXT: movaps 3888(%rdi), %xmm5 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] @@ -6169,11 +6123,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps %xmm6, 480(%rsi) ; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 464(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6556,10 +6510,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 496(%rax) +; SSE-NEXT: movaps %xmm4, 496(%rax) ; SSE-NEXT: movaps %xmm6, 480(%rax) ; SSE-NEXT: movaps %xmm7, 464(%rax) -; SSE-NEXT: movaps %xmm10, 448(%rax) +; SSE-NEXT: movaps %xmm8, 448(%rax) ; SSE-NEXT: movaps %xmm13, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 416(%rax) @@ -6616,14 +6570,14 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 496(%rax) +; SSE-NEXT: movaps %xmm3, 496(%rax) ; SSE-NEXT: movaps %xmm5, 480(%rax) ; SSE-NEXT: movaps %xmm0, 464(%rax) ; SSE-NEXT: movaps %xmm2, 448(%rax) -; SSE-NEXT: movaps %xmm8, 432(%rax) +; SSE-NEXT: movaps %xmm10, 432(%rax) ; SSE-NEXT: movaps %xmm12, 416(%rax) -; SSE-NEXT: movaps %xmm9, 400(%rax) -; SSE-NEXT: movaps %xmm15, 384(%rax) +; SSE-NEXT: movaps %xmm15, 400(%rax) +; SSE-NEXT: movaps %xmm9, 384(%rax) ; SSE-NEXT: movaps %xmm11, 368(%rax) ; SSE-NEXT: movaps %xmm14, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6675,7 +6629,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $5016, %rsp # imm = 0x1398 +; AVX1-ONLY-NEXT: subq $5096, %rsp # imm = 0x13E8 ; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -6886,7 +6840,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -6946,7 +6900,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 3344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -6964,192 +6918,192 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; 
AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm7[0] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
xmm1 = xmm4[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7213,7 +7167,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7243,7 +7197,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7932,7 +7886,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; 
AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rax) @@ -7942,7 +7896,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) @@ -8021,7 +7975,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $5016, %rsp # imm = 0x1398 +; AVX1-ONLY-NEXT: addq $5096, %rsp # imm = 0x13E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -9124,338 +9078,345 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i64_stride8_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), 
%zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm10 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa64 3264(%rdi), %ymm23 ; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] ; AVX512F-NEXT: vmovdqa 3136(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm30 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm18 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %ymm20 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm20[0],ymm17[2],ymm20[2] +; AVX512F-NEXT: vmovdqa 1600(%rdi), %ymm9 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm19[0],ymm9[0],ymm19[2],ymm9[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 1216(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm24 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm28[0],ymm26[2],ymm28[2] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm21 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm22 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512F-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vmovdqa 2688(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512F-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm25 ; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm18 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm30 ; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %ymm28 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] +; AVX512F-NEXT: vmovdqa 2240(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512F-NEXT: vmovdqa64 2112(%rdi), %ymm27 -; AVX512F-NEXT: vmovdqa 2048(%rdi), %ymm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-NEXT: vmovdqa 2048(%rdi), %ymm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm27[0],ymm5[2],ymm27[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 4032(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 4032(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 3776(%rdi), %ymm12 -; AVX512F-NEXT: vmovdqa 3712(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] -; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm4 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3840(%rdi), %zmm4 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512F-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm2 ; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm31, %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm23[1],ymm0[3],ymm23[3] +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm10 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 +; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm17[1],ymm20[1],ymm17[3],ymm20[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm9[1],ymm19[3],ymm9[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm28[1],ymm26[3],ymm28[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; 
AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm27[1],ymm5[3],ymm27[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = 
ymm9[1],ymm12[1],ymm9[3],ymm12[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm30, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm23[0],zmm11[0],zmm23[2],zmm11[2],zmm23[4],zmm11[4],zmm23[6],zmm11[6] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: 
vmovdqa64 512(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 @@ -9464,95 +9425,102 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm23[0],zmm13[0],zmm23[2],zmm13[2],zmm23[4],zmm13[4],zmm23[6],zmm13[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; 
AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm8 ; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm15[0],zmm20[0],zmm15[2],zmm20[2],zmm15[4],zmm20[4],zmm15[6],zmm20[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm22[0],zmm21[0],zmm22[2],zmm21[2],zmm22[4],zmm21[4],zmm22[6],zmm21[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm28 ; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqa64 3648(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3584(%rdi), %zmm3 @@ -9564,92 +9532,91 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm30[0],zmm1[2],zmm30[2],zmm1[4],zmm30[4],zmm1[6],zmm30[6] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512F-NEXT: # ymm1 = 
mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm31[1],zmm27[3],zmm31[3],zmm27[5],zmm31[5],zmm27[7],zmm31[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm18[1],zmm11[1],zmm18[3],zmm11[3],zmm18[5],zmm11[5],zmm18[7],zmm11[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: 
vpermt2q %zmm7, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm17 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm23[1],zmm25[1],zmm23[3],zmm25[3],zmm23[5],zmm25[5],zmm23[7],zmm25[7] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm16[1],mem[1],zmm16[3],mem[3],zmm16[5],mem[5],zmm16[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, 
%zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -9659,277 +9626,281 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm22[1],zmm4[3],zmm22[3],zmm4[5],zmm22[5],zmm4[7],zmm22[7] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm6, %zmm8, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm31[1],zmm15[1],zmm31[3],zmm15[3],zmm31[5],zmm15[5],zmm31[7],zmm15[7] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 {%k1} = zmm4[0],mem[0],zmm4[2],mem[2],zmm4[4],mem[4],zmm4[6],mem[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm4[0],zmm19[2],zmm4[2],zmm19[4],zmm4[4],zmm19[6],zmm4[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm26[0],zmm24[2],zmm26[2],zmm24[4],zmm26[4],zmm24[6],zmm26[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 
+; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm28[0],zmm20[2],zmm28[2],zmm20[4],zmm28[4],zmm20[6],zmm28[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm8, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm22[0],zmm6[0],zmm22[2],zmm6[2],zmm22[4],zmm6[4],zmm22[6],zmm6[6] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm19[1],zmm4[1],zmm19[3],zmm4[3],zmm19[5],zmm4[5],zmm19[7],zmm4[7] ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, 
%zmm14, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm14, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm13 +; 
AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm24[1],zmm26[1],zmm24[3],zmm26[3],zmm24[5],zmm26[5],zmm24[7],zmm26[7] ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm26, %zmm14, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm4[1],zmm23[3],zmm4[3],zmm23[5],zmm4[5],zmm23[7],zmm4[7] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm11, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm14, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2q 
%zmm27, %zmm14, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm28[1],zmm20[3],zmm28[3],zmm20[5],zmm28[5],zmm20[7],zmm28[7] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm11, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm11, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm26[1],zmm5[1],zmm26[3],zmm5[3],zmm26[5],zmm5[5],zmm26[7],zmm5[7] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm11, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, 
%zmm3 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm31, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm14, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm31, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm30 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm9[1],zmm29[1],zmm9[3],zmm29[3],zmm9[5],zmm29[5],zmm9[7],zmm29[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 
64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -9938,164 +9909,167 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm24 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm29 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte 
Reload ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, (%rsp), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm31, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm24 -; AVX512F-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm24 -; AVX512F-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm19 +; AVX512F-NEXT: vinserti32x4 $1, 704(%rdi), %ymm19, %ymm19 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm31 +; AVX512F-NEXT: vinserti32x4 $1, 640(%rdi), %ymm31, %ymm31 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm31[0],ymm19[0],ymm31[2],ymm19[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm23, %zmm22, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX512F-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm9 -; 
AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm3 +; AVX512F-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm3, %ymm23 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm3 +; AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm3, %ymm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm3[0],ymm23[0],ymm3[2],ymm23[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm17, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm17 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %xmm19 -; AVX512F-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm26 -; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm13 +; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm13, %ymm13 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm22 +; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm22, %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm22[0],ymm13[0],ymm22[2],ymm13[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm25, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %xmm26 +; AVX512F-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm26, %ymm26 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm27 +; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm27, %ymm27 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm17, %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm29 -; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm17 +; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm17, %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm17[0],ymm10[0],ymm17[2],ymm10[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm15 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm6 -; AVX512F-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512F-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; 
AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm4 +; AVX512F-NEXT: vinserti128 $1, 3264(%rdi), %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm16, %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm16[0],ymm4[0],ymm16[2],ymm4[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm18, %zmm12 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm7 ; AVX512F-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 -; AVX512F-NEXT: vmovdqa64 3584(%rdi), %xmm21 -; AVX512F-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqa64 3584(%rdi), %xmm30 +; AVX512F-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm30, %ymm30 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} 
ymm6 = ymm30[0],ymm7[0],ymm30[2],ymm7[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm16[1],ymm4[1],ymm16[3],ymm4[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm31[1],ymm19[1],ymm31[3],ymm19[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm22[1],ymm13[1],ymm22[3],ymm13[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm23[1],ymm3[3],ymm23[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm10[1],ymm17[3],ymm10[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm28, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm30[1],ymm7[1],ymm30[3],ymm7[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm12, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm15, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm25, 192(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 128(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rdx) ; AVX512F-NEXT: 
vmovdqa64 %zmm9, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10183,17 +10157,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%rax) ; AVX512F-NEXT: addq $6600, %rsp # imm = 0x19C8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -10201,338 +10174,345 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride8_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqu64 
%zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm10 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %ymm23 ; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] ; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm20[0],ymm17[2],ymm20[2] +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %ymm9 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm19[0],ymm9[0],ymm19[2],ymm9[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm28[0],ymm26[2],ymm28[2] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm21 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = 
ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm31 -; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512BW-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %ymm28 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] +; AVX512BW-NEXT: vmovdqa 
2240(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %ymm27 -; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm27[0],ymm5[2],ymm27[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 4032(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 4032(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 3776(%rdi), %ymm12 -; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm2 ; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm23[1],ymm0[3],ymm23[3] +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm17[1],ymm20[1],ymm17[3],ymm20[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm9[1],ymm19[3],ymm9[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm28[1],ymm26[3],ymm28[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 +; 
AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm27[1],ymm5[3],ymm27[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 ; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm23[0],zmm11[0],zmm23[2],zmm11[2],zmm23[4],zmm11[4],zmm23[6],zmm11[6] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 @@ -10541,95 +10521,102 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm23[0],zmm13[0],zmm23[2],zmm13[2],zmm23[4],zmm13[4],zmm23[6],zmm13[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm15[0],zmm20[0],zmm15[2],zmm20[2],zmm15[4],zmm20[4],zmm15[6],zmm20[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] +; 
AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm22[0],zmm21[0],zmm22[2],zmm21[2],zmm22[4],zmm21[4],zmm22[6],zmm21[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm3 @@ -10641,92 +10628,91 @@ define void 
@load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm30[0],zmm1[2],zmm30[2],zmm1[4],zmm30[4],zmm1[6],zmm30[6] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm31[1],zmm27[3],zmm31[3],zmm27[5],zmm31[5],zmm27[7],zmm31[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm18[1],zmm11[1],zmm18[3],zmm11[3],zmm18[5],zmm11[5],zmm18[7],zmm11[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm17 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm23[1],zmm25[1],zmm23[3],zmm25[3],zmm23[5],zmm25[5],zmm23[7],zmm25[7] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm16[1],mem[1],zmm16[3],mem[3],zmm16[5],mem[5],zmm16[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 
64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -10736,277 +10722,281 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: 
vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm22[1],zmm4[3],zmm22[3],zmm4[5],zmm22[5],zmm4[7],zmm22[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm8, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm31[1],zmm15[1],zmm31[3],zmm15[3],zmm31[5],zmm15[5],zmm31[7],zmm15[7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: 
vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 {%k1} = zmm4[0],mem[0],zmm4[2],mem[2],zmm4[4],mem[4],zmm4[6],mem[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm4[0],zmm19[2],zmm4[2],zmm19[4],zmm4[4],zmm19[6],zmm4[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = 
zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm26[0],zmm24[2],zmm26[2],zmm24[4],zmm26[4],zmm24[6],zmm26[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm28[0],zmm20[2],zmm28[2],zmm20[4],zmm28[4],zmm20[6],zmm28[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm8, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm22[0],zmm6[0],zmm22[2],zmm6[2],zmm22[4],zmm6[4],zmm22[6],zmm6[6] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; 
AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm19[1],zmm4[1],zmm19[3],zmm4[3],zmm19[5],zmm4[5],zmm19[7],zmm4[7] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = 
zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm24[1],zmm26[1],zmm24[3],zmm26[3],zmm24[5],zmm26[5],zmm24[7],zmm26[7] ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] -; AVX512BW-NEXT: 
vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm4[1],zmm23[3],zmm4[3],zmm23[5],zmm4[5],zmm23[7],zmm4[7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm28[1],zmm20[3],zmm28[3],zmm20[5],zmm28[5],zmm20[7],zmm28[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm26[1],zmm5[1],zmm26[3],zmm5[3],zmm26[5],zmm5[5],zmm26[7],zmm5[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm31, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm14, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm31, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm30 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm9[1],zmm29[1],zmm9[3],zmm29[3],zmm9[5],zmm29[5],zmm9[7],zmm29[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte 
Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -11015,164 +11005,167 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm24 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm29 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm31, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm24 -; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm24 -; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm19 +; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm19, %ymm19 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm31 +; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm31, %ymm31 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm31[0],ymm19[0],ymm31[2],ymm19[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm23, %zmm22, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm9 -; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm3 +; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm3, %ymm23 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm3, %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm3[0],ymm23[0],ymm3[2],ymm23[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm17, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm17 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %xmm19 -; AVX512BW-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm26 -; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm13 +; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm13, %ymm13 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm22 +; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm22, %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm22[0],ymm13[0],ymm22[2],ymm13[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm25, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %xmm26 +; AVX512BW-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm26, %ymm26 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm27 +; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm27, %ymm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm17, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm29 -; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm17, %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm17[0],ymm10[0],ymm17[2],ymm10[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm6 -; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm16, %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm16[0],ymm4[0],ymm16[2],ymm4[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm18, %zmm12 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm7 ; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 -; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %xmm21 -; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %xmm30 +; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm30, %ymm30 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm30[0],ymm7[0],ymm30[2],ymm7[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm16[1],ymm4[1],ymm16[3],ymm4[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm31[1],ymm19[1],ymm31[3],ymm19[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm22[1],ymm13[1],ymm22[3],ymm13[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, 
%zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm23[1],ymm3[3],ymm23[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm10[1],ymm17[3],ymm10[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm28, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm30[1],ymm7[1],ymm30[3],ymm7[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11260,17 +11253,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rax) ; AVX512BW-NEXT: addq $6600, %rsp # imm = 0x19C8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 06b95c94ce16d4..c666131227b3cf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -368,47 +368,47 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa 112(%rdi), %xmm7 +; SSE-NEXT: movdqa 112(%rdi), %xmm5 ; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: packuswb %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: packuswb %xmm10, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: packuswb %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm12, %xmm6 -; SSE-NEXT: psrlw $8, %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: packuswb %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pand %xmm8, %xmm11 +; SSE-NEXT: packuswb %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: packuswb %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: packuswb %xmm9, %xmm8 +; SSE-NEXT: psrlw $8, %xmm7 ; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: packuswb %xmm11, %xmm3 -; SSE-NEXT: psrlw $8, %xmm9 +; SSE-NEXT: packuswb %xmm7, %xmm3 +; SSE-NEXT: psrlw $8, %xmm6 ; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: packuswb %xmm9, %xmm2 -; SSE-NEXT: psrlw $8, %xmm7 +; SSE-NEXT: packuswb %xmm6, %xmm2 +; SSE-NEXT: psrlw $8, %xmm5 ; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: packuswb %xmm7, %xmm1 +; SSE-NEXT: packuswb %xmm5, %xmm1 ; SSE-NEXT: psrlw $8, %xmm4 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm6, 32(%rsi) -; SSE-NEXT: movdqa %xmm10, 48(%rsi) -; SSE-NEXT: movdqa %xmm8, (%rsi) -; SSE-NEXT: movdqa %xmm5, 16(%rsi) +; SSE-NEXT: movdqa %xmm8, 32(%rsi) +; SSE-NEXT: movdqa %xmm12, 48(%rsi) +; SSE-NEXT: movdqa %xmm11, (%rsi) +; SSE-NEXT: movdqa %xmm10, 16(%rsi) ; 
SSE-NEXT: movdqa %xmm0, 32(%rdx) ; SSE-NEXT: movdqa %xmm1, 48(%rdx) ; SSE-NEXT: movdqa %xmm2, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index 7d7b43a2234c2b..77fdf028a19092 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -202,13 +202,13 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i8_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pxor %xmm6, %xmm6 @@ -248,9 +248,9 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm11, %xmm8 ; SSE-NEXT: por %xmm10, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm10 ; SSE-NEXT: por %xmm11, %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] @@ -281,9 +281,9 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm9, %xmm9 ; SSE-NEXT: pandn %xmm9, %xmm2 ; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535] @@ -395,238 +395,238 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i8_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm6, %xmm14 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; 
SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pand %xmm14, %xmm10 +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm4, %xmm11 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: movdqa 80(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pand %xmm15, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] 
-; SSE-NEXT: packuswb %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm14 -; SSE-NEXT: por %xmm14, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm15, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm6, %xmm15 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm14, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: pand %xmm14, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: 
pand %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: pand %xmm13, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: packuswb %xmm0, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, 
%xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; SSE-NEXT: pand %xmm5, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm13, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: por %xmm5, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm11, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movdqa %xmm7, 16(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm12, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm15, 
(%rdx) +; SSE-NEXT: movdqa %xmm6, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride3_vf32: @@ -769,65 +769,15 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: subq $168, %rsp ; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; 
SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm2 @@ -835,347 +785,333 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 
-; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0] +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm15, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm12, %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: packuswb %xmm4, %xmm9 +; 
SSE-NEXT: movdqa 176(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: movdqa 96(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: por %xmm11, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm9 -; SSE-NEXT: movdqa 128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0] +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa 128(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: 
pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: por %xmm9, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm13[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn (%rsp), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: 
pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm15, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm10, %xmm9 -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm11, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm15[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 
= xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = 
xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm15 -; SSE-NEXT: por %xmm9, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: pand %xmm2, %xmm14 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] @@ -1183,49 +1119,109 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; 
SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; 
SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: movdqa %xmm14, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: por %xmm11, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm7 +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1234,130 +1230,131 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movdqa %xmm7, 32(%rdx) -; SSE-NEXT: movdqa %xmm13, 48(%rdx) -; SSE-NEXT: movdqa %xmm10, (%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm15, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movdqa %xmm4, 32(%rcx) -; SSE-NEXT: movdqa %xmm0, 48(%rcx) -; SSE-NEXT: movdqa %xmm9, (%rcx) -; SSE-NEXT: movdqa %xmm5, 16(%rcx) +; SSE-NEXT: movdqa %xmm6, 32(%rcx) +; SSE-NEXT: movdqa %xmm4, 48(%rcx) +; SSE-NEXT: movdqa %xmm5, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) ; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = 
[128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm9, %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm14, 
%xmm13, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpor 
%xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 32(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 16(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 16(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride3_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index df33d3b9d8fd4b..750a22ea59f856 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -84,19 +84,19 @@ define void @load_i8_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: packuswb %xmm2, %xmm2 @@ -107,14 +107,14 @@ define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movd %xmm0, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movd %xmm1, (%rsi) ; SSE-NEXT: movd %xmm2, (%rdx) ; SSE-NEXT: movd %xmm4, (%rcx) -; SSE-NEXT: movd %xmm1, (%r8) +; SSE-NEXT: movd %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i8_stride4_vf4: @@ -157,61 +157,61 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride4_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: packuswb %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm1 +; SSE-NEXT: packuswb %xmm3, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: packuswb %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] ; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movq %xmm7, (%rdx) +; SSE-NEXT: movq %xmm4, (%rdx) ; SSE-NEXT: movq %xmm1, (%rcx) ; SSE-NEXT: movq %xmm2, (%r8) ; SSE-NEXT: retq @@ -297,12 +297,12 @@ define void 
@load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: packuswb %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm3 @@ -310,97 +310,97 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: pxor %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: packuswb %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; SSE-NEXT: packuswb %xmm8, %xmm14 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] ; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: packuswb %xmm15, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm14[0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm13, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] -; 
SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: packuswb %xmm15, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm14[0,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm8[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} 
xmm2 = xmm11[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; SSE-NEXT: packuswb %xmm3, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] ; SSE-NEXT: movdqa %xmm0, (%rsi) -; SSE-NEXT: movaps %xmm10, (%rdx) +; SSE-NEXT: movaps %xmm11, (%rdx) ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps %xmm5, (%r8) ; SSE-NEXT: retq @@ -534,19 +534,19 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 80(%rdi), %xmm5 ; SSE-NEXT: movdqa 96(%rdi), %xmm15 ; SSE-NEXT: movdqa 112(%rdi), %xmm14 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm10 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 @@ -557,102 +557,101 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: packuswb %xmm0, %xmm6 ; SSE-NEXT: packuswb %xmm1, %xmm6 -; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: 
punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 
+; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: packuswb %xmm9, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm8[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: packuswb %xmm8, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm7[0,3] ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: packuswb %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: packuswb %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] ; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: packuswb %xmm1, %xmm4 @@ -663,115 +662,115 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: 
pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,3] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm7 ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[1,0,3,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: packuswb %xmm2, %xmm7 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] -; SSE-NEXT: packuswb %xmm7, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: packuswb %xmm2, %xmm10 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm11[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: packuswb %xmm2, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm10[0,3] ; SSE-NEXT: movdqa %xmm6, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps %xmm4, 16(%rdx) -; SSE-NEXT: movaps %xmm13, (%rdx) +; SSE-NEXT: movaps %xmm12, (%rdx) ; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm8, 16(%r8) +; SSE-NEXT: movaps %xmm9, (%rcx) +; SSE-NEXT: movaps %xmm7, 16(%r8) ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: addq $136, %rsp ; SSE-NEXT: retq @@ -1027,61 +1026,61 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa 128(%rdi), %xmm15 +; SSE-NEXT: subq $648, %rsp # imm = 0x288 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa 48(%rdi), %xmm12 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm13 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa 160(%rdi), %xmm13 +; SSE-NEXT: 
movdqa 176(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa 96(%rdi), %xmm2 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa 112(%rdi), %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1096,230 +1095,230 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm5 ; SSE-NEXT: packuswb %xmm1, %xmm5 ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; 
SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: packuswb %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: packuswb %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 
%xmm10, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: 
movdqa %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = 
xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: packuswb %xmm1, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[0,3] -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: packuswb %xmm1, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm0[0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm12 -; SSE-NEXT: shufps 
{{.*#+}} xmm12 = xmm12[0,3],xmm3[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm3[0,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] @@ -1339,7 +1338,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] @@ -1365,38 +1364,38 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,7,6,5,4] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm3 +; SSE-NEXT: packuswb %xmm7, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm11 +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm7[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = 
xmm7[0,3],xmm11[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] @@ -1407,111 +1406,110 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: packuswb %xmm4, %xmm11 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, (%rsp), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: packuswb %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm11[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: packuswb %xmm4, %xmm9 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; 
SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: packuswb %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm9[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: packuswb %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm8[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: packuswb %xmm9, %xmm11 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[3,1,2,3] -; SSE-NEXT: 
pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: packuswb %xmm8, %xmm10 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: packuswb %xmm12, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm11[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: packuswb %xmm11, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm10[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: packuswb %xmm11, %xmm12 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: packuswb %xmm10, %xmm11 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] @@ -1519,8 +1517,8 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; SSE-NEXT: # xmm13 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; SSE-NEXT: packuswb %xmm10, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm11[0,3] +; SSE-NEXT: packuswb %xmm11, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm12[0,3] ; SSE-NEXT: movdqa %xmm5, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, (%rsi) @@ -1529,57 +1527,57 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rsi) ; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps %xmm15, (%rdx) +; SSE-NEXT: movaps %xmm10, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm6, 48(%rcx) +; SSE-NEXT: movaps %xmm7, 48(%rcx) ; SSE-NEXT: movaps %xmm3, 32(%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps %xmm12, (%rcx) +; SSE-NEXT: movaps %xmm14, (%rcx) ; SSE-NEXT: movaps %xmm13, 48(%r8) -; SSE-NEXT: movaps %xmm8, 32(%r8) -; SSE-NEXT: movaps %xmm7, 16(%r8) -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: addq $664, %rsp # imm = 0x298 +; SSE-NEXT: movaps %xmm9, 32(%r8) +; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: addq $648, %rsp # imm = 0x288 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148 +; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] @@ -1593,14 +1591,14 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm15 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1,2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1608,17 +1606,17 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 @@ -1637,8 +1635,9 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -1651,11 +1650,11 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 @@ -1693,79 +1692,77 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm3 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: addq $328, %rsp # imm = 0x148 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1774,51 +1771,51 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: subq $168, %rsp ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm9 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm8 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm9 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm2, %ymm9 -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm1, %ymm9 -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm2, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm11 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm10 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm14 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm14 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm10 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpshufb 
%ymm13, %ymm10, %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm1, %ymm15 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm11, %ymm14 +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm2, %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm1, %ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1833,183 +1830,183 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa %xmm5, %xmm14 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm2 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm1, %ymm13 +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm6, %ymm4 -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm2, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm0 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm3 +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm3 ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm11, %ymm3 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm13 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm1, %ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm6, %ymm15 -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm1, %ymm15 +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm2, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm10, %ymm2 -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm11, %ymm1 +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm14, %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; 
AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm2, %ymm6 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm2, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm10, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm11, %ymm4 ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm14, %ymm5 -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-ONLY-NEXT: addq $168, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: 
load_i8_stride4_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX512F-NEXT: vpshufb %ymm7, %ymm3, %ymm5 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] -; AVX512F-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 -; AVX512F-NEXT: vpmovdb %zmm2, %xmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512F-NEXT: vpshufb %ymm7, %ymm5, %ymm9 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX512F-NEXT: vpshufb %ymm7, %ymm6, %ymm7 -; AVX512F-NEXT: vpermt2d %ymm9, %ymm1, %ymm7 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb %ymm6, %ymm1, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [0,4,0,4,0,4,8,12] +; AVX512F-NEXT: vpermt2d %ymm2, %ymm7, %ymm3 +; AVX512F-NEXT: vpmovdb %zmm5, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512F-NEXT: vpshufb %ymm6, %ymm2, %ymm9 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX512F-NEXT: vpermt2d %ymm9, %ymm7, %ymm6 +; AVX512F-NEXT: vpmovdb %zmm4, %xmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm8[4,5,6,7] ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-NEXT: vpshufb %ymm8, %ymm3, %ymm9 -; AVX512F-NEXT: vpshufb %ymm8, %ymm4, %ymm10 -; AVX512F-NEXT: vpermt2d %ymm9, %ymm1, %ymm10 -; AVX512F-NEXT: vpsrld $8, %zmm2, %zmm9 +; AVX512F-NEXT: vpshufb %ymm8, %ymm0, %ymm9 +; AVX512F-NEXT: vpshufb %ymm8, %ymm1, %ymm10 +; AVX512F-NEXT: vpermt2d %ymm9, %ymm7, %ymm10 +; AVX512F-NEXT: vpsrld $8, %zmm5, %zmm9 ; AVX512F-NEXT: vpmovdb %zmm9, %xmm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-NEXT: vpshufb %ymm8, %ymm5, %ymm10 -; AVX512F-NEXT: vpshufb %ymm8, %ymm6, %ymm8 -; AVX512F-NEXT: vpermt2d %ymm10, %ymm1, %ymm8 -; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm10 +; AVX512F-NEXT: vpshufb %ymm8, %ymm2, %ymm10 +; AVX512F-NEXT: vpshufb %ymm8, %ymm3, %ymm8 +; AVX512F-NEXT: vpermt2d %ymm10, %ymm7, %ymm8 +; AVX512F-NEXT: vpsrld $8, %zmm4, %zmm10 ; AVX512F-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[4,5,6,7] ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm10 -; AVX512F-NEXT: vpshufb %ymm9, %ymm4, %ymm11 -; AVX512F-NEXT: vpermt2d 
%ymm10, %ymm1, %ymm11 -; AVX512F-NEXT: vpsrld $16, %zmm2, %zmm10 +; AVX512F-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512F-NEXT: vpshufb %ymm9, %ymm1, %ymm11 +; AVX512F-NEXT: vpermt2d %ymm10, %ymm7, %ymm11 +; AVX512F-NEXT: vpsrld $16, %zmm5, %zmm10 ; AVX512F-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-NEXT: vpshufb %ymm9, %ymm5, %ymm11 -; AVX512F-NEXT: vpshufb %ymm9, %ymm6, %ymm9 -; AVX512F-NEXT: vpermt2d %ymm11, %ymm1, %ymm9 -; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm11 +; AVX512F-NEXT: vpshufb %ymm9, %ymm2, %ymm11 +; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX512F-NEXT: vpermt2d %ymm11, %ymm7, %ymm9 +; AVX512F-NEXT: vpsrld $16, %zmm4, %zmm11 ; AVX512F-NEXT: vpmovdb %zmm11, %xmm11 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7] ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX512F-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-NEXT: vpermt2d %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vpsrld $24, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-NEXT: vpshufb %ymm10, %ymm5, %ymm3 -; AVX512F-NEXT: vpshufb %ymm10, %ymm6, %ymm4 -; AVX512F-NEXT: vpermt2d %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm0 +; AVX512F-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512F-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 +; AVX512F-NEXT: vpsrld $24, %zmm5, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpshufb %ymm10, %ymm2, %ymm1 +; AVX512F-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX512F-NEXT: vpermt2d %ymm1, %ymm7, %ymm2 +; AVX512F-NEXT: vpsrld $24, %zmm4, %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm0, (%r8) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index 29e3247e1451a5..430b380190c57f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -18,26 +18,26 @@ define void @load_i8_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; 
SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: movw %ax, (%rsi) @@ -45,7 +45,7 @@ define void @load_i8_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movw %ax, (%rdx) ; SSE-NEXT: movd %xmm5, %eax ; SSE-NEXT: movw %ax, (%rcx) -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movd %xmm2, %eax ; SSE-NEXT: movw %ax, (%r8) ; SSE-NEXT: movd %xmm6, %eax ; SSE-NEXT: movw %ax, (%r9) @@ -82,25 +82,25 @@ define void @load_i8_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm7 ; SSE-NEXT: pand %xmm6, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3] @@ -110,11 +110,11 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pand %xmm7, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2] @@ -124,10 +124,10 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] @@ -208,44 +208,44 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pand 
%xmm1, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,5,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; SSE-NEXT: pand %xmm9, %xmm7 ; SSE-NEXT: pandn %xmm8, %xmm9 ; SSE-NEXT: por %xmm7, %xmm9 @@ -255,22 +255,22 @@ define 
void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: pslld $24, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: por %xmm10, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] @@ -281,7 +281,7 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm10, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm11, %xmm11 @@ -291,32 +291,32 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm3, %xmm11 ; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 ; SSE-NEXT: por %xmm11, %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] ; 
SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] ; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] @@ -330,10 +330,10 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm2, (%rdx) +; SSE-NEXT: movq %xmm6, (%rsi) +; SSE-NEXT: movq %xmm4, (%rdx) ; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm10, (%r8) +; SSE-NEXT: movq %xmm12, (%r8) ; SSE-NEXT: movq %xmm7, (%r9) ; SSE-NEXT: retq ; @@ -422,144 +422,141 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 32(%rdi), %xmm10 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 
-; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = 
xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm14 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm14, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: pand %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm12, %xmm15 -; SSE-NEXT: por %xmm13, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm0 -; SSE-NEXT: packuswb %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[3,0] -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: por %xmm14, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = 
xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3],xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm12, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm6 +; SSE-NEXT: packuswb %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm7[3,0] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm15, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = 
xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm15, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] @@ -568,21 +565,22 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: pandn %xmm0, %xmm15 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -592,106 +590,102 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb 
%xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,0] -; SSE-NEXT: por %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,6,7] +; SSE-NEXT: pandn %xmm15, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm0[2,0] +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[3,0] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: 
movdqa {{.*#+}} xmm8 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm1[1,2] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[1,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm10, (%rcx) -; SSE-NEXT: movdqa %xmm6, (%r8) +; SSE-NEXT: movdqa %xmm10, (%rdx) +; SSE-NEXT: movdqa %xmm11, (%rcx) +; SSE-NEXT: movdqa %xmm5, (%r8) ; SSE-NEXT: movaps %xmm4, (%r9) ; SSE-NEXT: retq ; @@ -821,54 +815,54 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-LABEL: load_i8_stride5_vf16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u] +; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[4,9,14],zero,zero,zero,xmm4[2,7,12,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] -; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX512F-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] +; 
AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] -; AVX512F-NEXT: vpor %xmm6, %xmm2, %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512F-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm7 +; AVX512F-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] +; AVX512F-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm7 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u] ; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u] ; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512F-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512F-NEXT: vpshufb %xmm4, %xmm7, %xmm7 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12] ; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm8 +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm8 ; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u] ; AVX512F-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512F-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512F-NEXT: vpshufb %xmm4, %xmm8, %xmm8 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13] ; AVX512F-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u] ; AVX512F-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] -; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] -; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] +; AVX512F-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm6 +; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u] +; AVX512F-NEXT: vpor 
%xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512F-NEXT: vmovdqa %xmm6, (%rsi) +; AVX512F-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512F-NEXT: vmovdqa %xmm7, (%rdx) ; AVX512F-NEXT: vmovdqa %xmm8, (%rcx) ; AVX512F-NEXT: vmovdqa %xmm1, (%r8) @@ -951,235 +945,230 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pxor %xmm7, %xmm7 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa 
%xmm13, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm14, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm13, %xmm13 +; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: por %xmm11, %xmm13 +; SSE-NEXT: movdqa 64(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm4, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa 128(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[0,1,1,3] +; SSE-NEXT: pshufhw 
{{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: packuswb %xmm14, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,0,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm14 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa 80(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: por %xmm15, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: movdqa 144(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm13, %xmm13 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE-NEXT: movdqa 
%xmm12, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: psllq $48, %xmm12 +; SSE-NEXT: packuswb %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm13, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[2,0],xmm6[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm12 +; SSE-NEXT: packuswb %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; 
SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] @@ -1187,59 +1176,126 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm6 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm14[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm11[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; 
SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm14 ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pand %xmm5, %xmm13 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm11, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm11 -; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: pand %xmm3, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,1,2,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] @@ -1247,264 +1303,187 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 
16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm8, %xmm14 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm11, %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm8, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm11[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5] -; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: pxor %xmm13, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[2,0] +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb 
%xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0] +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSE-NEXT: packuswb %xmm2, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,1] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[1,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,0,3,4,5,6,7] +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; SSE-NEXT: packuswb %xmm1, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,1] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[1,2] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: punpckhbw {{.*#+}} 
xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: packuswb %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,1] +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1513,14 +1492,14 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm12, 16(%rcx) +; SSE-NEXT: movdqa %xmm11, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm9, 16(%r8) -; SSE-NEXT: movdqa %xmm8, (%r8) -; SSE-NEXT: movaps %xmm1, 16(%r9) -; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: addq $184, %rsp +; SSE-NEXT: movdqa %xmm8, 16(%r8) +; SSE-NEXT: movdqa %xmm6, (%r8) +; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: movaps %xmm1, (%r9) +; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf32: @@ -1540,37 +1519,37 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm6, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm14, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 @@ -1579,41 +1558,41 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3,4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm8[u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u] ; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 @@ -1621,26 +1600,26 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm15 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm8[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -1651,16 +1630,16 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] ; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1906,50 +1885,50 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11] -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm7 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[1,6,11] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-NEXT: kmovd %eax, %k4 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5} -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u] -; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm5 {%k5} +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13,u,u,u] +; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[2,7,12] +; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX512BW-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7],ymm6[8,9,10,11,12],ymm5[13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5} +; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm6 {%k5} ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4} ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[3,8,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] ; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000 @@ -1963,13 +1942,13 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[4,9,14] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; 
AVX512BW-NEXT: movl $-33554432, %eax # imm = 0xFE000000 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} +; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm10 {%k3} ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -1989,7 +1968,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3} ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) -; AVX512BW-NEXT: vmovdqa %ymm8, (%rcx) +; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm10, (%r8) ; AVX512BW-NEXT: vmovdqa %ymm0, (%r9) ; AVX512BW-NEXT: vzeroupper @@ -2012,460 +1991,451 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $552, %rsp # imm = 0x228 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm12, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm0 +; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; SSE-NEXT: pand %xmm8, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: pand %xmm1, %xmm0 +; 
SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 48(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm3, 
%xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm12, %xmm7 ; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa 240(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa 288(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa 240(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm13, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm11[8],xmm15[9],xmm11[9],xmm15[10],xmm11[10],xmm15[11],xmm11[11],xmm15[12],xmm11[12],xmm15[13],xmm11[13],xmm15[14],xmm11[14],xmm15[15],xmm11[15] +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm15[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa 304(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm11[8],xmm15[9],xmm11[9],xmm15[10],xmm11[10],xmm15[11],xmm11[11],xmm15[12],xmm11[12],xmm15[13],xmm11[13],xmm15[14],xmm11[14],xmm15[15],xmm11[15] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: 
pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 96(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa 144(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] 
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; SSE-NEXT: packuswb %xmm10, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 80(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm13 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: psllq $48, %xmm3 -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,5,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,7,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm10, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm4 -; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: packuswb 
%xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: movdqa %xmm6, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm3 -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; 
SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] @@ -2476,326 +2446,328 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: psllq $48, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; 
SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pand %xmm6, %xmm1 -; 
SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; 
SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm13, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 +; 
SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm4, %xmm7 -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 
= mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: packuswb %xmm0, %xmm7 +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pand %xmm9, %xmm13 +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: 
pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; 
SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] @@ -2804,40 +2776,39 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,6,5,6,7] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -2847,279 +2818,281 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; 
SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm2[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[1,2] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por 
%xmm2, %xmm4 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm12, %xmm8 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: packuswb %xmm0, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,1] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[1,2] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm15 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] -; SSE-NEXT: pand %xmm6, %xmm15 -; SSE-NEXT: por %xmm8, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: packuswb %xmm0, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,1] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[1,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15] -; SSE-NEXT: pand %xmm6, %xmm13 -; SSE-NEXT: por %xmm5, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; SSE-NEXT: 
pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: packuswb %xmm0, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm0[1,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm9 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,1] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[1,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; 
SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,0,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rdx) +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: pandn 
%xmm1, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3128,408 +3101,416 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm9, 16(%r8) -; SSE-NEXT: movdqa %xmm10, 48(%r8) -; SSE-NEXT: movdqa %xmm11, (%r8) -; SSE-NEXT: movdqa %xmm14, 32(%r8) +; SSE-NEXT: movdqa %xmm10, 16(%r8) +; SSE-NEXT: movdqa %xmm12, 48(%r8) +; SSE-NEXT: movdqa %xmm13, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps %xmm5, 48(%r9) +; SSE-NEXT: movaps %xmm9, 48(%r9) ; SSE-NEXT: movaps %xmm8, (%r9) -; SSE-NEXT: movaps %xmm2, 32(%r9) +; SSE-NEXT: movaps %xmm4, 32(%r9) ; SSE-NEXT: addq $552, %rsp # imm = 0x228 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] +; AVX1-ONLY-NEXT: subq $472, %rsp # imm = 0x1D8 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 
%xmm4, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX1-ONLY-NEXT: 
vpshufb %xmm6, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm4, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] -; 
AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; 
AVX1-ONLY-NEXT: vorps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,7,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm12, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [9,14,0,128,128,128,128,4,9,14,0,128,128,128,128,4] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,0,0,5,10,15,128,128,128,0,0,5,10,15,128] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,0,5,10,15],zero,zero,zero,xmm9[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <1,6,11,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm3, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm15, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[2,7,12] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [10,15,0,128,128,128,0,5,10,15,0,128,128,128,0,5] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, 
%xmm5 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm12[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 
16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[2,7,12],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm4[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,0,5,10,15,0,128,128,128,0,5,10,15,0,128,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,2,7,12,128,128,128,0,0,2,7,12,128,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: 
vpshufb {{.*#+}} xmm5 = xmm12[3,8,13],zero,zero,zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm4[4,9,14] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm12, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm7[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[3,8,13],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u],zero,zero,zero,xmm14[1,6,11,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm15[4,9,14],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u],zero,zero,zero,xmm4[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[4,9,14],zero,zero,zero,zero,zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm8[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm5, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} 
xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[4,9,14],zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u],zero,zero,zero,xmm14[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[4,9,14],zero,zero,zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm3, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) @@ -3549,271 +3530,271 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: addq $472, %rsp # imm = 0x1D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride5_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $136, %rsp -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10 -; 
AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm11, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm8, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm8, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm6, %ymm11, %ymm1 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm8, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm8, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm6, %ymm11, %ymm1 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm8, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm8, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm6, %ymm11, %ymm1 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm8, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: 
vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm8, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm10, %xmm10 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm10, %ymm12, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %ymm0, %ymm15, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm5 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm11, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; 
AVX2-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm15, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm14 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm12, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm12 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm12, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> -; 
AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm15, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm0 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm10, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, 
%ymm15, %ymm2 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm7 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm14, %ymm14 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm14, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm12 -; AVX2-ONLY-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm11 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm14 -; AVX2-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm7 +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm2 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm11, %ymm6, %ymm3 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,0,1] +; 
AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm14 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1,2,3,4],ymm6[5,6,7],ymm13[8,9,10,11,12],ymm6[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm11 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm13 +; AVX2-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: 
vpshufb %xmm12, %xmm3, %xmm12 -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm15 -; AVX2-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] -; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm12 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm10 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm15 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm1, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm14, %xmm9 +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm6 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper @@ -3837,45 +3818,45 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpternlogq $202, %ymm22, 
%ymm21, %ymm7 ; AVX512F-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,6,11,16,21,26,31,20,25,30,19,24,29],zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,6,11,16,21,26,31,20,25,30,19,24,29],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm6, %ymm9 +; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm6, %ymm11 ; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm6 ; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm10 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm9 +; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm9 ; AVX512F-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,ymm10[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm9 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] ; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq $186, %ymm12, %ymm16, %ymm1 -; AVX512F-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[1,6,11] -; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512F-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm17 +; AVX512F-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11] +; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512F-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm14, %zmm17 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpternlogq $184, %zmm9, %zmm20, %zmm17 +; AVX512F-NEXT: vpternlogq $184, %zmm11, %zmm20, %zmm17 ; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512F-NEXT: vmovdqa 288(%rdi), %ymm12 -; 
AVX512F-NEXT: vmovdqa %ymm5, %ymm9 -; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm9 -; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX512F-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm11 +; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero -; AVX512F-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,4,9,14],zero,zero,zero,xmm11[2,7,12],zero,zero,zero +; AVX512F-NEXT: vpor %xmm2, %xmm11, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm17, %zmm19 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm14, %ymm15, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[3,8,13],zero,zero,zero @@ -3885,9 +3866,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm2 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm3, %xmm11, %xmm3 ; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm3 @@ -3897,21 +3878,21 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm11, %ymm2 ; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,7,12,17,22,27,16,21,26,31,20,25,30],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm1, %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[2,7,12] -; AVX512F-NEXT: 
vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpternlogq $184, %zmm2, %zmm20, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm17 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm14, %ymm15, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13] @@ -3921,13 +3902,13 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm2 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm2 -; AVX512F-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm1 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[4,9,14,u,u,u] @@ -3939,14 +3920,14 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,8,13,18,23,28,17,22,27,16,21,26,31],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm1, %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[3,8,13] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpternlogq $184, %zmm3, %zmm20, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero @@ -3956,8 +3937,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm2 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -3974,24 +3955,24 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm13[4,9,14] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm4, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %ymm15, %ymm0, %ymm12 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512F-NEXT: vpternlogq $226, %ymm15, %ymm0, %ymm14 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,3,8,13],zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero +; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm9 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm11 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm11 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] @@ -4059,15 +4040,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movl $127, %eax ; AVX512BW-NEXT: kmovd %eax, %k4 ; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} -; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11] -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm9, %xmm13, %xmm9 ; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm13 ; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm9 ; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm8 ; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2} @@ -4076,38 +4057,38 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm18 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm13 {%k3} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6} +; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k6} ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u] ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm13 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; 
AVX512BW-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm13 {%k6} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm13 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm14 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm13 ; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1} ; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] @@ -4116,8 +4097,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19 +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm13 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm19 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000 @@ -4140,8 +4121,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm10 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 @@ -4176,11 +4157,11 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] ; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm11 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm13 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm10[3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 9ec0e2f036e85c..91f8e17452db06 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -19,44 +19,44 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf2: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,3,2,3] +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; 
SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: movd %xmm2, %edi ; SSE-NEXT: movw %di, (%rsi) ; SSE-NEXT: movd %xmm5, %esi ; SSE-NEXT: movw %si, (%rdx) ; SSE-NEXT: movd %xmm6, %edx ; SSE-NEXT: movw %dx, (%rcx) -; SSE-NEXT: movd %xmm4, %ecx -; SSE-NEXT: movw %cx, (%r8) ; SSE-NEXT: movd %xmm3, %ecx +; SSE-NEXT: movw %cx, (%r8) +; SSE-NEXT: movd %xmm1, %ecx ; SSE-NEXT: movw %cx, (%r9) -; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: movd %xmm4, %ecx ; SSE-NEXT: movw %cx, (%rax) ; SSE-NEXT: retq ; @@ -97,74 +97,74 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} 
xmm8 = xmm8[2,0],xmm6[2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: andps %xmm2, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: andps %xmm3, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm5[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm6[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,0,1,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: movd %xmm0, (%rsi) -; SSE-NEXT: movd %xmm4, (%rdx) +; SSE-NEXT: movd %xmm1, (%rdx) ; SSE-NEXT: movd %xmm7, (%rcx) -; SSE-NEXT: movd %xmm6, (%r8) -; SSE-NEXT: movd %xmm1, (%r9) -; SSE-NEXT: movd %xmm2, (%rax) +; SSE-NEXT: movd %xmm8, (%r8) +; SSE-NEXT: movd %xmm2, (%r9) +; SSE-NEXT: movd %xmm3, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride6_vf4: @@ -217,13 +217,13 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm8, %xmm1 @@ -268,11 +268,11 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm9, %xmm9 ; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm12 ; SSE-NEXT: por %xmm9, %xmm12 ; SSE-NEXT: movdqa %xmm12, %xmm9 ; SSE-NEXT: pand %xmm5, %xmm9 @@ -284,13 +284,13 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm13, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm14, %xmm11 +; SSE-NEXT: por %xmm13, %xmm11 ; SSE-NEXT: movdqa %xmm12, %xmm13 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] @@ -315,36 +315,36 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm9, %xmm12 
; SSE-NEXT: pandn %xmm13, %xmm12 ; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] @@ -352,12 +352,12 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 ; SSE-NEXT: movq %xmm2, (%rsi) ; SSE-NEXT: movq %xmm1, (%rdx) -; SSE-NEXT: movq %xmm10, (%rcx) +; SSE-NEXT: movq %xmm11, (%rcx) ; SSE-NEXT: movq %xmm12, (%r8) -; SSE-NEXT: movq %xmm3, (%r9) +; SSE-NEXT: movq %xmm5, (%r9) ; SSE-NEXT: 
movq %xmm9, (%rax) ; SSE-NEXT: retq ; @@ -472,126 +472,128 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa 64(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm4, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pand %xmm4, %xmm14 -; SSE-NEXT: movdqa 80(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; 
SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm8 ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm13 +; SSE-NEXT: movdqa 80(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: packuswb %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] +; 
SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm5, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = 
xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: por %xmm6, %xmm12 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm5, %xmm11 +; SSE-NEXT: pand %xmm14, %xmm15 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -599,9 +601,9 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] @@ -609,161 +611,161 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] ; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 ; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: packuswb 
%xmm10, %xmm10 +; SSE-NEXT: pandn %xmm10, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,0,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] ; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: por %xmm2, 
%xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm2[2,3] ; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: por 
%xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movdqa %xmm12, (%rdx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm10, (%r8) +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movdqa %xmm11, (%rdx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: movdqa %xmm4, (%r8) ; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm0, (%rax) +; SSE-NEXT: movdqa %xmm3, (%rax) 
; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf16: @@ -1076,30 +1078,29 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $264, %rsp # imm = 0x108 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] @@ -1107,238 +1108,238 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: por %xmm0, 
%xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa 128(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm15 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm14 +; SSE-NEXT: por %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm2, 
%xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm12, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: packuswb %xmm15, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = 
xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: packuswb %xmm2, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 
+; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: por %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: packuswb %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: packuswb %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm4, %xmm12 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -1346,9 +1347,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] @@ 
-1362,25 +1363,25 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] @@ -1388,10 +1389,10 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] @@ -1404,555 +1405,552 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm12, 
%xmm3 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm3, 
%xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm5, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm13[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm11[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: 
por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm13, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm14, %xmm12 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: 
packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: packuswb %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = 
xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm5, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: packuswb %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm11, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: packuswb %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,3] +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: por %xmm5, %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm12 +; SSE-NEXT: packuswb %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm9, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm9[2,3] +; SSE-NEXT: psrlq $48, %xmm9 ; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 
= xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movdqa %xmm15, (%r8) -; SSE-NEXT: movdqa %xmm3, 16(%r9) -; SSE-NEXT: movdqa %xmm8, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%r8) +; SSE-NEXT: movdqa %xmm1, (%r8) +; SSE-NEXT: movdqa %xmm2, 16(%r9) +; SSE-NEXT: movdqa %xmm3, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: addq $280, %rsp # imm = 0x118 +; SSE-NEXT: movdqa %xmm5, (%rax) +; SSE-NEXT: addq $264, %rsp # imm = 0x108 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $168, %rsp +; AVX1-ONLY-NEXT: subq $152, %rsp ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} 
xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,11] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[2,8,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm3, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm8[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[3,9,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm10, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[0,6,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[1,7,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm8[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm5[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[2,8,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm4, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm11 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm14[0],xmm11[0] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm5[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[3,9,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm11, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[0,6,12] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm0, %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[1,7,13] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[4,10],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm5[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[5,11],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm5[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = 
xmm13[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm14, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm1, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm5[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm4[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm14[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5,11,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm6, %ymm7 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm5[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm11 # 32-byte Folded Reload -; 
AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm11, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm4[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm11, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm11[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, 
%xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm8[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: addq $168, %rsp +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $152, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1963,79 +1961,79 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm11 -; 
AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm8 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm10 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm0 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm10, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[2,8,14,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> -; 
AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm7, %ymm6, %ymm13 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm14, %ymm4, %ymm8 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm10, %ymm14, %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm15, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[3,9,15,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[0,6,12] +; AVX2-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] -; AVX2-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm14, %ymm9 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm13, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm14, %ymm4 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm11 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero -; AVX2-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm14, %ymm9, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm13[u,u,u,u,u],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[1,7,13] +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm4, %ymm15, %ymm9 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm8, %ymm5, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm15, %ymm4 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[2,8,14] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero +; AVX2-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm6, %ymm12, %ymm6 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm7, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm13[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm7, %ymm12, %ymm7 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[3,9,15] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm3 @@ -2045,19 
+2043,19 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-ONLY-NEXT: vzeroupper @@ -2074,87 +2072,87 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vmovdqa %ymm0, %ymm7 ; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] +; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm4, %xmm5, %xmm5 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] ; AVX512F-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vmovdqa %ymm8, %ymm10 +; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm1, %ymm10 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11 +; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm5, %ymm11 ; AVX512F-NEXT: 
vmovdqa 160(%rdi), %ymm13 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm14 ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14 ; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero -; AVX512F-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7],ymm11[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 %ymm2, %ymm18 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm9 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero ; AVX512F-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm8, %ymm10 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7],ymm9[8,9,10],ymm7[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm9, %ymm10 ; AVX512F-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10 ; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm5, %ymm9 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm4, %ymm8 +; 
AVX512F-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm12, %ymm14 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm12 ; AVX512F-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] -; AVX512F-NEXT: vpor %xmm4, %xmm15, %xmm4 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] +; AVX512F-NEXT: vpor %xmm5, %xmm15, %xmm5 +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 +; AVX512F-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm5 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm8 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] ; AVX512F-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpternlogq $184, %ymm9, %ymm15, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8 -; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero -; AVX512F-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-NEXT: vpternlogq $184, %ymm8, %ymm15, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm9 +; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm6 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero +; AVX512F-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm10, %xmm11, %xmm10 -; 
AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm4[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpternlogq $226, %ymm8, %ymm15, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero ; AVX512F-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] @@ -2166,9 +2164,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0 ; AVX512F-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512F-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm4, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm5, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm2, (%r8) -; AVX512F-NEXT: vmovdqa %ymm5, (%r9) +; AVX512F-NEXT: vmovdqa %ymm4, (%r9) ; AVX512F-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2181,15 +2179,15 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 +; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm7 ; AVX512BW-NEXT: movw $-28124, %r10w # imm = 0x9224 ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2} +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm7, %ymm6 {%k2} ; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm4, %ymm8 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm5, %xmm5 ; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 @@ -2204,27 +2202,27 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512BW-NEXT: vpshufb 
{{.*#+}} xmm8 = xmm8[1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero,xmm8[u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero ; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2} +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm1, %ymm8 {%k2} ; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3} -; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512BW-NEXT: vpblendmw %ymm4, %ymm0, %ymm9 {%k3} +; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k4 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm8[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm14 @@ -2233,34 +2231,34 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2} +; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm11 {%k2} ; AVX512BW-NEXT: movw $9289, %di # imm = 0x2449 ; AVX512BW-NEXT: kmovd %edi, %k5 -; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb 
{{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] -; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu16 %ymm7, %ymm1 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm8[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] +; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm8[5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3} ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] @@ -2274,9 +2272,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %ymm1, 
%ymm0 {%k2} ; AVX512BW-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512BW-NEXT: vmovdqa %ymm7, (%rcx) -; AVX512BW-NEXT: vmovdqa %ymm8, (%r8) -; AVX512BW-NEXT: vmovdqa %ymm9, (%r9) +; AVX512BW-NEXT: vmovdqa %ymm11, (%rcx) +; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) +; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2299,1273 +2297,1260 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $808, %rsp # imm = 0x328 -; SSE-NEXT: movdqa 64(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $824, %rsp # imm = 0x338 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: 
pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 336(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa 336(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, 
%xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 288(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa 288(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa 368(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 352(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 240(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm11, 
%xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa 192(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 272(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 272(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: movdqa 160(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa 96(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd 
{{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: packuswb %xmm15, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} 
xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: packuswb %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: packuswb %xmm13, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn 
%xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: packuswb %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm10, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm10, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: punpcklbw 
{{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; SSE-NEXT: packuswb %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: packuswb %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm4, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm13, %xmm4 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 
= xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: packuswb %xmm9, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: packuswb %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm9, %xmm4 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: packuswb %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte 
Reload -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand 
%xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm3, 
%xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 
= xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm12[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw 
{{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm5, %xmm4 ; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm8[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, 
%xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = 
xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: packuswb %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm7 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded 
Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: por 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[2,3] +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm11[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: packuswb %xmm13, %xmm13 +; SSE-NEXT: pand %xmm15, %xmm13 +; SSE-NEXT: por %xmm9, %xmm13 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm11, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: packuswb %xmm14, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm13 +; SSE-NEXT: por %xmm13, %xmm11 ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: pandn %xmm9, %xmm13 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: por %xmm14, %xmm9 +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: por %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm14, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: por %xmm9, %xmm13 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: packuswb %xmm10, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: packuswb %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3] -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 
= xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: por %xmm14, %xmm9 +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm12, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3] +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3598,7 +3583,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movdqa %xmm7, 16(%r9) +; SSE-NEXT: movdqa %xmm3, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3606,11 +3591,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm4, 16(%rax) -; SSE-NEXT: movdqa %xmm6, 32(%rax) -; SSE-NEXT: movdqa %xmm12, 48(%rax) -; SSE-NEXT: movdqa %xmm2, (%rax) -; SSE-NEXT: addq $808, %rsp # imm = 0x328 +; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm14, 32(%rax) +; SSE-NEXT: movdqa %xmm13, 48(%rax) +; SSE-NEXT: movdqa %xmm11, (%rax) +; SSE-NEXT: addq $824, %rsp # imm = 0x338 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf64: @@ -3618,328 +3603,316 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm1 +; 
AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm10 +; AVX1-ONLY-NEXT: 
vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm15[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm3, %xmm13, %xmm3 ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm13[0],xmm3[0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm3, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm13[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm3, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm0 
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] ; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm3, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte 
Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; 
AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; 
AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm0[3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm0[3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[0,6,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload @@ -3948,312 +3921,306 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, 
%xmm9, %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 +; 
AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm10 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm4 = 
mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[2,8,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm14[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[3,9,15],zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm9 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm6 -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, 
%ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm15[1],xmm12[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm6, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded 
Reload +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[5,11],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] -; AVX1-ONLY-NEXT: vpshufb 
%xmm8, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm6[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm12[1],xmm9[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm8, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm12 +; 
AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm10[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) ; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -4261,528 +4228,523 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i8_stride6_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm1[0,1] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm9, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm9 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = 
<128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 -; AVX2-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm0 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm11, %xmm15 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm12 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm12, %ymm4, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm10 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm2, %ymm12, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm13 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14 +; AVX2-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm0[0,1],ymm14[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm13, %ymm14, %ymm0 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm15 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = 
[1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] +; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm6, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] -; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[2,8,14,u,u,u,u,u] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm11 +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> +; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm10, %ymm9, %ymm2 +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm8, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm12, %ymm9, %ymm8 +; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,10],zero,zero,zero,xmm1[2,8,14,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm14, %ymm13, %ymm7 +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm3 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm6 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm1 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; 
AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm3, %ymm15 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm7 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm0 -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), 
%ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12 -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm11 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm12, %ymm9, %ymm12 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm9 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm14, %ymm13, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm2 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm14 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm14, %xmm2 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm4, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vpor %xmm8, %xmm13, %xmm8 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm8, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm1 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm11 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm7 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm13 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm8 +; AVX2-ONLY-NEXT: vpor %xmm13, %xmm8, %xmm8 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm13 = 
[4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm0[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm12, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm4 ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm6, %ymm5 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm5, %ymm4 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm11 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] -; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm1 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm2 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm6, %ymm2 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, 
%xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm9 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm3 +; AVX2-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm10 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm10[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm3 +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm3 +; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm5, %ymm4 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm3, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm8 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm7 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm7 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm11 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm8 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3,4,5,6,7],ymm10[8,9,10],ymm3[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm7 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; 
AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i8_stride6_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $40, %rsp ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm25 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512F-NEXT: vmovdqa %ymm11, %ymm0 -; AVX512F-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm31 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm0 +; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm31, %ymm0 ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512F-NEXT: vpor %xmm3, %xmm6, %xmm9 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm30 -; AVX512F-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm29 -; AVX512F-NEXT: vmovdqa64 160(%rdi), %ymm18 -; AVX512F-NEXT: vmovdqa %ymm11, %ymm6 -; AVX512F-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm6 -; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-NEXT: vpshufb %xmm3, %xmm7, %xmm12 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512F-NEXT: vpshufb %xmm8, %xmm6, %xmm13 -; AVX512F-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa %ymm11, %ymm9 -; AVX512F-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 -; AVX512F-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512F-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512F-NEXT: vporq %xmm1, %xmm5, %xmm17 +; AVX512F-NEXT: vpshufb %xmm7, %xmm4, %xmm5 +; AVX512F-NEXT: vpor %xmm3, %xmm5, %xmm8 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm29 +; AVX512F-NEXT: vmovdqa64 32(%rdi), %ymm30 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm11, %ymm5 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-NEXT: vpshufb %xmm12, %xmm10, %xmm13 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX512F-NEXT: vpshufb %xmm9, %xmm5, %xmm14 +; AVX512F-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX512F-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm28 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm29, %ymm8 
+; AVX512F-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX512F-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX512F-NEXT: vporq %xmm1, %xmm7, %xmm17 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm24 -; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512F-NEXT: vmovdqa %ymm11, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm22, %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512F-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm18 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm7 +; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX512F-NEXT: vpshufb %xmm12, %xmm1, %xmm12 +; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX512F-NEXT: vpor %xmm12, %xmm9, %xmm9 +; AVX512F-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm23 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512F-NEXT: vporq %xmm0, %xmm4, %xmm21 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-NEXT: vporq %xmm4, %xmm6, %xmm28 -; AVX512F-NEXT: vpshufb %xmm8, %xmm9, %xmm4 -; AVX512F-NEXT: vpshufb %xmm12, %xmm13, %xmm6 -; AVX512F-NEXT: vporq %xmm4, %xmm6, %xmm21 -; AVX512F-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512F-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512F-NEXT: vporq %xmm4, %xmm5, %xmm25 +; AVX512F-NEXT: vpshufb %xmm12, %xmm8, %xmm4 +; AVX512F-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX512F-NEXT: vporq %xmm4, %xmm2, %xmm26 +; AVX512F-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vpshufb %xmm10, %xmm7, %xmm1 ; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm27 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512F-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm15 -; AVX512F-NEXT: vpshufb %xmm0, %xmm15, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm5 -; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-NEXT: vpshufb %xmm7, %xmm5, %xmm8 -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX512F-NEXT: vpor %xmm8, 
%xmm13, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm8, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm31, %ymm15, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-NEXT: vpshufb %xmm14, %xmm5, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512F-NEXT: vpor %xmm4, %xmm10, %xmm2 ; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 -; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512F-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX512F-NEXT: vporq %xmm0, %xmm6, %xmm16 -; AVX512F-NEXT: vmovdqa %ymm11, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm24, %ymm10 -; AVX512F-NEXT: vpshufb %xmm7, %xmm10, %xmm8 -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm7 -; AVX512F-NEXT: vpshufb %xmm12, %xmm7, %xmm12 -; AVX512F-NEXT: vpor %xmm8, %xmm12, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm12, %xmm15, %xmm15 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm4, %xmm15, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] -; AVX512F-NEXT: vpor %xmm1, %xmm15, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa %ymm6, %ymm13 +; AVX512F-NEXT: vpternlogq $202, %ymm11, %ymm22, %ymm13 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-NEXT: vpshufb %xmm7, %xmm13, %xmm4 +; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX512F-NEXT: vpshufb %xmm9, %xmm2, %xmm10 +; AVX512F-NEXT: vpor %xmm4, %xmm10, %xmm3 +; AVX512F-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa %ymm8, %ymm12 +; AVX512F-NEXT: vpternlogq $202, %ymm29, %ymm30, %ymm12 +; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm10 +; AVX512F-NEXT: vpshufb %xmm14, %xmm10, %xmm4 +; AVX512F-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512F-NEXT: vporq %xmm4, %xmm0, %xmm16 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm0 +; AVX512F-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm0 +; AVX512F-NEXT: vpshufb %xmm7, %xmm0, %xmm14 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX512F-NEXT: vpor %xmm14, %xmm9, %xmm3 +; AVX512F-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[3,9,15,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[3,9,15],zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-NEXT: vpshufb %xmm5, %xmm13, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] +; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 -; AVX512F-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vmovdqa %ymm3, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm2 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm13[2,3],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $1, 96(%rdi), %ymm13, %ymm23 +; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm23, %ymm9 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm14 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 -; AVX512F-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 -; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 -; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 -; AVX512F-NEXT: vpshufb %xmm12, %xmm14, %xmm0 -; AVX512F-NEXT: vpshufb %xmm8, %xmm13, %xmm1 -; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm21 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,u,u,u,1,7,13],zero,zero,zero,xmm10[5,11],zero,zero,zero -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] -; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm28 +; AVX512F-NEXT: vpternlogq $226, %zmm28, %zmm17, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $184, %zmm14, %zmm28, %zmm13 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm14 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm26, %ymm9 +; AVX512F-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm4 +; AVX512F-NEXT: vinserti32x4 $2, %xmm21, %zmm4, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm17, %zmm9 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 +; AVX512F-NEXT: vpternlogq $184, %zmm9, %zmm28, %zmm17 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[3,9,15],zero,zero,xmm12[1,7,13],zero,zero,zero,xmm12[u,u,u,u,u] +; AVX512F-NEXT: vporq %xmm2, %xmm4, %xmm21 +; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] +; AVX512F-NEXT: vporq %xmm0, %xmm2, %xmm26 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512F-NEXT: vpternlogq $226, %ymm26, %ymm11, %ymm7 -; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm14, %xmm7, %xmm2 -; AVX512F-NEXT: vporq %xmm1, %xmm2, %xmm26 -; AVX512F-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512F-NEXT: vpternlogq $226, %ymm29, %ymm9, %ymm10 -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512F-NEXT: vporq %xmm2, %xmm4, 
%xmm27 -; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm11 -; AVX512F-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm22, %ymm9 -; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX512F-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512F-NEXT: vporq %xmm1, %xmm2, %xmm24 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm13, %xmm7, %xmm1 -; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm18 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-NEXT: vpshufb %xmm2, %xmm8, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm8 -; AVX512F-NEXT: vpor %xmm1, %xmm8, %xmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512F-NEXT: vpternlogq $226, %ymm31, %ymm6, %ymm15 +; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm4 +; AVX512F-NEXT: vpshufb %xmm2, %xmm4, %xmm0 +; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm9, %xmm15, %xmm7 +; AVX512F-NEXT: vporq %xmm0, %xmm7, %xmm25 +; AVX512F-NEXT: vpternlogq $226, %ymm22, %ymm8, %ymm11 +; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-NEXT: vpshufb %xmm0, %xmm14, %xmm7 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-NEXT: vpshufb %xmm12, %xmm11, %xmm10 +; AVX512F-NEXT: vpor %xmm7, %xmm10, %xmm10 +; AVX512F-NEXT: vpternlogq $202, %ymm29, %ymm30, %ymm6 +; AVX512F-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm1, %ymm5 +; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm8 +; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm7 +; AVX512F-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512F-NEXT: vpshufb %xmm12, %xmm8, %xmm12 +; AVX512F-NEXT: vpor %xmm0, %xmm12, %xmm12 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm4, %xmm0 +; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm15, %xmm15 +; AVX512F-NEXT: vporq %xmm0, %xmm15, %xmm18 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX512F-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX512F-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm14, %ymm5, %ymm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $236, %ymm22, %ymm3, %ymm16 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpternlogq $236, %ymm22, %ymm4, %ymm21 -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 -; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-NEXT: vpternlogq $248, %ymm22, %ymm4, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 -; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm1 -; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512F-NEXT: vpshufb %xmm5, %xmm1, %xmm5 -; AVX512F-NEXT: vpshufb %xmm14, %xmm11, %xmm14 -; AVX512F-NEXT: vpor %xmm5, %xmm14, %xmm5 -; AVX512F-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX512F-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512F-NEXT: vpternlogq $248, %ymm22, %ymm9, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm9 -; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $242, %ymm9, %ymm12, %ymm10 -; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm9 -; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm10 -; AVX512F-NEXT: vinserti32x4 $2, %xmm26, %zmm10, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $226, %zmm10, %zmm14, %zmm5 +; AVX512F-NEXT: vpternlogq $236, %ymm22, %ymm0, %ymm16 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX512F-NEXT: vpternlogq $236, %ymm22, %ymm5, %ymm21 +; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512F-NEXT: vpshufb %ymm14, %ymm3, %ymm14 +; AVX512F-NEXT: vpternlogq $248, %ymm22, %ymm5, %ymm14 +; AVX512F-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm5 +; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX512F-NEXT: vmovdqa64 %xmm27, %xmm14 +; AVX512F-NEXT: vpshufb %xmm14, %xmm3, %xmm14 +; AVX512F-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX512F-NEXT: vpor %xmm14, %xmm9, %xmm9 +; AVX512F-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512F-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512F-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vpternlogq $226, %ymm23, %ymm7, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} 
xmm9 = xmm9[0,1,2,3,4],xmm8[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm9 +; AVX512F-NEXT: vpternlogq $248, %ymm22, %ymm9, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm9 +; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm7 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $242, %ymm9, %ymm14, %ymm12 +; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm9 +; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-NEXT: vinserti32x4 $2, %xmm25, %zmm10, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $226, %zmm10, %zmm12, %zmm8 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %zmm5, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512F-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm13, %xmm11, %xmm5 -; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $242, %ymm0, %ymm12, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm2 -; AVX512F-NEXT: vinserti32x4 $2, %xmm18, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm14, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm10, %zmm0 +; AVX512F-NEXT: vpternlogq $184, %zmm8, %zmm10, %zmm9 +; AVX512F-NEXT: vmovdqa64 %xmm24, %xmm8 +; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $242, %ymm2, %ymm14, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm3 +; AVX512F-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm12, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm10, %zmm2 ; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; 
AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm16 ; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 -; AVX512F-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm21 +; AVX512F-NEXT: vpternlogq $184, %zmm16, %zmm28, %zmm5 +; AVX512F-NEXT: vpternlogq $184, %zmm21, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm3, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-NEXT: addq $40, %rsp +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4826,9 +4788,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] ; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 -; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2} +; AVX512BW-NEXT: kmovd %r10d, %k3 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k3} +; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm2 {%k3} ; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3] ; AVX512BW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14 @@ -4846,8 +4808,8 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512BW-NEXT: movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000 -; AVX512BW-NEXT: kmovq %rdi, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm2 {%k3} +; AVX512BW-NEXT: kmovq %rdi, %k2 +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm7, %xmm9, %xmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> @@ -4864,8 +4826,8 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm9, %xmm21, %xmm9 ; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm9 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] -; AVX512BW-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} +; AVX512BW-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k3} +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k3} ; AVX512BW-NEXT: vpshufb %ymm7, %ymm22, %ymm7 ; AVX512BW-NEXT: vpshufb %xmm12, %xmm25, %xmm8 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm24, %xmm12 @@ -4874,26 +4836,26 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} -; AVX512BW-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4} -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k2} +; AVX512BW-NEXT: vpblendmw %ymm13, %ymm5, %ymm7 {%k4} +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> ; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 -; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm8, %xmm16 -; AVX512BW-NEXT: vpshufb %xmm7, %xmm16, %xmm12 +; AVX512BW-NEXT: kmovd %edi, %k3 +; AVX512BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm15 {%k3} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-NEXT: vpshufb %xmm8, %xmm16, %xmm12 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm17, %xmm8, %xmm18 +; AVX512BW-NEXT: vpshufb %xmm17, %xmm15, %xmm18 ; AVX512BW-NEXT: vporq %xmm12, %xmm18, %xmm18 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k5 -; AVX512BW-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} -; AVX512BW-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k2} +; AVX512BW-NEXT: vpshufb %ymm19, %ymm7, %ymm18 {%k5} +; AVX512BW-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k3} ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 -; AVX512BW-NEXT: vpshufb %xmm7, %xmm21, %xmm7 +; AVX512BW-NEXT: vpshufb %xmm8, %xmm21, %xmm8 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm20, %xmm12 -; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512BW-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = ; AVX512BW-NEXT: vpshufb %xmm22, %xmm17, %xmm12 @@ -4902,20 +4864,20 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm25, %xmm24, %xmm27 ; AVX512BW-NEXT: vporq %xmm12, %xmm27, %xmm12 ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12 +; AVX512BW-NEXT: vinserti32x4 $2, %xmm8, %zmm12, %zmm12 ; AVX512BW-NEXT: movl $2097151, %edi # imm = 0x1FFFFF ; AVX512BW-NEXT: kmovq %rdi, %k6 ; AVX512BW-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4} +; AVX512BW-NEXT: vpblendmw %ymm14, %ymm4, %ymm8 {%k4} ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1} ; AVX512BW-NEXT: vpshufb %xmm22, %xmm18, %xmm22 ; AVX512BW-NEXT: 
vextracti32x4 $1, %ymm18, %xmm27 ; AVX512BW-NEXT: vpshufb %xmm25, %xmm27, %xmm25 ; AVX512BW-NEXT: vporq %xmm22, %xmm25, %xmm22 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512BW-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5} +; AVX512BW-NEXT: vpshufb %ymm19, %ymm8, %ymm22 {%k5} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqu8 %zmm19, %zmm12 {%k2} ; AVX512BW-NEXT: movw $9289, %di # imm = 0x2449 ; AVX512BW-NEXT: kmovd %edi, %k4 ; AVX512BW-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} @@ -4923,28 +4885,28 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm13, %xmm16, %xmm14 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm16, %xmm8, %xmm8 -; AVX512BW-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] -; AVX512BW-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} -; AVX512BW-NEXT: vpshufb %xmm13, %xmm21, %xmm13 -; AVX512BW-NEXT: vpshufb %xmm16, %xmm20, %xmm15 -; AVX512BW-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX512BW-NEXT: vpshufb %xmm15, %xmm17, %xmm16 +; AVX512BW-NEXT: vpshufb %xmm16, %xmm15, %xmm15 +; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] +; AVX512BW-NEXT: vpshufb %ymm15, %ymm7, %ymm14 {%k5} +; AVX512BW-NEXT: vpshufb %xmm13, %xmm21, %xmm7 +; AVX512BW-NEXT: vpshufb %xmm16, %xmm20, %xmm13 +; AVX512BW-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = +; AVX512BW-NEXT: vpshufb %xmm16, %xmm17, %xmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = ; AVX512BW-NEXT: vpshufb %xmm17, %xmm24, %xmm19 -; AVX512BW-NEXT: vporq %xmm16, %xmm19, %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13 -; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6} -; AVX512BW-NEXT: vpshufb %xmm15, %xmm18, %xmm8 -; AVX512BW-NEXT: vpshufb %xmm17, %xmm27, %xmm15 -; AVX512BW-NEXT: vpor %xmm8, %xmm15, %xmm8 -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} +; AVX512BW-NEXT: vporq %xmm13, %xmm19, %xmm13 +; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-NEXT: vinserti32x4 $2, %xmm7, %zmm13, %zmm13 +; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm13 {%k6} +; AVX512BW-NEXT: vpshufb %xmm16, %xmm18, %xmm7 +; AVX512BW-NEXT: vpshufb %xmm17, %xmm27, %xmm14 +; AVX512BW-NEXT: vpor %xmm7, %xmm14, %xmm7 +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vpshufb %ymm15, %ymm8, %ymm7 {%k5} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k2} ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> @@ -4961,7 +4923,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm14 ; AVX512BW-NEXT: 
vpshufb %xmm16, %xmm0, %xmm16 ; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14 -; AVX512BW-NEXT: vmovdqu16 %ymm26, %ymm1 {%k2} +; AVX512BW-NEXT: vmovdqu16 %ymm26, %ymm1 {%k3} ; AVX512BW-NEXT: vextracti32x4 $1, %ymm1, %xmm16 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = ; AVX512BW-NEXT: vpshufb %xmm17, %xmm16, %xmm18 @@ -4974,7 +4936,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: kmovq %rdi, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm10 {%k1} ; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm7 -; AVX512BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k2} +; AVX512BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k3} ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm11 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm14 ; AVX512BW-NEXT: vpshufb %xmm19, %xmm6, %xmm17 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 3b13741aaf7d5b..f74a6a94af7972 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -20,50 +20,50 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb 
%xmm6, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movd %xmm1, %edi +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movd %xmm4, %edi ; SSE-NEXT: movw %di, (%rsi) -; SSE-NEXT: movd %xmm4, %esi +; SSE-NEXT: movd %xmm2, %esi ; SSE-NEXT: movw %si, (%rdx) -; SSE-NEXT: movd %xmm5, %edx +; SSE-NEXT: movd %xmm6, %edx ; SSE-NEXT: movw %dx, (%rcx) -; SSE-NEXT: movd %xmm6, %ecx +; SSE-NEXT: movd %xmm5, %ecx ; SSE-NEXT: movw %cx, (%r8) ; SSE-NEXT: movd %xmm7, %ecx ; SSE-NEXT: movw %cx, (%r9) ; SSE-NEXT: movd %xmm0, %ecx ; SSE-NEXT: movw %cx, (%r10) -; SSE-NEXT: movd %xmm2, %ecx +; SSE-NEXT: movd %xmm1, %ecx ; SSE-NEXT: movw %cx, (%rax) ; SSE-NEXT: retq ; @@ -108,108 +108,109 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = 
[65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm6[2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm4, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] +; SSE-NEXT: pandn %xmm15, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm9 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[1,1,0,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm15 -; 
SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[1,1,0,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: por %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,0,2,3] -; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; 
SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,0,2,3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movd %xmm2, (%rsi) -; SSE-NEXT: movd %xmm5, (%rdx) -; SSE-NEXT: movd %xmm3, (%rcx) -; SSE-NEXT: movd %xmm6, (%r8) -; SSE-NEXT: movd %xmm4, (%r9) -; SSE-NEXT: movd %xmm8, (%rdi) +; SSE-NEXT: movd %xmm4, (%rsi) +; SSE-NEXT: movd %xmm9, (%rdx) +; SSE-NEXT: movd %xmm6, (%rcx) +; SSE-NEXT: movd %xmm5, (%r8) +; SSE-NEXT: movd %xmm8, (%r9) +; SSE-NEXT: movd %xmm3, (%rdi) ; SSE-NEXT: movd %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -391,134 +392,127 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 ; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = 
[65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm11, 
%xmm1 -; SSE-NEXT: pand %xmm5, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[2,3] ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm13 -; SSE-NEXT: por %xmm7, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE-NEXT: pand %xmm0, %xmm14 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: 
packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm3 @@ -528,30 +522,26 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -559,61 +549,60 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 
-; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: movq %xmm13, (%rsi) -; SSE-NEXT: movq %xmm9, (%rdx) -; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm10, (%r9) -; SSE-NEXT: movq %xmm11, (%rdi) -; SSE-NEXT: movq %xmm0, (%rax) +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: movq %xmm6, (%rsi) +; SSE-NEXT: movq %xmm10, (%rdx) +; SSE-NEXT: movq %xmm11, (%rcx) +; SSE-NEXT: movq %xmm2, (%r8) +; SSE-NEXT: movq %xmm12, (%r9) +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: movq %xmm1, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf8: @@ -870,25 +859,27 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: subq $184, %rsp ; SSE-NEXT: movdqa 96(%rdi), %xmm15 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: movdqa 64(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: pxor %xmm9, %xmm9 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -897,23 +888,22 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; 
SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1] @@ -924,37 +914,33 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm15, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -963,13 +949,15 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pxor %xmm7, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm2 @@ -978,279 +966,270 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm13 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = 
[65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE-NEXT: pand %xmm14, %xmm11 ; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,2,2,3] +; SSE-NEXT: por %xmm11, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,2,3] ; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: por %xmm8, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm13 ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = 
xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; SSE-NEXT: packuswb %xmm8, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm13 -; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: 
packuswb %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm13, %xmm2 +; SSE-NEXT: packuswb %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] +; 
SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,7,6] -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm13, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm2, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm13, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm6 +; 
SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,7,6] +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: por %xmm13, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 
+; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm13, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,1,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm4, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: packuswb %xmm2, %xmm10 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: packuswb %xmm2, %xmm8 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm13, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 @@ -1258,252 +1237,255 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,2,1] +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa 
%xmm14, %xmm1 +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: andps %xmm14, %xmm2 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,5,4,7,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,5,4,7,6] ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] +; SSE-NEXT: pand %xmm8, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm2 -; 
SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm13, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: andps %xmm5, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE-NEXT: andps %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: 
pxor %xmm7, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE-NEXT: pxor %xmm8, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: por %xmm9, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] -; SSE-NEXT: packuswb %xmm7, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,1,0,3] +; SSE-NEXT: packuswb %xmm7, %xmm5 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: 
punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,0,65535,65535] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: andps %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: andps %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: por %xmm5, %xmm14 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm8, (%rcx) -; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movdqa %xmm6, (%rcx) +; SSE-NEXT: movdqa %xmm0, (%r8) ; SSE-NEXT: movdqa %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm2, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm5, (%rax) -; SSE-NEXT: addq $168, %rsp +; SSE-NEXT: movdqa %xmm14, (%rax) +; SSE-NEXT: addq $184, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb 
{{.*#+}} xmm9 = xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,xmm6[2,9,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vpxor %xmm12, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[3,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[3,10] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[4,11] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[4,11] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u],zero,zero,xmm6[4,11,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm13, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[5,12] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm5[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[6,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[6,13] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm0, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[0,7,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm1[0,7,14] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm14, %xmm8 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm0, %xmm8, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[3,10,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[1,8,15] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm3[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm1[1,8,15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] ; AVX1-ONLY-NEXT: vpblendw $31, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4],xmm1[5,6,7] @@ -1516,7 +1498,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rax) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride7_vf16: @@ -1637,33 +1619,33 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] ; AVX512F-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6 -; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm7, %xmm6, %xmm9 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512F-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm7 +; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm8[3,10] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero +; AVX512F-NEXT: vpor %xmm9, %xmm8, %xmm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $184, %xmm7, %xmm10, %xmm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm7, %ymm8 +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm11, %xmm8, %xmm8 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero ; AVX512F-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512F-NEXT: vpternlogq $184, %xmm10, %xmm7, %xmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm10, %ymm12 +; AVX512F-NEXT: vpternlogq $184, %xmm8, %xmm10, %xmm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm8, %ymm12 ; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm12 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] ; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1673,42 +1655,42 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512F-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512F-NEXT: vpternlogq $184, %xmm12, %xmm7, %xmm14 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm8 -; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX512F-NEXT: vpternlogq $184, %xmm12, %xmm10, %xmm14 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm6 +; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm12 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero ; AVX512F-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512F-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm12 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm9 -; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] +; AVX512F-NEXT: vpternlogq $184, %xmm6, %xmm10, %xmm12 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm7 +; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm6 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512F-NEXT: vpor %xmm13, %xmm9, %xmm9 -; AVX512F-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm9 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512F-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512F-NEXT: vpternlogq $184, %xmm6, %xmm10, %xmm7 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vpternlogq $184, %xmm1, %xmm7, %xmm0 +; AVX512F-NEXT: vpternlogq $184, %xmm1, %xmm10, %xmm0 ; AVX512F-NEXT: vmovdqa %xmm5, (%rsi) -; AVX512F-NEXT: vmovdqa %xmm6, (%rdx) +; AVX512F-NEXT: vmovdqa %xmm9, (%rdx) ; AVX512F-NEXT: vmovdqa %xmm11, (%rcx) ; AVX512F-NEXT: vmovdqa %xmm14, (%r8) ; AVX512F-NEXT: vmovdqa %xmm12, (%r9) -; AVX512F-NEXT: vmovdqa %xmm9, (%r10) +; AVX512F-NEXT: vmovdqa %xmm7, (%r10) ; AVX512F-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1727,13 +1709,13 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: kmovd %r11d, %k1 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512BW-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm4[5,6,7] +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] +; 
AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] +; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} @@ -1743,7 +1725,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1],xmm1[2,3,4],xmm0[5],xmm1[6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -1757,7 +1739,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} ; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 @@ -1769,7 +1751,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} @@ -1778,7 +1760,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero ; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k3} @@ -1788,7 +1770,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6],xmm0[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm4[u,u,u,u,u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero ; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k4} @@ -1798,10 +1780,10 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi) +; AVX512BW-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx) ; AVX512BW-NEXT: vmovdqa %xmm8, (%r8) @@ -1831,28 +1813,29 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $648, %rsp # imm = 0x288 -; SSE-NEXT: movdqa 208(%rdi), %xmm14 -; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: subq $632, %rsp # imm = 0x278 +; SSE-NEXT: movdqa 208(%rdi), %xmm6 +; SSE-NEXT: movdqa 192(%rdi), %xmm15 +; SSE-NEXT: movdqa 176(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm13 ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: pxor %xmm14, %xmm14 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[0,1,2,1] @@ -1861,25 +1844,20 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm12 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] @@ -1890,26 +1868,25 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] @@ -1922,23 +1899,24 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1947,23 +1925,21 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] @@ -1972,27 +1948,27 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa 64(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -2002,100 +1978,95 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 @@ -2103,793 +2074,778 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, 
%xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; 
SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,2,2,3] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pand %xmm13, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pslld $16, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm14 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,5] +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
pand %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pslld $16, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: packuswb %xmm8, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, 
%xmm12 +; SSE-NEXT: pandn %xmm5, %xmm12 +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: por %xmm5, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: packuswb %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm12, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; 
SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; SSE-NEXT: pand %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm10, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm15, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm0 +; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3],xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] +; SSE-NEXT: pand %xmm2, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,1,3] +; SSE-NEXT: por %xmm11, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 
-; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; SSE-NEXT: pand %xmm11, %xmm14 -; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,1,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm15 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 
; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] ; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; 
SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: 
movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: packuswb %xmm0, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm3[0],xmm7[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm2, 
%xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm14 -; SSE-NEXT: andps %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: andps %xmm15, %xmm7 +; SSE-NEXT: por %xmm7, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: 
movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm5[0],xmm8[1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm12, %xmm5 -; SSE-NEXT: andps %xmm0, %xmm8 -; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm4[0],xmm10[1,2,3] +; SSE-NEXT: movdqa 
{{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm11, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm9, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm12, %xmm4 +; SSE-NEXT: andps %xmm5, %xmm10 +; SSE-NEXT: por %xmm10, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm15 +; 
SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: andps %xmm3, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm10[0],xmm8[1,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: andps %xmm1, %xmm8 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3],xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: por %xmm10, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5] -; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] ; SSE-NEXT: packuswb %xmm8, %xmm10 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: andps %xmm3, %xmm10 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm11, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] -; SSE-NEXT: pxor %xmm12, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] -; SSE-NEXT: packuswb %xmm11, %xmm10 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm11, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm7[0],xmm10[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3],xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm12, %xmm7 ; SSE-NEXT: andps %xmm1, %xmm10 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm10, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = 
xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[3,3,3,3] +; SSE-NEXT: packuswb %xmm12, %xmm11 +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm10[0],xmm11[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm8 +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: por %xmm11, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2898,566 +2854,556 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movdqa %xmm4, 
(%r9) -; SSE-NEXT: movdqa %xmm14, 16(%r9) +; SSE-NEXT: movdqa %xmm2, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: movdqa %xmm5, 16(%rax) +; SSE-NEXT: movdqa %xmm3, (%rax) +; SSE-NEXT: movdqa %xmm4, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm1, (%rax) -; SSE-NEXT: movdqa %xmm8, 16(%rax) -; SSE-NEXT: addq $648, %rsp # imm = 0x288 +; SSE-NEXT: movdqa %xmm8, (%rax) +; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: addq $632, %rsp # imm = 0x278 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $200, %rsp +; AVX1-ONLY-NEXT: subq $120, %rsp ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm6[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u],zero,zero,xmm8[3,10,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,5,12],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[4,11,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,6,13],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u],zero,zero,xmm11[3,10,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,5,12],zero,zero,xmm2[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} 
xmm10 = +; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u],zero,zero,xmm11[4,11,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,6,13],zero,zero,xmm2[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,5,12],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,6,13],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,3,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u],zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; 
AVX1-ONLY-NEXT: vpor %xmm2, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm3, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm4, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12 -; 
AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; AVX1-ONLY-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm3[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u],zero,zero,xmm9[4,11,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,4,11],zero,zero,xmm8[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u],zero,zero,xmm11[0,7,14,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,5,12],zero,zero,xmm8[u,u,u,u,u] +; 
AVX1-ONLY-NEXT: vpor %xmm1, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u],zero,zero,xmm11[1,8,15,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,6,13],zero,zero,xmm8[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm12, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14] +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,0,7,14],zero,zero,xmm8[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; 
AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps (%rsp), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm15, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX1-ONLY-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[3,10] +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[4,11] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = 
[3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[5,12] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15,u,u] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[6,13] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u] -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[6,13] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[0,7,14] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm0[u,u,u,u],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[1,8,15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm14, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[2,9,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[3,10,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u] +; 
AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[1,8,15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $200, %rsp +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: addq $120, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i8_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $72, %rsp -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: subq $40, %rsp +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: 
vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm7, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7,8,9],ymm4[10],ymm3[11,12,13],ymm4[14],ymm3[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm12, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; 
AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[5,12] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm12, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm4[u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX2-SLOW-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm12, %xmm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] -; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 +; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14] +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm12, %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm11, %ymm12, %ymm11 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15] +; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm11, %ymm12, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; 
AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm13 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm8, %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm8, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1,2],ymm5[3],ymm9[4,5,6],ymm5[7,8],ymm9[9,10],ymm5[11],ymm9[12,13,14],ymm5[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm7 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7,8],ymm7[9],ymm11[10,11,12],ymm7[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm6 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; 
AVX2-SLOW-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm13[1,2,3,4,5,6,7],ymm4[8],ymm13[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-SLOW-NEXT: addq $72, %rsp +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-SLOW-NEXT: addq $40, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i8_stride7_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $40, %rsp -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = 
<255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] @@ -3465,9 +3411,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,6,1,2,4,6] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -3475,18 +3419,18 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm8, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7,8,9],ymm7[10],ymm1[11,12,13],ymm7[14],ymm1[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm9, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u] @@ -3494,12 +3438,12 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpblendvb %ymm2, 
%ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] @@ -3507,352 +3451,353 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12] ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX2-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX2-FAST-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX2-FAST-NEXT: vpor %xmm10, %xmm14, %xmm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm12, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm14, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX2-FAST-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm12, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm9, %ymm14 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14] -; AVX2-FAST-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpor %xmm14, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm9, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15] ; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; 
AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm5, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8,9,10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1,2],ymm4[3],ymm13[4,5,6],ymm4[7,8],ymm13[9,10],ymm4[11],ymm13[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5,6],ymm6[7,8],ymm7[9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4],ymm11[5,6],ymm7[7,8],ymm11[9,10,11],ymm7[12],ymm11[13,14],ymm7[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,3,5,6,1,3,5,6] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6,7,8],ymm7[9],ymm1[10,11],ymm7[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7,8],ymm7[9],ymm4[10,11,12],ymm7[13],ymm4[14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm15[1,2,3,4,5,6,7],ymm5[8],ymm15[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: 
vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm9[1,2,3,4,5,6,7],ymm3[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm1 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm7, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7,8,9],ymm4[10],ymm3[11,12,13],ymm4[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm11, %ymm12, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu 
%ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[5,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm12, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm4[u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm10, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm11, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm12, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm8, %ymm7, %ymm1 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1,2],ymm5[3],ymm9[4,5,6],ymm5[7,8],ymm9[9,10],ymm5[11],ymm9[12,13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} 
xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7,8],ymm7[9],ymm11[10,11,12],ymm7[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm13[1,2,3,4,5,6,7],ymm4[8],ymm13[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3861,70 +3806,70 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm1 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm3 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] +; AVX512F-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 ; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm14, %ymm9 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm9 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %xmm10, %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa %ymm14, %ymm8 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm8 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpor 
%xmm9, %xmm8, %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm1, %ymm9, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa %ymm11, %ymm8 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm4, %ymm9, %ymm8 ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7,8,9],ymm10[10],ymm8[11,12],ymm10[13],ymm8[14,15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm8 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm12, %ymm16, %ymm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 ; AVX512F-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm12 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm12 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3,4,5,6],ymm13[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm12 ; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm12 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %xmm15, %xmm12, %xmm15 +; AVX512F-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa %ymm14, %ymm12 -; AVX512F-SLOW-NEXT: vpternlogq $202, 
%ymm9, %ymm1, %ymm12 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm9, %ymm4, %ymm12 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm12 -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm8, %ymm16, %ymm12 +; AVX512F-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm12 +; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm15, %ymm16, %ymm12 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm19 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm8 @@ -3933,12 +3878,12 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %xmm15, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm15 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm15 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm9, %ymm4, %ymm15 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm15 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm14 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm14 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] @@ -3946,7 +3891,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] @@ -3954,14 +3899,14 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm20 ; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm8 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm8 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm8 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u] ; 
AVX512F-SLOW-NEXT: vpor %xmm12, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512F-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] @@ -3973,7 +3918,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %xmm14, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm14 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm9, %ymm4, %ymm14 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm14 @@ -3981,14 +3926,14 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm8 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %xmm12, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512F-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm8, %ymm16, %ymm12 @@ -3999,20 +3944,20 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %xmm14, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa %ymm11, %ymm14 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm14 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm9, %ymm4, %ymm14 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm14 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} 
ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa %ymm11, %ymm8 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm8 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %xmm12, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] ; AVX512F-SLOW-NEXT: vpor %xmm12, %xmm15, %xmm12 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 @@ -4024,33 +3969,33 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %xmm15, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm15 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm1, %ymm9, %ymm15 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm4, %ymm9, %ymm15 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm15 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm13 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm13 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] ; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX512F-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm3 +; AVX512F-SLOW-NEXT: vpternlogq 
$184, %ymm1, %ymm16, %ymm2 ; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm11 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm1, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vpternlogq $202, %ymm4, %ymm9, %ymm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, (%rdx) @@ -4067,54 +4012,54 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,6,1,2,4,6] ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm12 ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-FAST-NEXT: vmovdqa 
{{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm7 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm7, %xmm8, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm6 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,7,14],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm7, %xmm6, %xmm11 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa %ymm9, %ymm11 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm1, %ymm7, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa %ymm9, %ymm6 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm3, %ymm7, %ymm6 ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7,8,9],ymm8[10],ymm6[11,12],ymm8[13],ymm6[14,15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm6, %ymm16, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,3,4,6,1,3,4,6] -; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm10 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-FAST-NEXT: vpternlogq $226, %ymm10, %ymm16, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512F-FAST-NEXT: vmovdqa 
%ymm9, %ymm10 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,xmm10[4,11],zero,zero,xmm10[0,7,14,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,3,4,6,1,3,4,6] +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3,4,5,6],ymm11[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm10 ; AVX512F-FAST-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm10 @@ -4123,68 +4068,68 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpor %xmm14, %xmm10, %xmm14 ; AVX512F-FAST-NEXT: vmovdqa %ymm13, %ymm10 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm10 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm7, %ymm3, %ymm10 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm10 -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm6, %ymm16, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm14 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm14 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm13 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] +; AVX512F-FAST-NEXT: vpternlogq $226, %ymm15, %ymm16, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm14 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: 
vextracti128 $1, %ymm14, %xmm14 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm7, %ymm3, %ymm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm8[3],ymm15[4,5],ymm8[6],ymm15[7,8,9,10],ymm8[11],ymm15[12,13],ymm8[14],ymm15[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm16, %ymm14, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm13 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[1,8,15,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm6, %xmm13, %xmm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,3,5,6,1,3,5,6] -; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX512F-FAST-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,3,5,6,1,3,5,6] +; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm6 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm12 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm12 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12] ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512F-FAST-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512F-FAST-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm13, %ymm17, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm15 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm7, %ymm3, %ymm15 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm15 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm16, %ymm13, %ymm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm6[1,2,3,4,5,6,7],ymm15[8],ymm6[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] @@ -4194,7 +4139,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512F-FAST-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, 
%ymm0, %ymm13 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm17, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm6 ; AVX512F-FAST-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm6 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 @@ -4202,15 +4147,15 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpor %xmm6, %xmm15, %xmm6 ; AVX512F-FAST-NEXT: vmovdqa %ymm9, %ymm15 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm7, %ymm3, %ymm15 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm16, %ymm6, %ymm15 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 ; AVX512F-FAST-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] @@ -4220,7 +4165,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14] ; AVX512F-FAST-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm17, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm6 ; AVX512F-FAST-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm6 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 @@ -4228,33 +4173,33 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpor %xmm6, %xmm15, %xmm6 ; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm1, %ymm7, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm3, %ymm7, %ymm15 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm16, %ymm6, %ymm15 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm11 -; 
AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm11 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15] -; AVX512F-FAST-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm3 +; AVX512F-FAST-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm1, %ymm17, %ymm2 ; AVX512F-FAST-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm9, %xmm4 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpternlogq $202, %ymm1, %ymm7, %ymm0 +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-FAST-NEXT: vpternlogq $202, %ymm3, %ymm7, %ymm0 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm16, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa %ymm10, (%rdx) @@ -4276,7 +4221,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm4 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm5 +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm6 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm11 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
<0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> @@ -4284,25 +4229,25 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm10 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-SLOW-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-SLOW-NEXT: kmovd %r11d, %k5 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm3 {%k5} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: movw $992, %r11w # imm = 0x3E0 ; AVX512BW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm5, %ymm3 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512BW-SLOW-NEXT: movw $8772, %r11w # imm = 0x2244 ; AVX512BW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm7, %ymm5, %ymm8 {%k1} ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u] @@ -4318,10 +4263,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] ; AVX512BW-SLOW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k4 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4} +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm3 {%k4} ; AVX512BW-SLOW-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k2 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm13 {%k2} ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] @@ -4332,7 +4277,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3} ; AVX512BW-SLOW-NEXT: movw $9288, %di # 
imm = 0x2448 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k3 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm7, %ymm5, %ymm13 {%k3} ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] @@ -4344,7 +4289,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm13 {%k1} ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] @@ -4352,7 +4297,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k4 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm7, %ymm12 {%k5} ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] @@ -4365,7 +4310,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm7, %ymm13 {%k2} ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] @@ -4378,7 +4323,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k5 ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm14 {%k3} ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] @@ -4386,7 +4331,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; 
AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm7, %ymm13 {%k1} ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] @@ -4397,15 +4342,15 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm1, %ymm14 {%k2} ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm7, %ymm13 {%k3} ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] @@ -4416,7 +4361,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm1, %ymm14 {%k1} ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] @@ -4424,30 +4369,30 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; 
AVX512BW-SLOW-NEXT: vmovdqu16 %ymm7, %ymm5 {%k2} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] ; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm5 {%k5} +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k3} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm5[1,2,3,4,5,6,7],ymm1[8],ymm5[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm10, (%rdx) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm12, (%rcx) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm11, (%r8) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm5, (%r9) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm6, (%r9) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm4, (%r10) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -4463,7 +4408,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] ; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm4 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm5 +; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm6 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] ; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm9 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] @@ -4471,25 +4416,25 @@ define void 
@load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] ; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm8 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k5 -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512BW-FAST-NEXT: vpblendmw %ymm1, %ymm2, %ymm3 {%k5} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: movw $992, %r11w # imm = 0x3E0 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu16 %ymm5, %ymm3 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512BW-FAST-NEXT: movw $8772, %r11w # imm = 0x2244 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-FAST-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1} +; AVX512BW-FAST-NEXT: vpblendmw %ymm7, %ymm5, %ymm11 {%k1} ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] @@ -4503,10 +4448,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] ; AVX512BW-FAST-NEXT: movl $-524288, %r11d # imm = 0xFFF80000 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k4 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4} +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm3 {%k4} ; AVX512BW-FAST-NEXT: movw $4644, %r11w # imm = 0x1224 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k2 -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2} +; AVX512BW-FAST-NEXT: vpblendmw %ymm1, %ymm2, %ymm11 {%k2} ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u] @@ -4517,7 +4462,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} ; AVX512BW-FAST-NEXT: movw $9288, %r11w # imm = 0x2448 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k3 -; 
AVX512BW-FAST-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} +; AVX512BW-FAST-NEXT: vpblendmw %ymm7, %ymm5, %ymm11 {%k3} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u] ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] @@ -4529,7 +4474,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] ; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4} -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1} +; AVX512BW-FAST-NEXT: vpblendmw %ymm1, %ymm2, %ymm11 {%k1} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] @@ -4537,7 +4482,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: movl $261632, %r11d # imm = 0x3FE00 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k4 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5} +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm7, %ymm10 {%k5} ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] @@ -4550,91 +4495,91 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2} +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm7, %ymm11 {%k2} ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] -; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-FAST-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[5,12] +; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; 
AVX512BW-FAST-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-FAST-NEXT: kmovd %edi, %k5 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm13, %ymm11 {%k5} +; AVX512BW-FAST-NEXT: vpblendmw %ymm1, %ymm2, %ymm13 {%k3} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm11[1,2,3,4,5,6,7],ymm13[8],ymm11[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm7, %ymm11 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[6,13] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-FAST-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm13, %ymm11 {%k5} +; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm1, %ymm13 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = 
zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0],ymm11[1,2,3,4,5,6,7],ymm13[8],ymm11[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm7, %ymm11 {%k3} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm12[0,7,14] +; AVX512BW-FAST-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14] -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm13, %ymm11 {%k5} +; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm1, %ymm13 {%k1} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; 
AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm11[1,2,3,4,5,6,7],ymm13[8],ymm11[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %ymm7, %ymm5 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm12[1,8,15] ; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm7, %ymm5 {%k5} +; AVX512BW-FAST-NEXT: vmovdqu16 %ymm2, %ymm1 {%k3} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm5[1,2,3,4,5,6,7],ymm1[8],ymm5[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512BW-FAST-NEXT: vmovdqa %ymm8, (%rdx) ; AVX512BW-FAST-NEXT: vmovdqa %ymm10, (%rcx) ; AVX512BW-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, (%r9) +; AVX512BW-FAST-NEXT: vmovdqa %ymm6, (%r9) ; AVX512BW-FAST-NEXT: vmovdqa %ymm4, (%r10) ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper @@ -4660,31 
+4605,31 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1528, %rsp # imm = 0x5F8 -; SSE-NEXT: movdqa 208(%rdi), %xmm12 -; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: subq $1544, %rsp # imm = 0x608 +; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: movdqa 192(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm7 ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pxor %xmm7, %xmm7 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -4692,1341 +4637,1338 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, 
%xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 256(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa 272(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa 240(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa 288(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa 304(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa 320(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa 384(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} 
xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa 336(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa 400(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa 416(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: movdqa 416(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: 
punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa 432(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm7, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 +; 
SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb 
%xmm1, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: packuswb %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: packuswb %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; SSE-NEXT: 
pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: packuswb %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; 
SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; 
SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: packuswb %xmm5, 
%xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; 
SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pand %xmm14, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pand %xmm14, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: por %xmm3, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm13, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm12, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm15, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: pslld $16, %xmm13 -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5] -; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = 
xmm0[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm11, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = 
xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,5] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; SSE-NEXT: pand %xmm14, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: packuswb %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,5] ; SSE-NEXT: pand %xmm4, %xmm0 -; 
SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; SSE-NEXT: pand %xmm14, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte 
Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,3,2,3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: packuswb %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm2, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,1,3] +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; 
SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; 
SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: pand %xmm7, %xmm12 ; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,3,2,3] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand 
%xmm15, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pxor %xmm11, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm13 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm0, %xmm3 ; 
SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm0, %xmm3 ; 
SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: por %xmm9, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,2,1,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm0 
+; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm2 @@ -6038,7 +5980,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pxor %xmm7, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -6047,1252 +5989,1223 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: 
punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: andps %xmm14, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] +; SSE-NEXT: pand %xmm8, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm13, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm2, 
%xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: andps %xmm14, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm8, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: por %xmm4, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm3, %xmm1 +; 
SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: pxor %xmm12, %xmm12 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] ; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} 
xmm3 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: andps %xmm10, %xmm7 -; SSE-NEXT: 
movdqa %xmm10, %xmm5 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm8[0],xmm9[1,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm7, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm13, %xmm8 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm11[0],xmm10[1,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: pandn %xmm7, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: por %xmm7, %xmm13 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm13, %xmm7 -; SSE-NEXT: andps %xmm5, %xmm9 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm13 -; 
SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: por %xmm13, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm9, %xmm15 -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm9[0],xmm15[1,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pand %xmm14, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: por %xmm11, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: andps %xmm5, %xmm15 -; SSE-NEXT: por %xmm15, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm12, %xmm4 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] ; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: packuswb %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,2,2,3] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm10, %xmm10 ; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm14 -; SSE-NEXT: pandn %xmm8, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; 
SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: pandn %xmm14, %xmm11 -; SSE-NEXT: andps %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] -; SSE-NEXT: packuswb %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] +; SSE-NEXT: packuswb %xmm6, %xmm10 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm10 = 
xmm0[0],xmm10[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm14 -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: por %xmm8, %xmm14 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm14, %xmm3 -; SSE-NEXT: andps %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[3,3,3,3] -; SSE-NEXT: packuswb %xmm12, %xmm8 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: andps %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm13, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: por %xmm13, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = 
xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[3,3,3,3] -; SSE-NEXT: packuswb %xmm13, %xmm8 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: andps %xmm4, %xmm8 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm14, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] -; SSE-NEXT: pxor %xmm15, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,3,3,3] -; SSE-NEXT: packuswb %xmm8, %xmm14 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm1[0],xmm14[1,2,3] +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm11, 
%xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,3,3,3] +; SSE-NEXT: packuswb %xmm9, %xmm10 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm6[0],xmm10[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; 
SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[3,3,3,3] +; SSE-NEXT: packuswb %xmm6, %xmm10 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm2[0],xmm10[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por 
%xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[3,3,3,3] +; SSE-NEXT: packuswb %xmm11, %xmm10 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm2[0],xmm10[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: andps %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm11, (%rax) -; SSE-NEXT: movdqa %xmm9, 48(%rax) -; SSE-NEXT: movdqa %xmm7, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movdqa %xmm13, (%rax) +; SSE-NEXT: movdqa %xmm4, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm1, (%rax) -; SSE-NEXT: movdqa %xmm13, 48(%rax) -; SSE-NEXT: movdqa %xmm12, 32(%rax) -; SSE-NEXT: movdqa %xmm3, 16(%rax) -; SSE-NEXT: addq $1528, %rsp # imm = 0x5F8 +; SSE-NEXT: movdqa %xmm14, (%rax) +; SSE-NEXT: movdqa %xmm6, 48(%rax) +; SSE-NEXT: movdqa %xmm9, 32(%rax) +; SSE-NEXT: movdqa %xmm0, 16(%rax) +; SSE-NEXT: addq $1544, %rsp # imm = 0x608 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $744, %rsp # imm = 0x2E8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,6,13,0,0,0,128,128,128,6,13,0,0,0,128] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: subq $728, %rsp # imm = 0x2D8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,6,13,0,0,0,128,128,128,6,13,0,0,0,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,0,0,0,3,10,128,128,128,0,0,0,3,10,128] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,15,0,0,0,128,128,1,8,15,0,0,0,128,128,1] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = 
[128,128,0,0,0,3,10,128,128,128,0,0,0,3,10,128] ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [8,15,0,0,0,128,128,1,8,15,0,0,0,128,128,1] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm2, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm6 -; 
AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm5, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm2, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] +; AVX1-ONLY-NEXT: 
# xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm4, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm12 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm3, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm14, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm12 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm14 -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm5, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm12 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm3, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; 
AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm3, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm5, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm2, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm1 
-; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm4, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm3, %xmm12, %xmm13 +; 
AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpblendvb %xmm12, (%rsp), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm11 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm13, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm11, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1,2],xmm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1,2],xmm7[3,4,5,6,7] ; AVX1-ONLY-NEXT: 
vmovddup {{.*#+}} xmm7 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm8, %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm0, %xmm1, %xmm0 ; 
AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm10, %ymm12, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm10 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] ; AVX1-ONLY-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm0[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,128,128,3,10,0,128,128,128,128,128,3,10] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,128,3,10,0,128,128,128,128,128,3,10] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [1,8,15,0,1,8,15,0,1,8,15,0,1,8,15,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm14, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; 
AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX1-ONLY-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm8[7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm10, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] @@ -7300,277 +7213,266 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vxorps %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm13 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpxor %xmm11, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm11[7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: 
vpshufb %xmm10, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [12,0,0,0,128,128,128,5,12,0,0,0,128,128,128,5] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,0,7,14,128,128,0,0,0,0,7,14,128] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[u,u] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,128,6,13,0,128,128,128,128,128,6,13] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm14[6,13] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm14[u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm5[6,13] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = 
[5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,1,8,15,128,128,0,0,0,1,8,15,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,1,8,15],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm3[0,7,14] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vmovaps 
%ymm1, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm13 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[2,9,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[0,7,14] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; 
AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm14, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovddup 
{{.*#+}} xmm0 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm12[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm10[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 @@ -7601,526 +7503,537 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX1-ONLY-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX1-ONLY-NEXT: addq $728, %rsp # imm = 0x2D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i8_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $760, %rsp # imm = 0x2F8 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-SLOW-NEXT: subq $776, %rsp # imm = 0x308 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> +; 
AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm9 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] +; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7,8,9],ymm2[10],ymm0[11,12,13],ymm2[14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm12, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm8 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] -; 
AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm8 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7,8,9],ymm2[10],ymm0[11,12,13],ymm2[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 +; 
AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm12 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 
+; AVX2-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm10 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm8 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = 
[0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm14, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm8 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm12 +; AVX2-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm12, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm7 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: 
vpblendvb %ymm6, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm14, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm7 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm11 -; AVX2-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm10 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] +; 
AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm11 +; AVX2-SLOW-NEXT: vpor %xmm10, %xmm15, %xmm10 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm9 +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm14, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] +; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm7 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, 
%ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm8 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm7 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm9 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm10 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm13 +; AVX2-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: 
vpshufb %xmm0, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, 
%ymm1, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm15 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm15, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} 
ymm6 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5,6],ymm2[7,8],ymm3[9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 -; AVX2-SLOW-NEXT: vextracti128 $1, 
%ymm13, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4,5,6],ymm1[7,8],ymm4[9,10],ymm1[11],ymm4[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2,3],ymm3[4],ymm13[5,6],ymm3[7,8],ymm13[9,10,11],ymm3[12],ymm13[13,14],ymm3[15] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2,3],ymm1[4],ymm9[5,6],ymm1[7,8],ymm9[9,10,11],ymm1[12],ymm9[13,14],ymm1[15] ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, 
%xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7,8],ymm2[9],ymm12[10,11],ymm2[12],ymm12[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: 
vpor %xmm2, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7,8],ymm4[9],ymm7[10,11,12],ymm4[13],ymm7[14,15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm4[4,5,6,7] 
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm9 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm8[0],mem[1,2,3,4,5,6,7],ymm8[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm10[0],mem[1,2,3,4,5,6,7],ymm10[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm13[0],mem[1,2,3,4,5,6,7],ymm13[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload +; 
AVX2-SLOW-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -8129,257 +8042,238 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-SLOW-NEXT: addq $760, %rsp # imm = 0x2F8 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-SLOW-NEXT: addq $776, %rsp # imm = 0x308 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i8_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $776, %rsp # imm = 0x308 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX2-FAST-NEXT: subq $744, %rsp # imm = 0x2E8 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm10 ; 
AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; 
AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = 
<0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3,4,5],ymm8[6],ymm2[7,8,9],ymm8[10],ymm2[11,12,13],ymm8[14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7,8,9],ymm8[10],ymm3[11,12,13],ymm8[14],ymm3[15] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm11, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = 
[1,2,0,2,1,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm11 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm14, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm12 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm12, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb 
%ymm9, %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm14, %xmm9 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm15, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm10, %xmm9 +; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm13 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm13, %ymm0 -; 
AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm13 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm10 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 @@ -8387,71 +8281,65 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm13 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm9 +; AVX2-FAST-NEXT: vpor %xmm13, 
%xmm9, %xmm9 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm8, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm9 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm8 -; AVX2-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm12 +; AVX2-FAST-NEXT: 
vpor %xmm7, %xmm12, %xmm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm8, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm11, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8459,62 +8347,61 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15 -; AVX2-FAST-NEXT: 
vpblendvb %ymm6, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm13 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm15 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8,9,10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm15, 
%ymm1, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7,8,9,10],ymm7[11],ymm0[12,13],ymm7[14],ymm0[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm14 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 @@ -8523,36 +8410,36 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm6 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5,6],ymm1[7,8],ymm12[9,10],ymm1[11],ymm12[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm15[1,2],ymm1[3],ymm15[4,5,6],ymm1[7,8],ymm15[9,10],ymm1[11],ymm15[12,13,14],ymm1[15] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4],ymm14[5,6],ymm3[7,8],ymm14[9,10,11],ymm3[12],ymm14[13,14],ymm3[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2,3],ymm3[4],ymm11[5,6],ymm3[7,8],ymm11[9,10,11],ymm3[12],ymm11[13,14],ymm3[15] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm9 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1,2,3],ymm1[4],ymm15[5,6],ymm1[7,8],ymm15[9,10,11],ymm1[12],ymm15[13,14],ymm1[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4],ymm12[5,6],ymm1[7,8],ymm12[9,10,11],ymm1[12],ymm12[13,14],ymm1[15] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm15 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> @@ -8566,7 +8453,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 @@ -8576,7 +8463,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> ; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 @@ -8584,13 +8471,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7,8],ymm4[9],ymm10[10,11,12],ymm4[13],ymm10[14,15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 @@ -8599,45 +8486,44 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm10, %xmm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,1,2,1,3,5,6] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = 
[1,3,1,2,1,3,5,6] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm6 -; AVX2-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2,3,4,5,6,7],ymm11[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm5[1,2,3,4,5,6,7],ymm14[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm8[0],mem[1,2,3,4,5,6,7],ymm8[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw 
$254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] @@ -8647,7 +8533,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, (%rsp), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload @@ -8673,525 +8559,536 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-FAST-NEXT: addq $776, %rsp # imm = 0x308 +; AVX2-FAST-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $760, %rsp # imm = 0x2F8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: subq $776, %rsp # imm = 0x308 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, 
%xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, 
%xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7,8,9],ymm2[10],ymm0[11,12,13],ymm2[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm3 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm12, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7,8,9],ymm2[10],ymm0[11,12,13],ymm2[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm11, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm8 
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm10, %xmm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm14, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm2, %xmm2 ; 
AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm10, %xmm15, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm14, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm10, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm7, %ymm0, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm6, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, 
%ymm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm15, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5,6],ymm2[7,8],ymm3[9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4,5,6],ymm1[7,8],ymm4[9,10],ymm1[11],ymm4[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2,3],ymm3[4],ymm13[5,6],ymm3[7,8],ymm13[9,10,11],ymm3[12],ymm13[13,14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, 
%ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2,3],ymm1[4],ymm9[5,6],ymm1[7,8],ymm9[9,10,11],ymm1[12],ymm9[13,14],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7,8],ymm2[9],ymm12[10,11],ymm2[12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7,8],ymm4[9],ymm7[10,11,12],ymm4[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7 
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm8[0],mem[1,2,3,4,5,6,7],ymm8[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm10[0],mem[1,2,3,4,5,6,7],ymm10[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm13[0],mem[1,2,3,4,5,6,7],ymm13[8],mem[9,10,11,12,13,14,15] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -9200,2020 +9097,2019 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq 
$760, %rsp # imm = 0x2F8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $776, %rsp # imm = 0x308 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $72, %rsp -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: subq $40, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm19, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm16, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm19, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm29, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpor %xmm4, %xmm9, %xmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7,8,9],ymm13[10],ymm10[11,12,13],ymm13[14],ymm10[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm9, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm12[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7,8,9],ymm2[10],ymm12[11,12,13],ymm2[14],ymm12[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm9, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm21 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: 
vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm13, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7,8,9],ymm0[10],ymm7[11,12,13],ymm0[14],ymm7[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm12[4,11],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm13, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4,5],ymm7[6],ymm3[7,8,9],ymm7[10],ymm3[11,12,13],ymm7[14],ymm3[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm26, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb 
{{.*#+}} xmm12 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm20, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7,8,9,10],ymm13[11],ymm10[12,13],ymm13[14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8,9,10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm7, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8,9,10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm13, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm5, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm1[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm14[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4,5,6],ymm13[7,8],ymm10[9,10],ymm13[11],ymm10[12,13,14],ymm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm20, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} 
ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5,6],ymm10[7,8],ymm12[9,10],ymm10[11],ymm12[12,13,14],ymm10[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm7, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm5, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm13, %ymm3 +; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5,6],ymm7[7,8],ymm3[9,10],ymm7[11],ymm3[12,13,14],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[2,9,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm13, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3],ymm13[4],ymm10[5,6],ymm13[7,8],ymm10[9,10,11],ymm13[12],ymm10[13,14],ymm13[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm19, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm20, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3],ymm12[4],ymm10[5,6],ymm12[7,8],ymm10[9,10,11],ymm12[12],ymm10[13,14],ymm12[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm7, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6,7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm5, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb 
{{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm7, %ymm3, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7,8],ymm10[9],ymm7[10,11,12],ymm10[13],ymm7[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm7, %ymm3, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm12, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[6,13] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm6, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm20, %ymm1, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: 
vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm15, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm29, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm24, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm29, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[2,9,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm10[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm24, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm15, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm15, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm13, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm20, %ymm5, %ymm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm19, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm19, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm0, %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm17, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: 
vpblendw {{.*#+}} ymm10 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm0, %ymm5, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7,8],ymm8[9],ymm1[10,11,12],ymm8[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm5, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm16, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm16, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm12[1,2,3],ymm7[4],ymm12[5,6],ymm7[7,8],ymm12[9,10,11],ymm7[12],ymm12[13,14],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm19, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7,8],ymm7[9],ymm15[10,11],ymm7[12],ymm15[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm18, %ymm9, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm17, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] +; 
AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm18, %ymm9, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm19, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm17, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm15, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm19, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %ymm27, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm27, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm29, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm16[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm23, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm11[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7,8,9],ymm2[10],ymm11[11,12],ymm2[13],ymm11[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm17, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $72, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $40, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i8_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: subq $40, %rsp +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm14, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 160(%rdi), %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm31, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,6,1,2,4,6] -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,6,1,2,4,6] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 288(%rdi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm28, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm27, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm15, %xmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 352(%rdi), %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7,8,9],ymm7[10],ymm15[11,12,13],ymm7[14],ymm15[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm29, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm10[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm15, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 416(%rdi), %ymm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, 
%ymm20, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm7, %ymm22, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm8, %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm25, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4,5],ymm9[6],ymm5[7,8,9],ymm9[10],ymm5[11,12,13],ymm9[14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] -; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm11, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7,8,9],ymm5[10],ymm7[11,12,13],ymm5[14],ymm7[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm31, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,4,6,1,3,4,6] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm27, %ymm18, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm18, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm29, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7,8,9,10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm7, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm8, %ymm22, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm11, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8,9,10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm1, %ymm2, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm28, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,5,6,1,3,5,6] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm8, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm27, %ymm18, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm5, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm29, %ymm16, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3],ymm7[4,5,6],ymm10[7,8],ymm7[9,10],ymm10[11],ymm7[12,13,14],ymm10[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm5, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm7, %ymm22, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm29, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: 
vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7,8,9,10],ymm9[11],ymm5[12,13],ymm9[14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,5,6,1,3,5,6] -; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm7, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm24, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm18, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2],ymm7[3],ymm5[4,5,6],ymm7[7,8],ymm5[9,10],ymm7[11],ymm5[12,13,14],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 -; 
AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm29, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm11, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4,5,6],ymm4[7,8],ymm5[9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm2, %ymm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm28, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 208(%rdi), %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; 
AVX512F-ONLY-FAST-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm4, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm0[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm10, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm27, %ymm18, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm29, %ymm16, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3],ymm10[4],ymm9[5,6],ymm10[7,8],ymm9[9,10,11],ymm10[12],ymm9[13,14],ymm10[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm8, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm8, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm18, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, 
%xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm9, %ymm22, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vporq %xmm8, %xmm2, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm27, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm29, %ymm16, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm8, %ymm22, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm29, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm28, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm5 -; AVX512F-ONLY-FAST-NEXT: 
vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6,7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm28, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6,7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm2, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, 
%ymm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm8, %ymm22, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm27, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm29, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm8, %ymm22, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm28, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm18, %ymm3, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm14, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm14, %ymm3 +; AVX512F-ONLY-FAST-NEXT: 
vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[5,12],zero,zero,xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3],ymm11[4],ymm9[5,6],ymm11[7,8],ymm9[9,10,11],ymm11[12],ymm9[13,14],ymm11[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm0, %ymm30, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm4, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6,7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm18, %ymm11, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm12, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm12, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3],ymm10[4],ymm8[5,6],ymm10[7,8],ymm8[9,10,11],ymm10[12],ymm8[13,14],ymm10[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm1, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm3, %ymm25, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6,7,8],ymm10[9],ymm14[10,11],ymm10[12],ymm14[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm0, %ymm10, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7,8],ymm13[9],ymm11[10,11,12],ymm13[13],ymm11[14,15] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7,8],ymm9[9],ymm3[10,11,12],ymm9[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm11, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm16, %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm10, %ymm2, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm28, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm16, %ymm15, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm31, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm8, %ymm4, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm12, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm7, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm28, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm27, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm26, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm29[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; 
AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, (%rdx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $40, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $104, %rsp -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: 
vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: subq $40, %rsp +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm16, %ymm2 +; AVX512DQ-SLOW-NEXT: 
vmovdqa 80(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512DQ-SLOW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm19, %zmm10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm17, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm29, %ymm5 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u] ; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm13 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm13 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm13[2,3,0,1] -; 
AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4,5],ymm10[6],ymm13[7,8,9],ymm10[10],ymm13[11,12,13],ymm10[14],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm9, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm12 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm12[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7,8,9],ymm2[10],ymm12[11,12,13],ymm2[14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm9, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512DQ-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm21 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm25, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm1[2],ymm4[3,4,5],ymm1[6],ymm4[7,8,9],ymm1[10],ymm4[11,12,13],ymm1[14],ymm4[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $244, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm12 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm12[4,11],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm23, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm13, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4,5],ymm7[6],ymm3[7,8,9],ymm7[10],ymm3[11,12,13],ymm7[14],ymm3[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm20, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm20, %ymm3 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm10 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7,8,9,10],ymm13[11],ymm10[12,13],ymm13[14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm10 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8,9,10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm4, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm25, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, 
%ymm4, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm23, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm1, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm13, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm20, %ymm3 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm12 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5,6],ymm10[7,8],ymm12[9,10],ymm10[11],ymm12[12,13,14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; 
AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm23, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm14, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm13, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5,6],ymm7[7,8],ymm3[9,10],ymm7[11],ymm3[12,13,14],ymm7[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[2,9,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm12[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm20, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm13 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm13[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm13[1,2],ymm6[3],ymm13[4,5,6],ymm6[7,8],ymm13[9,10],ymm6[11],ymm13[12,13,14],ymm6[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} 
ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm4, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm6, %ymm25, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2],ymm11[3],ymm4[4,5,6],ymm11[7,8],ymm4[9,10],ymm11[11],ymm4[12,13,14],ymm11[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm23, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm13, %xmm6 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm12[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm13, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm20, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm4, %xmm4 -; 
AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm6 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm6 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm6[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1,2,3],ymm10[4],ymm6[5,6],ymm10[7,8],ymm6[9,10,11],ymm10[12],ymm6[13,14],ymm10[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm4, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm6, %ymm25, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm23 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm25, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; 
AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm25, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %xmm4, %xmm0, %xmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm13 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm20, %ymm15, %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm6 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm15 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3],ymm11[4],ymm10[5,6],ymm11[7,8],ymm10[9,10,11],ymm11[12],ymm10[13,14],ymm11[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm1, %ymm2, %ymm14 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm19, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = 
zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5,6,7,8],ymm11[9],ymm6[10,11],ymm11[12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm1, %ymm12, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7,8],ymm11[9],ymm15[10,11,12],ymm11[13],ymm15[14,15] -; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm6, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm19, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm20, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm10 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3],ymm12[4],ymm10[5,6],ymm12[7,8],ymm10[9,10,11],ymm12[12],ymm10[13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm3 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, 
%ymm10, %ymm23, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm29, %ymm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm3[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3],ymm10[4],ymm3[5,6,7,8],ymm10[9],ymm3[10,11],ymm10[12],ymm3[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm25, %ymm2, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm25 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm23, %ymm25 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm15, %ymm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm29, %ymm3 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[2,9,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm10 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; 
AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm27 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm23, %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm12 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm15, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm15, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm13, %ymm12 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm20, %ymm4, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm15 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm16, %ymm15 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm16, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm12[1,2,3],ymm7[4],ymm12[5,6],ymm7[7,8],ymm12[9,10,11],ymm7[12],ymm12[13,14],ymm7[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm12 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm19, %ymm3 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7,8],ymm7[9],ymm15[10,11],ymm7[12],ymm15[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7,8],ymm7[9],ymm4[10,11,12],ymm7[13],ymm4[14,15] +; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm6 ; AVX512DQ-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm18, %ymm9, %ymm16 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm19, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm6, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm19, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm18, %ymm9, %ymm11 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] +; AVX512DQ-SLOW-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm19, %ymm8 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] +; AVX512DQ-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm19, %ymm7 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm7 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm23, %zmm0, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm17, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm16[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512DQ-SLOW-NEXT: 
vinserti32x8 $1, %ymm25, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm27, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm29, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm11[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7,8,9],ymm2[10],ymm11[11,12],ymm2[13],ymm11[14,15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm28, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm17, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm7 {%k1} ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-SLOW-NEXT: addq $104, %rsp +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-SLOW-NEXT: addq $40, %rsp ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i8_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: pushq %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm1 +; AVX512DQ-FAST-NEXT: subq $40, %rsp +; AVX512DQ-FAST-NEXT: 
vmovdqa64 {{.*#+}} ymm31 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7,8,9],ymm10[10],ymm2[11,12],ymm10[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm14, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm9 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 160(%rdi), %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 160(%rdi), %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm28, %ymm30, %ymm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,6,1,2,4,6] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,6,1,2,4,6] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa 
192(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 288(%rdi), %ymm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm4 ; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm4, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpor %xmm4, %xmm15, %xmm15 ; AVX512DQ-FAST-NEXT: vmovdqa64 352(%rdi), %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm15 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7,8,9],ymm7[10],ymm15[11,12,13],ymm7[14],ymm15[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm17, %ymm10 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm10[2,3,0,1] +; 
AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm23, %ymm15, %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vmovdqa64 416(%rdi), %ymm19 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm8 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm7, %ymm23, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4,5],ymm10[6],ymm6[7,8,9],ymm10[10],ymm6[11,12,13],ymm10[14],ymm6[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] -; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm10 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm10 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm8, %ymm21, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm7 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm11, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7,8,9],ymm9[10],ymm7[11,12,13],ymm9[14],ymm7[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm28, %ymm30, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,4,6,1,3,4,6] +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} 
xmm10 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm7 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm7 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm24, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm6 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm7 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm7 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm6, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm7, %ymm23, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm28, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8,9,10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: 
vpternlogq $248, %ymm26, %ymm3, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,5,6,1,3,5,6] -; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm7, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm24, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5,6],ymm7[7,8],ymm6[9,10],ymm7[11],ymm6[12,13,14],ymm7[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm23, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: 
vpternlogq $202, %ymm16, %ymm17, %ymm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7,8,9,10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm23, %ymm7, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10] +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm8, %ymm21, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm7 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm11, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7,8,9,10],ymm9[11],ymm7[12,13],ymm9[14],ymm7[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm28, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,5,6,1,3,5,6] +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 +; 
AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm5 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm7 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm16, %ymm7 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3],ymm7[4,5,6],ymm10[7,8],ymm7[9,10],ymm10[11],ymm7[12,13,14],ymm10[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm23, %ymm5, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11] +; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm7, %ymm21, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm3[1,2],ymm10[3],ymm3[4,5,6],ymm10[7,8],ymm3[9,10],ymm10[11],ymm3[12,13,14],ymm10[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm11, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2],ymm9[3],ymm5[4,5,6],ymm9[7,8],ymm5[9,10],ymm9[11],ymm5[12,13,14],ymm9[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm10 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm2 -; 
AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm28, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa 208(%rdi), %xmm7 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12] -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512DQ-FAST-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm1, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm0[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm8, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, %xmm4 +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm16, %ymm9 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm10[0],ymm9[1,2,3],ymm10[4],ymm9[5,6],ymm10[7,8],ymm9[9,10,11],ymm10[12],ymm9[13,14],ymm10[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm23, %ymm8, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm8 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm9, %ymm21, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm16, %ymm8 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm8, %ymm23, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm24 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; 
AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm27 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm23, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7,8],ymm8[9],ymm6[10,11,12],ymm8[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm28 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm23, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %xmm6, %xmm3, %xmm23 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm9, 
%xmm13, %xmm9 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm18, %ymm11, %ymm29 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6,7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm23, %ymm2, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm8, %ymm21, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm8 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm9 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm17, %ymm9 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14] +; AVX512DQ-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm8, %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm9, %ymm21, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm28, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm9, %xmm8, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, 
%ymm31 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm11, %ymm10 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm18, %ymm3, %ymm29 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm12, %ymm13 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm12, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm14, %ymm13 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm14, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3],ymm11[4],ymm10[5,6],ymm11[7,8],ymm10[9,10,11],ymm11[12],ymm10[13,14],ymm11[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm10 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm1, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6,7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7,8],ymm9[9],ymm3[10,11,12],ymm9[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm4[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm28, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm16, %ymm15, %ymm17 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm28, %ymm30, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14] +; AVX512DQ-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm3, %ymm1, %ymm8 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6],ymm0[7,8],ymm8[9,10,11],ymm0[12],ymm8[13,14],ymm0[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm1, %ymm12 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm8, %ymm1, %ymm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6,7,8],ymm0[9],ymm13[10,11],ymm0[12],ymm13[13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7,8],ymm0[9],ymm11[10,11,12],ymm0[13],ymm11[14,15] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512DQ-FAST-NEXT: vpshufb 
{{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm16, %ymm15, %ymm4 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14] -; AVX512DQ-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm1, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15] -; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm3, %ymm1, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm4[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm5 ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm27, %zmm0, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm28, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm23, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm25, %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] ; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm29, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm22, %xmm1 +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm17[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm31, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-FAST-NEXT: popq %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-FAST-NEXT: addq $40, %rsp ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm26 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm26, %zmm0, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm26, %zmm0, %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm26, %zmm0, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm26, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k2 ; AVX512BW-ONLY-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm3, 
%xmm17 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm17 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm7, %ymm0 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm8, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm27[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm13, %xmm5 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 ; AVX512BW-ONLY-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, 
%ymm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm0 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm5 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512BW-ONLY-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm15 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[2,9] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm21, %xmm15, %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm0, %ymm21 ; AVX512BW-ONLY-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm15 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm23 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm15, %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm22, %ymm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm22, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm15, %ymm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm7, %ymm15 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm15, %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm24 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm23 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm27[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm15, %xmm23, %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm12, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm12 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm15, %xmm12, %xmm15 ; AVX512BW-ONLY-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm22, %xmm14, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm21, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k6 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm15 {%k6} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm7, %ymm11, %ymm12 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm23[u,u],zero,zero,zero,xmm23[5,12],zero,zero,xmm23[1,8,15,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm9, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm20 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = zero,zero,xmm27[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm12, %xmm20, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm2, %zmm23 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm15, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k6} = ymm22[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm7, %ymm11, %ymm15 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[2,9,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm20, %xmm15, %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 ; AVX512BW-ONLY-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k2 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm20, %ymm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = zero,zero,xmm27[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm15, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm15 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm21, %ymm19 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm2, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[2,9],zero,zero,zero,xmm19[5,12,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm12 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm12[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7,8,9,10],ymm3[11],ymm12[12,13],ymm3[14],ymm12[15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm2 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2],ymm12[3],ymm3[4,5,6],ymm12[7,8],ymm3[9,10],ymm12[11],ymm3[12,13,14],ymm12[15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3],ymm12[4],ymm3[5,6],ymm12[7,8],ymm3[9,10,11],ymm12[12],ymm3[13,14],ymm12[15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} ; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm15 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm2, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm2, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm2, %ymm20 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm2 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k7 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7,8],ymm12[9],ymm3[10,11,12],ymm12[13],ymm3[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm19 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm2 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw 
%ymm5, %ymm15, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm2 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm7, %ymm11, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm18, %ymm0 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm14, %ymm13 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm12 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm16, %ymm5 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm7, %ymm11, %ymm12 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm11, %ymm7 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-ONLY-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm10, %ymm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm11, %ymm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm26, %zmm8, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm26, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm26, %zmm11, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 {%k6} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm27[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm11 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k6} = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm27[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm12 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm9, %zmm12 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm26, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k6} = ymm8[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm27, %xmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm4, %zmm1 {%k6} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -11221,11 +11117,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdi) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper @@ -11233,45 +11129,45 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-ONLY-FAST-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm3, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm3, %zmm0, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm0, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm3, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm3, %zmm0, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-ONLY-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 
-; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm2, %ymm12, %ymm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k2 ; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm5, %xmm0, %xmm26 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512BW-ONLY-FAST-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k3 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm14, %ymm11, %ymm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k6 ; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] @@ -11291,13 +11187,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} +; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm4, %zmm26 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] @@ -11309,15 +11205,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512BW-ONLY-FAST-NEXT: movw $4644, %ax # imm = 0x1224 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k5 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, 
%ymm20 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 ; AVX512BW-ONLY-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm2, %ymm12, %ymm20 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] @@ -11326,7 +11222,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm14, %ymm11, %ymm20 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] @@ -11335,44 +11231,44 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] ; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm15, %xmm14, %xmm15 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm13, %zmm9 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm2, %ymm12, %ymm13 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vpshufb 
{{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm15, %xmm13, %xmm15 ; AVX512BW-ONLY-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 -; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k5 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k6 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k6} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm14, %ymm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm13, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm13, %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] ; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm14, %xmm19, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm19, %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm2, %ymm12, %ymm0 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k6} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm14, %ymm15 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 ; AVX512BW-ONLY-FAST-NEXT: vpshufb 
{{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] @@ -11398,8 +11294,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm26 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] @@ -11423,7 +11319,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] @@ -11431,15 +11327,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero @@ -11449,26 +11345,26 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} ; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm16, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm18 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm16, %xmm16 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,0,7,14],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm16, %ymm17 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm18 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm16, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm18, %ymm16 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 @@ -11476,92 +11372,93 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm0 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm21 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm2, %ymm0 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm2, %ymm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm14, %ymm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm2 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm14, %ymm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm11 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm14, %xmm22, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm12 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm21, %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm14, %xmm21, %xmm14 +; 
AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm22, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm13 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm14 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm11, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm11, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm11, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm19, %xmm11 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm11, %ymm21 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm19, %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm19, %ymm11 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm3, %zmm19, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm3, %zmm20, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm3, %zmm21, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm0, %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 
{%k6} = ymm3[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm12, %zmm11 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm12, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm12, %zmm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm12, %xmm18, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 {%k6} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm18 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm14, %zmm12 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, 
%xmm14, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k6} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm3 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm3, %zmm2 {%k6} +; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] @@ -11570,396 +11467,396 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-ONLY-FAST-NEXT: movl $4186112, %edi # imm = 0x3FE000 ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%rsi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rdi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rdi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm26 ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm26, %zmm0, %zmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm26, %zmm0, %zmm25 ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm26, %zmm0, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm26, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQBW-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm3 {%k1} -; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k2 ; AVX512DQBW-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm16 +; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm17 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm17 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512DQBW-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k6 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm7, %ymm0 {%k3} ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm8, %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm27 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm27[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm13, %xmm5 ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 ; AVX512DQBW-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k4 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm0 {%k4} ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] ; AVX512DQBW-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), 
%ymm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm5 {%k3} ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] ; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k7 ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm16 ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512DQBW-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k4 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512DQBW-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k5 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm15 {%k5} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm21 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[2,9] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero +; AVX512DQBW-SLOW-NEXT: vporq %xmm21, %xmm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm0, %ymm21 ; AVX512DQBW-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm15 {%k5} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm23 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm22, %ymm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 -; AVX512DQBW-SLOW-NEXT: vpshufb 
{{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm22, %xmm14 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k5} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm15, %ymm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm7, %ymm15 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm24 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm23 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm27[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm15, %xmm23, %xmm15 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm12 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm15, %xmm12, %xmm15 ; AVX512DQBW-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 -; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k5 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm22, 
%xmm14, %xmm14 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm21, %xmm14 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k6 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm15 {%k6} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm7, %ymm11, %ymm12 {%k2} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm23 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm23[u,u],zero,zero,zero,xmm23[5,12],zero,zero,xmm23[1,8,15,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm12, %xmm12 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm9, %xmm20 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm20 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = zero,zero,xmm27[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm12, %xmm20, %xmm12 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm2, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm15, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm3, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; 
AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k6} = ymm22[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm7, %ymm11, %ymm15 {%k5} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm20 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[2,9,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm20, %xmm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 ; AVX512DQBW-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k2 ; AVX512DQBW-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm20, %ymm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = zero,zero,xmm27[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm15, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm15 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm21, %ymm19 {%k2} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm2 {%k5} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm2, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[2,9],zero,zero,zero,xmm19[5,12,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: 
vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm12 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm12[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7,8,9,10],ymm3[11],ymm12[12,13],ymm3[14],ymm12[15] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm2 {%k3} ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] ; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2],ymm12[3],ymm3[4,5,6],ymm12[7,8],ymm3[9,10],ymm12[11],ymm3[12,13,14],ymm12[15] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm23 {%k1} -; 
AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm2 {%k4} ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] ; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3],ymm12[4],ymm3[5,6],ymm12[7,8],ymm3[9,10,11],ymm12[12],ymm3[13,14],ymm12[15] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm15 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm2 {%k1} ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] ; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm2, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm2 {%k4} ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm2 {%k5} ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] ; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm3 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k7 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7,8],ymm12[9],ymm3[10,11,12],ymm12[13],ymm3[14,15] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm2 {%k3} ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero ; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm2 {%k4} ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] ; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm22 {%k6} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm1, %ymm21 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm7, %ymm11, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm18, %ymm0 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm14, %ymm13 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX512DQBW-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm1, %ymm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm9, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm11 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm16, %ymm5 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm7, %ymm11, %ymm12 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm7 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQBW-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm10, %ymm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm11 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm11[u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] ; AVX512DQBW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm11 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] -; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm11, %xmm7 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] +; AVX512DQBW-SLOW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm26, %zmm8, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm26, %zmm9, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm11, %zmm11 -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm26, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 {%k6} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; 
AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm12 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm27[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k6} ; AVX512DQBW-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k6} = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm27[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm26, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k6} = ymm8[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm27, %xmm8 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm4, %zmm1 {%k6} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, (%rdx) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rdi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%rdi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq @@ -11967,28 +11864,28 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm24 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermw %zmm3, %zmm1, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] ; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermw %zmm3, 
%zmm1, %zmm25 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] ; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermw %zmm3, %zmm1, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermw %zmm3, %zmm1, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQBW-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm2, %ymm12, %ymm1 {%k1} ; AVX512DQBW-FAST-NEXT: kmovq %k1, %k2 ; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 @@ -11999,12 +11896,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm14 ; AVX512DQBW-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512DQBW-FAST-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} -; AVX512DQBW-FAST-NEXT: kmovq %k1, %k3 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm14, %ymm11, %ymm4 {%k1} +; AVX512DQBW-FAST-NEXT: kmovq %k1, %k6 ; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] @@ -12017,40 +11914,40 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm7 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm5 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm7 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4 ; AVX512DQBW-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512DQBW-FAST-NEXT: kmovq %rax, %k5 -; AVX512DQBW-FAST-NEXT: 
vmovdqu8 %zmm4, %zmm1 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 +; AVX512DQBW-FAST-NEXT: kmovq %rax, %k3 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm4, %zmm1 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm6 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX512DQBW-FAST-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k6 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6} +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k4 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm10 {%k4} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm10, %xmm21 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k7 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm8, %ymm21 {%k7} ; AVX512DQBW-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 ; AVX512DQBW-FAST-NEXT: movw $4644, %ax # imm = 0x1224 -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k4 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4} +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k5 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm8, %ymm20 {%k5} ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512DQBW-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 ; AVX512DQBW-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm2, %ymm12, %ymm20 {%k5} ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] @@ -12059,7 +11956,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm14, %ymm11, %ymm20 {%k4} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] @@ -12068,235 +11965,235 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] ; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermd 
%ymm19, %ymm20, %ymm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm15 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm13, %zmm9 {%k3} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm2, %ymm12, %ymm13 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm15, %xmm13, %xmm15 ; AVX512DQBW-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 -; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm20 +; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k6 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k6} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm14, %ymm13 {%k2} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm13, %xmm20 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm13, %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] ; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; 
AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm14, %xmm19, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm19, %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm2, %ymm12, %ymm0 {%k4} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k6} = ymm16[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm14, %ymm15 {%k5} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u],zero,zero,zero,xmm16[6,13],zero,zero,xmm16[2,9,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512DQBW-FAST-NEXT: vporq %xmm16, %xmm15, %xmm15 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQBW-FAST-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm16, %xmm23, %xmm16 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512DQBW-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 ; AVX512DQBW-FAST-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 -; 
AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm16, %ymm15 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,xmm5[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm16, %xmm23, %xmm16 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm16, %zmm15, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm16 {%k1} ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k3 ; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQBW-FAST-NEXT: kmovq %rax, %k2 ; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k5} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm15, %ymm0 {%k7} ; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm18 {%k1} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] -; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm18, %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm8, %ymm15 {%k1} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm15, %xmm15 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm15, %ymm0 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX512DQBW-FAST-NEXT: vpshufb 
{{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm15, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm8, %ymm15 {%k4} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm15, %ymm0 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm15, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm10, %ymm15 {%k5} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm17 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb 
{{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm15, %ymm0 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm16 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] ; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm15, %zmm17 ; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm18 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm15 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm18 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm16, %xmm16 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm15, %xmm15 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm17, %ymm15 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm17 {%k5} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm18 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm17, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm18, %ymm17 {%k1} ; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm10, %ymm0 {%k1} ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] ; 
AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero ; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm15 {%k3} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm10, %ymm0 {%k4} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] ; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm21 {%k4} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm2, %ymm0 {%k5} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm2, %ymm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm14, %ymm21 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm2 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm14, %ymm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm11 {%k5} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQBW-FAST-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-FAST-NEXT: vporq %xmm14, %xmm22, %xmm14 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQBW-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k2} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm13, %xmm0 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm12 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] +; 
AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm14, %xmm21, %xmm14 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm13, %ymm0 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm22, %xmm21 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm21, %ymm14 {%k2} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm11, %xmm21 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm13, %xmm11, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm19, %xmm11 +; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm11, %xmm11 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm11 -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm2 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm2, %xmm21, %xmm2 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm21, %xmm8, %xmm11 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm11 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] -; 
AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm11 {%k5} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm21, %xmm7, %xmm3 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] +; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm19, %xmm19 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm19, %ymm11 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm3, %zmm19, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm3, %zmm20, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512DQBW-FAST-NEXT: vpermw %zmm3, %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k6} = ymm3[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm21 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm5[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm12, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm12, %zmm0 {%k6} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm12 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm12, %xmm18, %xmm12 +; 
AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm12 {%k6} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm5[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm18 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm14, %zmm12 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k6} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm3, %zmm2 {%k6} ; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm10, %ymm8 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] ; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm3, %zmm3 ; AVX512DQBW-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] @@ -12305,26 +12202,26 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k1 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,0,7,14],zero,zero,xmm8[3,10],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] ; AVX512DQBW-FAST-NEXT: vpermi2w %zmm3, %zmm4, %zmm5 -; AVX512DQBW-FAST-NEXT: 
vinserti32x8 $1, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rsi) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rdi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, (%rdi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <448 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 78a2729caf5a4c..e8fff34084775d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -21,38 +21,38 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 +; 
SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movd %xmm0, %edi +; SSE-NEXT: movd %xmm1, %edi ; SSE-NEXT: movw %di, (%rsi) ; SSE-NEXT: movd %xmm5, %esi ; SSE-NEXT: movw %si, (%rdx) @@ -64,7 +64,7 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movw %cx, (%r9) ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: movw %cx, (%r11) -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %xmm0, %ecx ; SSE-NEXT: movw %cx, (%r10) ; SSE-NEXT: movd %xmm4, %ecx ; SSE-NEXT: movw %cx, (%rax) @@ -143,25 +143,25 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] ; SSE-NEXT: packuswb %xmm10, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm10, %xmm8 ; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] ; SSE-NEXT: movdqa %xmm5, %xmm4 @@ -171,20 +171,20 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,2,2,3] +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm12, %xmm12 ; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb 
%xmm6, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm13, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm13, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: pandn %xmm10, %xmm12 @@ -192,33 +192,33 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm10, %xmm10 ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm11, %xmm11 ; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm12, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 @@ -227,11 +227,11 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movd %xmm0, (%rsi) ; SSE-NEXT: movd %xmm3, (%rdx) -; SSE-NEXT: movd %xmm6, (%rcx) +; SSE-NEXT: movd %xmm8, (%rcx) ; SSE-NEXT: movd %xmm10, (%r8) -; SSE-NEXT: movd %xmm5, (%r9) -; SSE-NEXT: movd %xmm7, (%r11) -; SSE-NEXT: movd %xmm8, (%r10) +; SSE-NEXT: movd %xmm11, (%r9) +; SSE-NEXT: movd %xmm9, (%r11) +; SSE-NEXT: movd %xmm5, (%r10) ; SSE-NEXT: movd %xmm1, (%rax) ; SSE-NEXT: retq ; @@ -379,212 +379,215 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void 
@load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; 
SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,1,3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm14, %xmm9 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,1,1,3] +; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm14, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[3,3,3,3] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm11, %xmm11 ; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,4,6] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm6, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,1,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm13, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[3,3,3,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, 
%xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $236, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,3,2,3] ; SSE-NEXT: movq %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm15, (%r8) +; SSE-NEXT: movq %xmm9, (%rcx) +; SSE-NEXT: movq %xmm1, (%r8) ; SSE-NEXT: movq %xmm11, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm9, (%rax) +; SSE-NEXT: movq %xmm2, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm4, (%rax) +; SSE-NEXT: movq %xmm5, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq %xmm3, (%rax) -; SSE-NEXT: popq %rax +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride8_vf8: @@ -815,30 +818,30 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $328, %rsp # imm = 0x148 -; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: subq $312, %rsp # imm = 0x138 +; SSE-NEXT: movdqa (%rdi), %xmm8 ; SSE-NEXT: movdqa 16(%rdi), %xmm14 ; SSE-NEXT: movdqa 32(%rdi), %xmm13 -; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: movdqa 64(%rdi), %xmm15 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 ; SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 112(%rdi), %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 @@ -846,115 +849,114 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: packuswb %xmm1, %xmm3 ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; 
SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm10 ; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] @@ -964,158 +966,158 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: movdqa 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por %xmm15, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm10, %xmm10 ; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: por %xmm15, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm15, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm15, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: pshufd 
{{.*#+}} xmm2 = xmm13[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: packuswb %xmm2, %xmm9 +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; 
SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm14 -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: por %xmm9, %xmm14 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd $212, (%rsp), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm13, %xmm13 +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -1125,151 +1127,152 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por 
%xmm9, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm9, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,1,3] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por %xmm14, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,1,3] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm13, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: packuswb %xmm0, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,1,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: packuswb %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,1,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,1,3] -; SSE-NEXT: packuswb %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,1,3] +; SSE-NEXT: packuswb %xmm13, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: packuswb %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm9, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: 
pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm15, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,5,7,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm12 +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = 
xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm6 ; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,3,3] -; SSE-NEXT: pshufd $255, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movapd %xmm11, (%r9) +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movapd %xmm10, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm15, (%rax) +; SSE-NEXT: movapd %xmm13, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm7, (%rax) +; 
SSE-NEXT: movapd %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, (%rax) -; SSE-NEXT: addq $328, %rsp # imm = 0x148 +; SSE-NEXT: addq $312, %rsp # imm = 0x138 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride8_vf16: @@ -1886,535 +1889,531 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm6 -; SSE-NEXT: movdqa 144(%rdi), %xmm13 -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 176(%rdi), %xmm14 +; SSE-NEXT: subq $888, %rsp # imm = 0x378 +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm12 +; SSE-NEXT: movdqa 144(%rdi), %xmm11 +; SSE-NEXT: movdqa 160(%rdi), %xmm8 +; SSE-NEXT: movdqa 176(%rdi), %xmm7 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm7 -; SSE-NEXT: movdqa 224(%rdi), %xmm8 -; SSE-NEXT: movdqa 240(%rdi), %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa 208(%rdi), %xmm9 +; SSE-NEXT: movdqa 224(%rdi), %xmm10 +; SSE-NEXT: movdqa 240(%rdi), %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb 
%xmm3, %xmm3 -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm14 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pandn %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm14 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; 
SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: packuswb %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: movdqa %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,1,1] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 ; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE-NEXT: packuswb %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = 
xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,1,3] -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: packuswb %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,1,1] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm7, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm12, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: packuswb %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} 
xmm4 = xmm8[0,1,1,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] +; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: pand %xmm5, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; 
SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm10 ; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: pand %xmm5, %xmm15 ; SSE-NEXT: por %xmm10, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm15 ; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm10 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2423,59 +2422,58 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $212, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # 
xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2484,31 +2482,30 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -2518,11 +2515,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa 
%xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2530,7 +2527,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2538,7 +2535,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2548,7 +2545,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2556,7 +2553,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2567,15 +2564,15 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ 
-2585,11 +2582,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2597,236 +2594,236 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: packuswb %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: packuswb %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: packuswb %xmm1, %xmm15 +; SSE-NEXT: pand %xmm6, %xmm15 +; SSE-NEXT: por %xmm8, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm3, 
%xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm4, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: packuswb %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm5, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshuflw $116, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm11, %xmm15 +; SSE-NEXT: por %xmm14, %xmm15 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; 
SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshufhw $237, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm5, %xmm14 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm15[0],xmm14[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = 
xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,3,3] -; SSE-NEXT: 
movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movapd %xmm10, (%r9) -; SSE-NEXT: movapd %xmm6, 16(%r9) +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movapd %xmm7, (%r9) +; SSE-NEXT: movapd %xmm10, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, (%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) +; SSE-NEXT: movapd %xmm1, (%rax) +; SSE-NEXT: movapd %xmm4, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm12, (%rax) -; SSE-NEXT: movapd %xmm15, 16(%rax) +; SSE-NEXT: movapd %xmm14, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 16(%rax) -; SSE-NEXT: movapd %xmm5, (%rax) -; SSE-NEXT: addq $904, %rsp # imm = 0x388 +; SSE-NEXT: movapd %xmm8, (%rax) +; SSE-NEXT: addq $888, %rsp # imm = 0x378 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride8_vf32: @@ -3265,13 +3262,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-LABEL: load_i8_stride8_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3284,7 +3282,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 
; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -3294,8 +3293,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm11 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] @@ -3307,9 +3305,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 @@ -3324,12 +3322,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -3338,9 +3337,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} 
xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -3349,15 +3347,16 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -3366,9 +3365,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3378,9 +3378,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: 
vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3389,10 +3388,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3 @@ -3401,23 +3400,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -3440,37 +3438,36 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3479,10 +3476,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3497,19 +3494,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] @@ -3520,19 +3519,18 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; 
AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3546,44 +3544,45 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, 
%xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3597,43 +3596,42 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm5, 
%xmm14, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm15, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3650,38 +3648,38 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; 
AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3712,35 +3710,35 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FAST-LABEL: load_i8_stride8_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $248, %rsp +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb 
%xmm1, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm10 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm11 @@ -3749,253 +3747,252 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa %xmm14, %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-NEXT: vpshufb %xmm1, 
%xmm7, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm12 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FAST-NEXT: 
vmovdqa %xmm15, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; 
AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm14, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX2-FAST-NEXT: vpermd (%rsp), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 
32-byte Folded Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} 
ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm6 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: 
vmovaps %ymm2, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -4004,29 +4001,30 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: addq $248, %rsp +; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride8_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4039,7 +4037,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -4049,8 +4048,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm11 ; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm3, %xmm0, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] @@ -4062,9 +4060,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm12 @@ -4079,12 +4077,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -4093,9 +4092,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ 
-4104,15 +4102,16 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -4121,9 +4120,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -4133,9 +4133,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4144,10 +4143,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm3 @@ -4156,23 +4155,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -4195,37 +4193,36 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -4234,10 +4231,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4252,19 +4249,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] @@ 
-4275,19 +4274,18 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4301,44 +4299,45 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4352,43 +4351,42 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm2, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4405,38 +4403,38 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4468,94 +4466,92 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-LABEL: load_i8_stride8_vf32: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512F-SLOW-NEXT: vinserti128 
$1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpmovqb %ymm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-SLOW-NEXT: vpmovqb %ymm7, %xmm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] +; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, 
%xmm22 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm13 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm12[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: 
vmovdqa64 %xmm21, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4563,43 +4559,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm20 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm10 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4607,127 +4601,130 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm26 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, 
%xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm7 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm28 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm13 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm2 @@ -4735,83 +4732,86 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm16, %zmm3 -; AVX512F-SLOW-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, (%rcx) @@ -4820,9 +4820,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm14, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -5476,37 +5476,37 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] -; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm9 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm10 +; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm10 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm9 ; AVX512BW-FAST-NEXT: vpermd %ymm11, %ymm8, %ymm11 ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm11, %ymm12 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm8, %ymm10 -; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm10, %ymm13 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm8, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm12, %ymm13 ; AVX512BW-FAST-NEXT: vpermd %ymm15, %ymm8, %ymm15 ; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; 
AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm0, %zmm13 ; AVX512BW-FAST-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb %ymm16, %ymm9, %ymm12 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FAST-NEXT: vpshufb %ymm16, %ymm10, %ymm9 ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm11, %ymm13 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm10, %ymm13 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm12, %ymm13 ; AVX512BW-FAST-NEXT: vpshufb %ymm19, %ymm15, %ymm14 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13 @@ -5519,11 +5519,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpsrlq $40, %zmm0, %zmm14 ; AVX512BW-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm9, %ymm13 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm10, %ymm13 ; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm11, %ymm14 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm22, %ymm10, %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm22, %ymm12, %ymm14 ; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm15, %ymm6 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] @@ -5540,16 +5540,16 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm9, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm10, %ymm10 ; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm11, %ymm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm26, %ymm10, %ymm10 -; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm15, %ymm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm26, %ymm12, %ymm11 
+; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm15, %ymm12 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 @@ -5559,14 +5559,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpsrlq $56, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512BW-FAST-NEXT: vmovaps %ymm1, (%rsi) ; AVX512BW-FAST-NEXT: vmovdqa64 %ymm29, (%rdx) ; AVX512BW-FAST-NEXT: vmovdqa %ymm7, (%rcx) ; AVX512BW-FAST-NEXT: vmovdqa %ymm4, (%r8) ; AVX512BW-FAST-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FAST-NEXT: vmovdqa %ymm12, (%r11) +; AVX512BW-FAST-NEXT: vmovdqa %ymm9, (%r11) ; AVX512BW-FAST-NEXT: vmovdqa %ymm6, (%r10) ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper @@ -5597,77 +5597,75 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: subq $2040, %rsp # imm = 0x7F8 ; SSE-NEXT: movdqa 64(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm5 -; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: movdqa 160(%rdi), %xmm4 ; SSE-NEXT: movdqa 176(%rdi), %xmm13 -; SSE-NEXT: movdqa 192(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm14 ; SSE-NEXT: movdqa 224(%rdi), %xmm9 -; SSE-NEXT: movdqa 240(%rdi), %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,0,0,255,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, 
%xmm2 ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm3 @@ -5676,34 +5674,34 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 496(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa 480(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 464(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa 448(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, 
%xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa 416(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa 384(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm3 @@ -5712,73 +5710,73 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa 352(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa 288(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa 256(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: packuswb %xmm0, %xmm15 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: packuswb %xmm0, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm2[0,3] +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: packuswb %xmm2, %xmm2 @@ -5787,63 +5785,63 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE-NEXT: packuswb %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm9 = 
[65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5855,50 +5853,50 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: packuswb %xmm8, %xmm8 +; 
SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -5906,33 +5904,33 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; 
SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: packuswb %xmm2, %xmm3 @@ -5941,9 +5939,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5955,45 +5953,46 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = 
xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6005,33 +6004,33 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: packuswb %xmm2, %xmm3 @@ -6040,9 +6039,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6056,171 +6055,171 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm8, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; 
SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,1,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE-NEXT: packuswb %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd 
%xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: packuswb %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[1,1,1,1] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm15, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm14, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm15, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm14, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm14, %xmm14 ; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm14, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm7, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: por %xmm7, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6229,58 +6228,58 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: 
movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6288,69 +6287,69 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm14, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb 
%xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6358,9 +6357,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: por %xmm14, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6369,95 +6368,95 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: movdqa %xmm13, 
%xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = 
mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm7 @@ -6468,34 +6467,34 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; 
SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm7 @@ -6506,34 +6505,34 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm7 @@ -6544,19 +6543,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; 
SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 @@ -6564,17 +6563,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,3,3] +; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 @@ -6582,18 +6582,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] @@ -6616,14 +6616,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded 
Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] @@ -6643,18 +6643,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] @@ -6677,14 +6677,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] @@ -6693,9 +6693,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 @@ -6708,14 +6708,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $212, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] @@ -6724,7 +6724,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $212, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] @@ -6738,25 +6738,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 @@ -6770,14 +6770,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] @@ -6800,14 +6800,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} 
xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] @@ -6836,11 +6836,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6866,7 +6866,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6874,7 +6874,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -6904,11 +6904,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6918,11 +6918,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm13, 
%xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -6934,7 +6933,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6942,19 +6941,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: packuswb %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm2, %xmm8 ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -6971,11 +6970,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7001,7 +7000,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7009,20 +7008,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -7039,11 +7038,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7069,14 +7068,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -7085,24 +7085,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: packuswb %xmm2, %xmm5 ; SSE-NEXT: movdqa %xmm9, %xmm2 -; 
SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] @@ -7119,33 +7119,34 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] @@ -7162,12 +7163,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: 
movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] @@ -7179,18 +7180,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] @@ -7207,12 +7208,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] @@ -7224,18 +7225,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = 
mem[0,1,2,3,4,5,5,7] @@ -7252,93 +7253,55 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 
+; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm8, %xmm2 @@ -7347,12 +7310,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7363,13 +7326,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm8 +; SSE-NEXT: pand %xmm11, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,3,3] +; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7382,39 +7347,74 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: 
pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,2,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: por %xmm5, %xmm12 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: 
movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm12[0],xmm5[1] +; SSE-NEXT: movaps %xmm15, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -7454,7 +7454,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm15, 32(%rax) +; SSE-NEXT: movapd %xmm14, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -7462,15 +7462,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm5, 48(%rax) -; SSE-NEXT: movapd %xmm7, 32(%rax) -; SSE-NEXT: movapd %xmm11, 16(%rax) -; SSE-NEXT: movapd %xmm14, (%rax) +; SSE-NEXT: movapd %xmm3, 48(%rax) +; SSE-NEXT: movapd %xmm6, 32(%rax) +; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: movapd %xmm7, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm0, 48(%rax) -; SSE-NEXT: movapd %xmm1, 32(%rax) -; SSE-NEXT: movapd %xmm2, 16(%rax) -; SSE-NEXT: movapd %xmm3, (%rax) +; SSE-NEXT: movapd %xmm5, 48(%rax) +; SSE-NEXT: movapd %xmm0, 32(%rax) +; SSE-NEXT: movapd %xmm1, 16(%rax) +; SSE-NEXT: movapd %xmm2, (%rax) ; SSE-NEXT: addq $2040, %rsp # imm = 0x7F8 ; SSE-NEXT: retq ; @@ -7481,66 +7481,66 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: 
vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm7 
+; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7548,65 +7548,66 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; 
AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; 
AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -7616,43 +7617,43 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 -; 
AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] @@ -7678,19 +7679,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -7737,21 +7737,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7759,10 +7760,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 @@ -7782,12 +7783,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -7837,8 +7837,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 @@ -7848,10 +7847,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, 
%ymm0, %ymm5 @@ -7859,19 +7860,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 @@ -7882,10 +7883,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -7938,21 +7938,21 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm6 +; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7965,26 +7965,28 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte 
Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -8036,8 +8038,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] @@ -8047,7 +8048,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm15 @@ -8069,18 +8070,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: 
vpshufb %xmm0, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload @@ -8106,12 +8108,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -8146,7 +8148,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm15 @@ -8173,19 +8175,17 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] @@ -8204,10 +8204,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] @@ -8242,7 +8244,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm13 @@ -8277,10 +8279,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 @@ -8335,74 +8336,73 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-SLOW-LABEL: load_i8_stride8_vf64: ; AVX2-SLOW: # 
%bb.0: -; AVX2-SLOW-NEXT: subq $840, %rsp # imm = 0x348 +; AVX2-SLOW-NEXT: subq $808, %rsp # imm = 0x328 ; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; 
AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm14 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm8 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -8410,117 +8410,117 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX2-SLOW-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8528,20 +8528,21 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 @@ -8551,23 +8552,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -8579,50 +8580,48 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
[2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8630,102 +8629,103 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; 
AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
[3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8733,100 +8733,104 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 
= ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; 
AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, 
%xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm15 ; AVX2-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8834,106 +8838,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: 
vpshufb %xmm0, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8941,100 +8940,102 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpshufb 
%xmm0, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd 
{{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -9043,100 +9044,98 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; 
AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, 
%xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -9144,51 +9143,51 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 
+; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) @@ -9216,64 +9215,63 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rax) -; AVX2-SLOW-NEXT: addq $840, %rsp # imm = 0x348 +; AVX2-SLOW-NEXT: addq $808, %rsp # imm = 0x328 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i8_stride8_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $904, %rsp # imm = 0x388 -; AVX2-FAST-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 368(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; 
AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 @@ -9282,85 +9280,85 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm13 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm8 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; 
AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm11 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, 
%xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -9370,26 +9368,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 @@ -9422,16 +9420,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] 
-; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 @@ -9443,7 +9441,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] @@ -9454,16 +9452,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte 
Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 @@ -9490,18 +9488,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -9513,7 +9511,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9522,43 +9520,43 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, 
%ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9790,11 +9788,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm10 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -9811,17 +9809,17 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = 
[3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -9830,11 +9828,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 @@ -9904,74 +9902,73 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride8_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $840, %rsp # imm = 0x348 +; AVX2-FAST-PERLANE-NEXT: subq $808, %rsp # imm = 0x328 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -9979,117 +9976,117 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm5 +; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10097,20 +10094,21 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm2, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 @@ -10120,23 +10118,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -10148,50 +10146,48 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: 
vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10199,102 +10195,103 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10302,100 +10299,104 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm3, %xmm13, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10403,106 +10404,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm5 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10510,100 +10506,102 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
[6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 ; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10612,100 +10610,98 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
(%rsp), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -10713,51 +10709,51 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm13 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) @@ -10785,36 +10781,38 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $840, %rsp # imm = 0x348 +; AVX2-FAST-PERLANE-NEXT: addq $808, %rsp # imm = 0x328 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512F-SLOW-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm30 ; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: 
vmovdqa %xmm5, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -10828,20 +10826,21 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm16 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm19 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm17, %xmm4 +; AVX512F-SLOW-NEXT: vpmovqb %zmm30, %xmm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: movb $-64, %al @@ -10849,18 +10848,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm13 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512F-SLOW-NEXT: vpmovqb %zmm5, %xmm5 @@ -10873,151 +10869,151 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm17 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512F-SLOW-NEXT: vpmovqb %zmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm28 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb 
%xmm3, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm18 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm17, %zmm6 +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm30, %zmm6 ; AVX512F-SLOW-NEXT: vpmovqb %zmm6, %xmm6 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm19 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm26 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, 
%xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm31 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm27 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 +; 
AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm28, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm21 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm24 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 @@ -11027,36 +11023,37 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
ptr ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm30, %zmm9 ; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -11068,86 +11065,87 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 
-; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm29 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm28, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm28 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; 
AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm19, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, %zmm18 ; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; 
AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm9 @@ -11160,136 +11158,138 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm28, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: 
vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm18, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm28 ; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512F-SLOW-NEXT: 
vpshufb %xmm1, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm19, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} 
xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm29 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm26 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm27 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm21 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 @@ -11297,236 +11297,231 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm31 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm17, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm18, %zmm9 ; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm18, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm19, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 -; 
AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm18, %zmm9 ; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm17 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: 
vmovdqa64 %xmm12, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm18 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm30 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm26, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm19, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm23 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm11 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm11 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm23, %zmm9 +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm28, %zmm9 ; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm7 +; AVX512F-SLOW-NEXT: 
vpshufb %xmm1, %xmm7, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm11 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm26, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm19, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -11545,59 +11540,61 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i8_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $440, %rsp # imm = 0x1B8 +; AVX512F-FAST-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: 
vmovdqa 480(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm10 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm3 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm14 ; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm8 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] ; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 @@ -11611,12 +11608,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm6 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 @@ -11625,86 +11623,89 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm19 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm17 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm29 -; AVX512F-FAST-NEXT: vpmovqb %zmm29, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = 
[1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm26 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm16 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm23 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX512F-FAST-NEXT: vpsrlq $8, %zmm28, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm11, 
%ymm8, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm15 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm19 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512F-FAST-NEXT: vpsrlq $8, %zmm29, %zmm3 @@ -11714,63 +11715,67 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 
%ymm2, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm31 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm17 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX512F-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm27 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm28, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 ; AVX512F-FAST-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm22 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm16 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm29, %zmm3 @@ -11780,60 +11785,64 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm7 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm24 +; AVX512F-FAST-NEXT: vpshufb 
%xmm1, %xmm10, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm11 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpsrlq $24, %zmm28, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512F-FAST-NEXT: vpsrlq $24, %zmm18, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm25 ; AVX512F-FAST-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm19 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm18 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm16 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512F-FAST-NEXT: vpsrlq $24, %zmm29, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -11841,259 +11850,257 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm20 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm22 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm16 ; 
AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm23 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm10 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm24, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm25, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 ; AVX512F-FAST-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm13 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 {%k1} -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-FAST-NEXT: vpermd (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 +; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm11 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm8 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm29, %zmm12 -; AVX512F-FAST-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm30, %zmm11 +; AVX512F-FAST-NEXT: vpmovqb %zmm11, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm11 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, 
%ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm11 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm12 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm13 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX512F-FAST-NEXT: vpsrlq $40, %zmm25, %zmm14 ; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm15 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = 
ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm20 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm21 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm25 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm13 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512F-FAST-NEXT: vpsrlq $40, %zmm29, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512F-FAST-NEXT: vpsrlq $40, %zmm30, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 ; AVX512F-FAST-NEXT: vpmovqb %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm29 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm22 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm16 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm18 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm12 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm12 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm23 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm30 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm25, %zmm14 +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm24, %zmm14 ; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm15 ; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm11 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm13 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm18, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm20, %zmm13 ; AVX512F-FAST-NEXT: vpmovqb %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm21 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm18 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm5 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm12 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] -; AVX512F-FAST-NEXT: vpsrlq $56, %zmm25, %zmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX512F-FAST-NEXT: vpsrlq $56, %zmm24, %zmm11 ; AVX512F-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; 
AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpsrlq $56, %zmm18, %zmm3 -; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512F-FAST-NEXT: vpsrlq $56, %zmm20, %zmm2 +; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -12105,679 +12112,675 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm1, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-FAST-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX512F-FAST-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: subq $744, %rsp # imm = 0x2E8 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-SLOW-NEXT: subq $728, %rsp # imm = 0x2D8 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm16 ; AVX512BW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm4 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm5 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, %xmm8 ; AVX512BW-SLOW-NEXT: vmovdqa 480(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, 
%xmm25 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm6 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm7, %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm27 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm4, %xmm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm27 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm5, %xmm5 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm26 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm26, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vmovdqa 336(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm15 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} 
xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512BW-SLOW-NEXT: vpmovqb %zmm1, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm31 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm31, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX512BW-SLOW-NEXT: vpmovqb %zmm16, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm20 ; AVX512BW-SLOW-NEXT: movb $-64, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm28 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm28, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm20 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm29 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 208(%rdi), %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm17, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7] +; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm1, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm1, %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm17[0],xmm13[0],xmm17[1],xmm13[1],xmm17[2],xmm13[2],xmm17[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm10, %xmm10 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-SLOW-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm13, %xmm13 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm16 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm17 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3] +; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm21 -; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 400(%rdi), %xmm21 +; AVX512BW-SLOW-NEXT: vmovdqa 416(%rdi), %xmm6 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 416(%rdi), %xmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm29 +; AVX512BW-SLOW-NEXT: vmovdqa 432(%rdi), %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm19 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 -; AVX512BW-SLOW-NEXT: vpshufb 
%xmm4, %xmm25, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm30, %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm30, %xmm25 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm25, %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm27, %xmm25 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm20, %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm23 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm9, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm6, %xmm25 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm3, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm25 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm21, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm0, %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm31, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm27, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm22, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm28, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm31, %xmm24 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3] ; 
AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm15, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm28, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm29, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm17, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm29 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm22, %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 176(%rdi), %xmm24 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm0, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm6, %xmm25 +; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm24, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm7, %xmm25 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm6, %xmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 144(%rdi), %xmm23 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm23, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm27 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm25, %xmm30 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, %xmm12 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: 
vpshufb %xmm13, %xmm15, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm28, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm17, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm18, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm5, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm19, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm20 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm19, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm23 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm29, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm17, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm30[0],xmm15[0],xmm30[1],xmm15[1],xmm30[2],xmm15[2],xmm30[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, %xmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm21, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm20, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm28[0],xmm0[0],xmm28[1],xmm0[1],xmm28[2],xmm0[2],xmm28[3],xmm0[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm27, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm22, %zmm13 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm19 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm15 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm21, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm13 -; 
AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm21, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm16, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm31, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm23, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm27, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm25, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm28, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm10, %xmm27 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm18, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm5, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm22, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm25, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, (%rsp) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm17, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm28 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm28[0],xmm4[0],xmm28[1],xmm4[1],xmm28[2],xmm4[2],xmm28[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm28 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm8, %xmm30 +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm20, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm30[0],xmm15[1],xmm30[1],xmm15[2],xmm30[2],xmm15[3],xmm30[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 
= ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm6, %zmm13 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm19, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm31, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm30[0],xmm15[0],xmm30[1],xmm15[1],xmm30[2],xmm15[2],xmm30[3],xmm15[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3] +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm6, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm21, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm21 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm22 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm21, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm30[0],xmm15[0],xmm30[1],xmm15[1],xmm30[2],xmm15[2],xmm30[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm25 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm15[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm31, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm23, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm31, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm25, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm14, %xmm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm17, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm18, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm28, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm21, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm28 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm22, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm28 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm10, %xmm30 +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm20, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm27 -; 
AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm10, %zmm13 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm19, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm20 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm30[0],xmm15[0],xmm30[1],xmm15[1],xmm30[2],xmm15[2],xmm30[3],xmm15[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm23, %zmm15 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm23 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm30[0],xmm15[0],xmm30[1],xmm15[1],xmm30[2],xmm15[2],xmm30[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm31, %xmm13 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm15 ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] +; 
AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, %xmm29 +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm31, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm25, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm22 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm17, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm18, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm3 +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm31, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: 
vmovdqa (%rsp), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm24, %xmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm10, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm28 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm26, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, %xmm10 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm28[0],xmm5[0],xmm28[1],xmm5[1],xmm28[2],xmm5[2],xmm28[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm13, %zmm5 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload +; 
AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, %xmm20 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm28[0],xmm5[0],xmm28[1],xmm5[1],xmm28[2],xmm5[2],xmm28[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, %xmm7 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm31, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm29, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm25, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm22, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm18, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm20, %zmm2 +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm21, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm22, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm31, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm19 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm21 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm19, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm17, %xmm28 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm30, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsp), %xmm30 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm30, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm28[0],xmm5[0],xmm28[1],xmm5[1],xmm28[2],xmm5[2],xmm28[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm13, %zmm5 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm17, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm26 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm20 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm28 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm28[0],xmm5[0],xmm28[1],xmm5[1],xmm28[2],xmm5[2],xmm28[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm31, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm20, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm25, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload +; 
AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm18, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm19, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm15, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm28, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm22, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm31, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm21, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm17, %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm11 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm24, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm30, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm10, %zmm6 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm30, %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm13, %zmm7 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm17, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm27, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm26, %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm20, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm23, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm31, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm20, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm25, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm23, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm28, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm18, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm19, %zmm3 +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm15, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -12799,14 +12802,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-SLOW-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX512BW-SLOW-NEXT: addq $728, %rsp # imm = 0x2D8 ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: load_i8_stride8_vf64: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: subq $328, %rsp # imm = 0x148 -; AVX512BW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 @@ -12824,39 +12827,39 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm19 -; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm19, %ymm2 +; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm17, %ymm2 ; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm9 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm29 -; AVX512BW-FAST-NEXT: vpermd %ymm29, %ymm0, %ymm14 +; AVX512BW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm28 +; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm0, %ymm14 ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm3 ; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm10 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FAST-NEXT: vmovdqa64 368(%rdi), %xmm21 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm21, %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 368(%rdi), %xmm20 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm20, %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 352(%rdi), %xmm19 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm19, %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FAST-NEXT: vmovdqa 336(%rdi), %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm28 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm28, %xmm6 +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm25 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-FAST-NEXT: vmovdqa64 336(%rdi), %xmm18 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm18, %xmm5 +; AVX512BW-FAST-NEXT: vmovdqa 320(%rdi), %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm12, %xmm6 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-FAST-NEXT: vpmovqb %zmm18, %xmm5 +; AVX512BW-FAST-NEXT: vpmovqb %zmm4, %xmm5 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm21 ; AVX512BW-FAST-NEXT: movb $-64, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm21 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm17 -; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm17, %ymm1 +; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm16 +; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm16, %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm3 @@ -12867,68 +12870,69 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm6 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm27 -; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm0, %ymm16 -; AVX512BW-FAST-NEXT: vpshufb %ymm10, %ymm16, %ymm8 +; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm0, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 
112(%rdi), %xmm26 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm26, %xmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 96(%rdi), %xmm24 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm24, %xmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 112(%rdi), %xmm29 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm29, %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 96(%rdi), %xmm23 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm23, %xmm7 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512BW-FAST-NEXT: vmovdqa64 80(%rdi), %xmm22 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm25 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm22, %xmm26 ; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm8, %xmm23 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm8, %xmm25 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm25[0],xmm26[0],xmm25[1],xmm26[1],xmm25[2],xmm26[2],xmm25[3],xmm26[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3] ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 ; AVX512BW-FAST-NEXT: vpmovqb %zmm10, %xmm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm30, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa %ymm9, %ymm11 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm31, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa %ymm9, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa %ymm9, %ymm5 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm19, %ymm13 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm17, %ymm13 +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm21, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm23 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm20, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm19, %xmm25 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = 
xmm25[0],xmm15[0],xmm25[1],xmm15[1],xmm25[2],xmm15[2],xmm25[3],xmm15[3] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm25 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm18, %xmm6 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm18, %xmm26 ; AVX512BW-FAST-NEXT: vmovdqa %xmm12, %xmm7 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm25 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm28, %xmm20 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm20[0],xmm25[0],xmm20[1],xmm25[1],xmm20[2],xmm25[2],xmm20[3],xmm25[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm12, %xmm21 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm21[0],xmm26[0],xmm21[1],xmm26[1],xmm21[2],xmm26[2],xmm21[3],xmm26[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] -; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm18, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm4, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm17, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm16, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm16, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm2, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm26, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm24, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm23, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm8, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm22, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm8, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] ; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm10, %zmm13 @@ -12937,45 +12941,45 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm30, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm9, %ymm21 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm31, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0 +; 
AVX512BW-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm31, %ymm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm5, %ymm24 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm19, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa %ymm6, %ymm9 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm12 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm17, %ymm11 +; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512BW-FAST-NEXT: vmovdqa %xmm4, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm7, %xmm23 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm28, %xmm20 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm20[0],xmm15[0],xmm20[1],xmm15[1],xmm20[2],xmm15[2],xmm20[3],xmm15[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm21 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm7, %xmm26 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm21[0],xmm15[0],xmm21[1],xmm15[1],xmm21[2],xmm15[2],xmm21[3],xmm15[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm18, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm4, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm17, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm3, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm16, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm3, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm9 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: 
vpshufb %xmm11, %xmm26, %xmm9 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm29, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm22, %xmm11 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm13 @@ -12987,44 +12991,43 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm30, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm31, %ymm9 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm31, %ymm9 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm19, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm17, %ymm9 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm20, %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm19, %xmm24 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm19, %xmm12 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm28, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm25, %xmm14 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm18, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm4, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512BW-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FAST-NEXT: 
vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm17, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm16, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm26, %xmm1 -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm24, %xmm2 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm29, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm23, %xmm2 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm22, %xmm2 ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm3 @@ -13037,33 +13040,32 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] -; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vpermd (%rsp), %ymm3, %ymm1 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm2 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm9 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpermd %ymm29, %ymm3, %ymm14 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm2 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm3, %ymm14 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm11 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; 
AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm19, %xmm16 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm20, %xmm17 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm20, %xmm16 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm23, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm28, %xmm29 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm25, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm7, %xmm28 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm18, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm17, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 @@ -13071,61 +13073,61 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm19 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm19, %ymm9 ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm18 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm18, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm18, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7] ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm3, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm21, %ymm13 +; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm3, %ymm20 +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm20, %ymm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm29, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm22, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm8, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm22, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm8, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb 
%zmm11, %xmm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm29 +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm9 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm9 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm11 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm12 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm11 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm16, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm17, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm23, %xmm20 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm28, %xmm27 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm25, %xmm21 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm26, %xmm27 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-FAST-NEXT: vpsrlq $40, %zmm25, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-FAST-NEXT: vpsrlq $40, %zmm17, %zmm12 ; AVX512BW-FAST-NEXT: vpmovqb %zmm12, %xmm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm19, %ymm9 -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm18, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm19, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm18, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] -; 
AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm21, %ymm4 +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm20, %ymm4 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm29, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm22, %xmm11 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm12 @@ -13135,25 +13137,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm20 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm4 +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm21 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm3 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm1, %ymm4 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm14, %ymm9 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm9 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm16, %xmm9 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm17, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm24, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm28, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm25, 
%xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $48, %zmm27, %zmm12 @@ -13161,15 +13163,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm19, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm18, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm19, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm18, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm21, %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm20, %ymm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm26, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm24, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm29, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm23, %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm22, %xmm12 ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 @@ -13180,22 +13182,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm0 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm5, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm2, %ymm1 ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm16, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm17, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm24, %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm23, %xmm9 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm28, %xmm11 
+; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm25, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm26, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] ; AVX512BW-FAST-NEXT: vpsrlq $56, %zmm27, %zmm9 @@ -13203,15 +13205,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm19, %ymm0 ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm18, %ymm9 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm15, %ymm9 -; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm21, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm20, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm26, %xmm5 -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm24, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm29, %xmm5 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm23, %xmm1 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm22, %xmm5 ; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 @@ -13230,9 +13232,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovaps %zmm1, (%rcx) ; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FAST-NEXT: vmovaps %zmm1, (%r8) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, (%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, (%r9) ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll index 5cd794b7f1c4dc..c56f28e5a22c0f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -338,58 +338,58 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movdqa 96(%rdi), %xmm6 ; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm7 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 96(%rsi), %xmm11 -; SSE-NEXT: movdqa 80(%rsi), %xmm12 -; SSE-NEXT: movdqa 64(%rsi), %xmm13 +; SSE-NEXT: movdqa 96(%rsi), %xmm10 +; SSE-NEXT: movdqa 80(%rsi), %xmm11 +; SSE-NEXT: movdqa 64(%rsi), %xmm12 ; SSE-NEXT: movdqa (%rsi), %xmm9 -; SSE-NEXT: movdqa 16(%rsi), %xmm10 +; SSE-NEXT: movdqa 16(%rsi), %xmm13 ; SSE-NEXT: movdqa 32(%rsi), %xmm14 ; SSE-NEXT: movdqa 48(%rsi), %xmm15 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE-NEXT: movdqa %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE-NEXT: movdqa 112(%rsi), %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; SSE-NEXT: movdqa 112(%rsi), %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: movdqa %xmm0, 224(%rdx) -; SSE-NEXT: movdqa 
%xmm7, 240(%rdx) +; SSE-NEXT: movdqa %xmm8, 240(%rdx) ; SSE-NEXT: movdqa %xmm6, 192(%rdx) -; SSE-NEXT: movdqa %xmm12, 208(%rdx) +; SSE-NEXT: movdqa %xmm11, 208(%rdx) ; SSE-NEXT: movdqa %xmm4, 160(%rdx) -; SSE-NEXT: movdqa %xmm13, 176(%rdx) +; SSE-NEXT: movdqa %xmm12, 176(%rdx) ; SSE-NEXT: movdqa %xmm3, 128(%rdx) ; SSE-NEXT: movdqa %xmm15, 144(%rdx) ; SSE-NEXT: movdqa %xmm5, 96(%rdx) ; SSE-NEXT: movdqa %xmm14, 112(%rdx) ; SSE-NEXT: movdqa %xmm2, 64(%rdx) -; SSE-NEXT: movdqa %xmm10, 80(%rdx) +; SSE-NEXT: movdqa %xmm13, 80(%rdx) ; SSE-NEXT: movdqa %xmm1, 32(%rdx) ; SSE-NEXT: movdqa %xmm9, 48(%rdx) -; SSE-NEXT: movdqa %xmm8, (%rdx) +; SSE-NEXT: movdqa %xmm7, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index f91dd72bfe3ff5..a47986e309bee7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -178,41 +178,41 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i16_stride3_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rsi), %xmm2 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: pandn %xmm6, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,3,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7] 
; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, 32(%rcx) ; SSE-NEXT: movdqa %xmm5, 16(%rcx) -; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride3_vf8: @@ -382,32 +382,32 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] +; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: movdqa 16(%rdx), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,0,0] +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm10, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7] @@ -415,11 +415,11 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, %xmm12 ; SSE-NEXT: pandn %xmm11, %xmm12 ; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,2,2] -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: por %xmm12, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,2,2] +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] @@ -428,27 
+428,27 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm0 ; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm10, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 ; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm10, 32(%rcx) ; SSE-NEXT: movdqa %xmm6, 80(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm3, 16(%rcx) -; SSE-NEXT: movdqa %xmm8, 48(%rcx) +; SSE-NEXT: movdqa %xmm4, 16(%rcx) +; SSE-NEXT: movdqa %xmm9, 48(%rcx) ; SSE-NEXT: movdqa %xmm5, 64(%rcx) ; SSE-NEXT: retq ; @@ -545,8 +545,8 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride3_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] @@ -559,7 +559,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 @@ -569,18 +569,18 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rcx) @@ -589,8 +589,8 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] @@ -603,7 +603,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm4 @@ -613,18 +613,18 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rcx) @@ -633,9 +633,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i16_stride3_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm3 ; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX512F-NEXT: vprold $16, %xmm3, %xmm5 @@ -646,28 +646,28 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11] ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u,12,13],zero,zero,ymm1[u,u,14,15],zero,zero,ymm1[u,u,16,17],zero,zero,ymm1[u,u,18,19],zero,zero,ymm1[u,u,20,21],zero,zero -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21] -; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[u,u,12,13],zero,zero,ymm0[u,u,14,15],zero,zero,ymm0[u,u,16,17],zero,zero,ymm0[u,u,18,19],zero,zero,ymm0[u,u,20,21],zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,u,u],zero,zero,ymm1[12,13,u,u],zero,zero,ymm1[14,15,u,u],zero,zero,ymm1[16,17,u,u],zero,zero,ymm1[18,19,u,u],zero,zero,ymm1[20,21] +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512F-NEXT: vprold $16, %xmm4, %xmm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm2[10,11],zero,zero,zero,zero,ymm2[12,13],zero,zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,ymm2[16,17],zero,zero,zero,zero,ymm2[18,19],zero,zero,zero,zero +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-NEXT: vprold $16, %xmm4, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] ; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <5,5,u,6,6,u,7,7> -; AVX512F-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512F-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa %ymm2, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -786,8 +786,8 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm5, %xmm15 ; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 @@ -796,61 +796,61 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pand %xmm5, %xmm3 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,2,2] +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,2,2] ; SSE-NEXT: pandn %xmm12, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,0,0] ; SSE-NEXT: pandn %xmm12, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: 
pandn %xmm0, %xmm11 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: movdqa %xmm12, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, 80(%rcx) +; SSE-NEXT: movdqa %xmm7, 80(%rcx) ; SSE-NEXT: movdqa %xmm0, 128(%rcx) -; SSE-NEXT: movdqa %xmm11, 176(%rcx) +; SSE-NEXT: movdqa %xmm3, 176(%rcx) ; SSE-NEXT: movdqa %xmm5, (%rcx) ; SSE-NEXT: movdqa %xmm2, 16(%rcx) ; SSE-NEXT: movdqa %xmm15, 48(%rcx) @@ -873,12 +873,12 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] @@ -887,7 +887,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] 
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] @@ -905,7 +905,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm11[1,2],xmm14[3],xmm11[4,5],xmm14[6],xmm11[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -913,15 +913,15 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 @@ -1073,44 +1073,44 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm8, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm5 ; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 160(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1150,44 +1150,44 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, 
%xmm10, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm9, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm8, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1308,333 +1308,335 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $328, %rsp # imm = 0x148 -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: subq $344, %rsp # imm = 0x158 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 ; SSE-NEXT: movdqa (%rdx), %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: movdqa 32(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa 
%xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm11[1,1,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa 48(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 -; SSE-NEXT: movdqa 64(%rsi), %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 
= xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa 80(%rsi), %xmm12 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa 80(%rsi), %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm6 -; SSE-NEXT: movdqa 96(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa 96(%rsi), %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] +; 
SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm11 ; SSE-NEXT: movdqa 112(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 112(%rdi), %xmm8 -; SSE-NEXT: movdqa 112(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: movdqa 112(%rdi), %xmm15 +; SSE-NEXT: movdqa 112(%rsi), %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: 
pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pshufd $250, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[2,1,1,0,4,5,6,7] -; 
SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm12, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, 368(%rcx) -; SSE-NEXT: movdqa %xmm3, 320(%rcx) -; SSE-NEXT: movdqa %xmm1, 272(%rcx) -; SSE-NEXT: movdqa %xmm14, 224(%rcx) +; SSE-NEXT: movdqa %xmm1, 320(%rcx) +; SSE-NEXT: movdqa %xmm6, 272(%rcx) +; SSE-NEXT: movdqa %xmm5, 224(%rcx) ; SSE-NEXT: movdqa %xmm2, 176(%rcx) -; SSE-NEXT: movdqa %xmm4, 128(%rcx) -; SSE-NEXT: movdqa %xmm5, 80(%rcx) -; SSE-NEXT: movdqa %xmm6, 32(%rcx) -; SSE-NEXT: movdqa %xmm11, 352(%rcx) -; SSE-NEXT: movdqa %xmm10, 336(%rcx) -; SSE-NEXT: movdqa %xmm13, 304(%rcx) -; SSE-NEXT: movdqa %xmm15, 288(%rcx) +; SSE-NEXT: movdqa %xmm3, 128(%rcx) +; SSE-NEXT: movdqa %xmm4, 80(%rcx) +; SSE-NEXT: movdqa %xmm7, 32(%rcx) +; SSE-NEXT: movdqa %xmm13, 352(%rcx) +; SSE-NEXT: movdqa %xmm9, 336(%rcx) +; SSE-NEXT: movdqa %xmm11, 304(%rcx) +; SSE-NEXT: movdqa %xmm12, 288(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1659,77 +1661,78 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 
16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: addq $328, %rsp # imm = 0x148 +; SSE-NEXT: addq $344, %rsp # imm = 0x158 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $280, %rsp # imm = 0x118 +; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; 
AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 @@ -1758,116 +1761,112 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm2 -; AVX1-ONLY-NEXT: 
vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3],xmm14[4],xmm2[5,6],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm9[1],xmm14[2,3],xmm9[4],xmm14[5,6],xmm9[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm14[2],xmm9[3,4],xmm14[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm10[2],xmm14[3,4],xmm10[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0],xmm12[1],xmm9[2,3],xmm12[4],xmm9[5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm10 
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2],xmm7[3,4],xmm15[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3],xmm13[4],xmm15[5,6],xmm13[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6],xmm11[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm13[2],xmm9[3,4],xmm13[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm15[2],xmm11[3,4],xmm15[5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3,4],xmm8[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, (%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 288(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 368(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 368(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 320(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 336(%rcx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 336(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 176(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 240(%rcx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1888,7 +1887,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 
256(%rcx) -; AVX1-ONLY-NEXT: addq $280, %rsp # imm = 0x118 +; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride3_vf64: @@ -2043,38 +2042,38 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride3_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm4 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm5 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 @@ -2083,9 +2082,9 @@ define void 
@store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] @@ -2093,62 +2092,61 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm15 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm4, %ymm8 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm4 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm13, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm4, %ymm12 ; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 -; 
AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm13 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm9 +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm11, %ymm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 @@ -2158,19 +2156,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: 
vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 320(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm13, 352(%rcx) @@ -2187,38 +2185,38 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm13, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: 
vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm8 @@ -2227,9 +2225,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] @@ -2237,62 +2235,61 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm10, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm13, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm4, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm13, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm10, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = 
xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm5, %ymm11, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 @@ -2302,19 +2299,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 320(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx) -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 352(%rcx) @@ -2334,24 +2331,24 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-NEXT: vmovdqa %ymm1, %ymm6 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-NEXT: vmovdqa64 16(%rsi), %xmm24 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512F-NEXT: vmovdqa64 16(%rsi), %xmm23 +; AVX512F-NEXT: vmovdqa64 32(%rsi), %xmm25 ; AVX512F-NEXT: vprold $16, %xmm5, %xmm8 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-NEXT: vmovdqa64 16(%rdi), %xmm24 +; AVX512F-NEXT: vmovdqa64 32(%rdi), %xmm27 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-NEXT: vpshufb %xmm4, %xmm9, %xmm9 ; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-NEXT: vmovdqa (%rdx), %ymm3 @@ -2359,8 +2356,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = -; AVX512F-NEXT: vpermd %ymm3, %ymm19, %ymm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = +; AVX512F-NEXT: vpermd %ymm3, %ymm18, %ymm3 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512F-NEXT: vpandnq %ymm3, %ymm16, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 @@ -2372,8 +2369,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm10 -; AVX512F-NEXT: vmovdqa64 %xmm0, %xmm26 +; AVX512F-NEXT: vpshufb %xmm4, %xmm10, %xmm10 ; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 ; AVX512F-NEXT: 
vinserti64x4 $1, %ymm10, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm12 @@ -2386,13 +2382,13 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] ; AVX512F-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm12[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> -; AVX512F-NEXT: vpermd 64(%rdx), %zmm20, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpternlogq $184, %zmm15, %zmm21, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> +; AVX512F-NEXT: vpermd 64(%rdx), %zmm19, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vpternlogq $184, %zmm15, %zmm20, %zmm10 ; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm15 -; AVX512F-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512F-NEXT: vpshufb %ymm7, %ymm15, %ymm15 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm1 +; AVX512F-NEXT: vpshufb %ymm6, %ymm15, %ymm15 ; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512F-NEXT: vpshufb %ymm2, %ymm12, %ymm12 ; AVX512F-NEXT: vpor %ymm15, %ymm12, %ymm12 @@ -2407,77 +2403,78 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm12[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-NEXT: vmovdqa 96(%rdx), %ymm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <5,5,u,6,6,u,7,7> -; AVX512F-NEXT: vpermd %ymm12, %ymm23, %ymm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpandnq %ymm15, %ymm22, %ymm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = <5,5,u,6,6,u,7,7> +; AVX512F-NEXT: vpermd %ymm12, %ymm22, %ymm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpandnq %ymm15, %ymm21, %ymm15 ; AVX512F-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX512F-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpternlogq $248, %zmm12, %zmm5, %zmm18 +; AVX512F-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpternlogq $248, %zmm15, %zmm5, %zmm12 ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512F-NEXT: vpshufb %ymm2, %ymm15, %ymm15 -; AVX512F-NEXT: vpor %ymm5, %ymm15, %ymm5 +; AVX512F-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512F-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm15 -; AVX512F-NEXT: vprold $16, %xmm15, %xmm0 +; AVX512F-NEXT: vmovdqa 64(%rsi), 
%xmm6 +; AVX512F-NEXT: vprold $16, %xmm6, %xmm0 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm13 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm15 -; AVX512F-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; AVX512F-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX512F-NEXT: vpermd %ymm14, %ymm19, %ymm7 -; AVX512F-NEXT: vpandnq %ymm7, %ymm16, %ymm7 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-NEXT: vpermd %ymm14, %ymm18, %ymm6 +; AVX512F-NEXT: vpandnq %ymm6, %ymm16, %ymm6 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512F-NEXT: vpternlogq $248, %zmm17, %zmm0, %zmm5 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX512F-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm13 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; AVX512F-NEXT: vpshufb %xmm11, %xmm14, %xmm14 -; AVX512F-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm13[2],xmm7[3,4],xmm13[5],xmm7[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm7, %ymm7 -; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-NEXT: vpermd %ymm8, %ymm23, %ymm7 -; AVX512F-NEXT: vpandnq %ymm7, %ymm22, %ymm7 -; AVX512F-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-NEXT: vpternlogq $248, %zmm12, %zmm0, %zmm7 -; AVX512F-NEXT: vprold $16, %xmm6, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm0[2],xmm8[3,4],xmm0[5],xmm8[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512F-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512F-NEXT: vpor %ymm0, %ymm6, %ymm0 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm7 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX512F-NEXT: vprold $16, %xmm7, %xmm7 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm13, %ymm6, %ymm6 +; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-NEXT: vpermd %ymm8, %ymm22, %ymm6 +; AVX512F-NEXT: vpandnq %ymm6, %ymm21, %ymm6 +; AVX512F-NEXT: vpshufb %ymm9, %ymm8, %ymm7 +; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-NEXT: vpternlogq $248, %zmm15, %zmm0, %zmm6 +; 
AVX512F-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-NEXT: vprold $16, %xmm25, %xmm0 +; AVX512F-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm27[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm4 ; AVX512F-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX512F-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512F-NEXT: vprold $16, %xmm24, %xmm4 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm25[1,1,2,2] +; AVX512F-NEXT: vprold $16, %xmm23, %xmm4 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[1,1,2,2] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpermd (%rdx), %zmm20, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm21, %zmm1 +; AVX512F-NEXT: vpermd (%rdx), %zmm19, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512F-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll index 74cd79ab551e2a..2b649c984afd56 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -340,159 +340,159 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa (%rdx), %xmm7 -; SSE-NEXT: movdqa 16(%rdx), %xmm4 -; SSE-NEXT: movdqa (%rcx), %xmm8 -; SSE-NEXT: movdqa 16(%rcx), %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm5 +; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa 16(%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movdqa %xmm1, 96(%r8) -; SSE-NEXT: movdqa %xmm6, 112(%r8) -; SSE-NEXT: movdqa %xmm8, 64(%r8) -; SSE-NEXT: movdqa %xmm10, 80(%r8) +; SSE-NEXT: movdqa %xmm3, 112(%r8) +; SSE-NEXT: movdqa %xmm6, 64(%r8) +; SSE-NEXT: movdqa %xmm8, 80(%r8) ; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm5, 48(%r8) -; SSE-NEXT: movdqa %xmm2, (%r8) -; SSE-NEXT: movdqa %xmm3, 16(%r8) +; SSE-NEXT: movdqa %xmm2, 48(%r8) +; SSE-NEXT: movdqa %xmm9, (%r8) +; SSE-NEXT: movdqa %xmm10, 16(%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride4_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; 
AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpmovzxdq 
{{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride4_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; 
AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride4_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 @@ -504,84 +504,84 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,1,1,2,2,3,3] ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2],ymm9[3],ymm4[4],ymm9[5],ymm4[6],ymm9[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4],ymm5[5],ymm2[6],ymm5[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r8) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride4_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -644,13 +644,13 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride4_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm0 +; SSE-NEXT: movdqa 32(%rsi), %xmm2 ; SSE-NEXT: movdqa (%rdx), %xmm7 ; SSE-NEXT: movdqa 16(%rdx), %xmm13 ; SSE-NEXT: movdqa 32(%rdx), %xmm10 @@ -659,72 +659,72 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 32(%rcx), %xmm12 ; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 
%xmm4, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] ; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] ; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; SSE-NEXT: movdqa 48(%rdx), %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; SSE-NEXT: movdqa 48(%rdx), %xmm13 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = 
xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] ; SSE-NEXT: movdqa 48(%rcx), %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] -; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa %xmm13, %xmm10 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; SSE-NEXT: movdqa 48(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; SSE-NEXT: movdqa %xmm2, 224(%r8) -; SSE-NEXT: movdqa %xmm1, 240(%r8) -; SSE-NEXT: movdqa %xmm3, 192(%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movdqa %xmm1, 224(%r8) +; SSE-NEXT: movdqa %xmm2, 240(%r8) +; SSE-NEXT: movdqa %xmm5, 192(%r8) ; SSE-NEXT: movdqa %xmm0, 208(%r8) -; SSE-NEXT: movdqa %xmm4, 160(%r8) -; SSE-NEXT: movdqa %xmm9, 176(%r8) -; SSE-NEXT: movdqa %xmm13, 128(%r8) -; SSE-NEXT: movdqa %xmm14, 144(%r8) +; SSE-NEXT: movdqa %xmm9, 160(%r8) +; SSE-NEXT: movdqa %xmm3, 176(%r8) +; SSE-NEXT: movdqa %xmm14, 128(%r8) +; SSE-NEXT: movdqa %xmm15, 144(%r8) ; SSE-NEXT: movdqa %xmm11, 96(%r8) ; SSE-NEXT: movdqa %xmm8, 112(%r8) ; SSE-NEXT: movdqa %xmm7, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movdqa %xmm4, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movdqa %xmm6, (%r8) @@ -739,13 +739,13 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX1-ONLY-NEXT: 
vmovdqa 32(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] @@ -753,7 +753,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2],ymm1[3],ymm11[4],ymm1[5],ymm11[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] @@ -804,7 +804,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 @@ -813,15 +813,15 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) @@ -829,7 +829,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -840,13 +840,13 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] @@ -854,7 +854,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2],ymm1[3],ymm11[4],ymm1[5],ymm11[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] @@ -905,7 +905,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm9, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm9, %ymm5 @@ -914,15 +914,15 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 160(%r8) @@ -930,39 +930,39 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 224(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 192(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride4_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,1,1,2,2,3,3] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4],ymm12[5],ymm2[6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4],ymm12[5],ymm1[6],ymm12[7] ; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 ; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm7[0],zero,xmm7[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] @@ -975,48 +975,48 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm12, %ymm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4],ymm11[5],ymm8[6],ymm11[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] ; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-FAST-NEXT: 
vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4],ymm10[5],ymm4[6],ymm10[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4],ymm10[5],ymm9[6],ymm10[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 160(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 128(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 64(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 128(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm8, 224(%r8) 
; AVX2-FAST-NEXT: vmovdqa %ymm7, 192(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1027,13 +1027,13 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] @@ -1041,7 +1041,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2],ymm1[3],ymm11[4],ymm1[5],ymm11[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] @@ -1092,7 +1092,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm9, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm9, %ymm5 @@ -1101,15 +1101,15 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%r8) @@ -1117,7 +1117,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 224(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 192(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1125,92 +1125,92 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 
%xmm2, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm4[0],zero,xmm4[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm14, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm11[0],zero,xmm11[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm10[0],zero,xmm10[1],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 
$1, %xmm10, %ymm13, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm5[0],zero,xmm5[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; 
AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm9, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] @@ -1218,89 +1218,89 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%r8) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride4_vf32: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 
(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm9, %ymm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm14 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm13, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 
-; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm11[0],zero,xmm11[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm15, %ymm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm13, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm12, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm10[0],zero,xmm10[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm15, %ymm10 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm10, %zmm8 +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm12, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm8[0],zero,xmm8[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm12, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-FAST-NEXT: 
vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm7, %zmm6 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm13, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm13, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm12, %zmm0 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 128(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 192(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 128(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 64(%r8) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1308,58 +1308,58 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; 
AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 ; AVX512DQ-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm13, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm7[0],zero,xmm7[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm11, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm11[0],zero,xmm11[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, 
%ymm13, %ymm11 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm11, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm11, %zmm9 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] @@ -1369,19 +1369,19 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm8, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm7, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; 
AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] @@ -1398,7 +1398,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 128(%r8) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r8) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq @@ -1453,141 +1453,141 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i16_stride4_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm12 -; SSE-NEXT: movdqa 48(%rdi), %xmm13 -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm15 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 ; SSE-NEXT: movdqa 32(%rsi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm6 -; SSE-NEXT: movdqa 16(%rdx), %xmm4 +; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa 16(%rdx), %xmm8 ; SSE-NEXT: movdqa 32(%rdx), %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa 32(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; SSE-NEXT: movdqa (%rcx), %xmm10 +; SSE-NEXT: movdqa 16(%rcx), %xmm11 +; SSE-NEXT: movdqa 32(%rcx), %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; 
SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa 48(%rcx), %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa 48(%rcx), %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa 48(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm0 ; SSE-NEXT: movdqa 64(%rcx), %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 ; SSE-NEXT: movdqa 64(%rsi), %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = 
xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa 80(%rdx), %xmm0 -; SSE-NEXT: movdqa 80(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa 80(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa 80(%rdx), %xmm1 +; SSE-NEXT: movdqa 80(%rcx), %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movdqa 96(%rdx), %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movdqa 96(%rdx), %xmm0 ; SSE-NEXT: movdqa 96(%rcx), %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa 96(%rdi), %xmm5 -; SSE-NEXT: movdqa 96(%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rsi), %xmm5 ; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = 
xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movdqa 112(%rdx), %xmm2 -; SSE-NEXT: movdqa 112(%rcx), %xmm7 +; SSE-NEXT: movdqa 112(%rcx), %xmm5 ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa 112(%rsi), %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1595,7 +1595,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] @@ -1604,27 +1604,27 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm4, 480(%r8) ; SSE-NEXT: movdqa %xmm1, 464(%r8) ; SSE-NEXT: movdqa %xmm3, 448(%r8) -; SSE-NEXT: movdqa %xmm5, 432(%r8) -; SSE-NEXT: movdqa %xmm10, 416(%r8) -; SSE-NEXT: movdqa %xmm9, 400(%r8) -; SSE-NEXT: movdqa %xmm12, 384(%r8) -; SSE-NEXT: movdqa %xmm11, 368(%r8) -; SSE-NEXT: movdqa %xmm15, 352(%r8) -; SSE-NEXT: movdqa %xmm8, 336(%r8) +; SSE-NEXT: movdqa %xmm9, 432(%r8) +; SSE-NEXT: movdqa %xmm11, 416(%r8) +; SSE-NEXT: movdqa %xmm12, 400(%r8) +; SSE-NEXT: movdqa %xmm14, 384(%r8) +; SSE-NEXT: movdqa %xmm10, 368(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%r8) +; SSE-NEXT: movdqa %xmm7, 336(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%r8) -; SSE-NEXT: movdqa %xmm13, 304(%r8) +; SSE-NEXT: movdqa %xmm8, 304(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%r8) -; SSE-NEXT: movdqa %xmm14, 272(%r8) +; SSE-NEXT: movdqa %xmm13, 272(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
SSE-NEXT: movaps %xmm0, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%r8) +; SSE-NEXT: movdqa %xmm15, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1659,7 +1659,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: subq $40, %rsp ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 @@ -1671,9 +1671,9 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -1693,8 +1693,8 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 @@ -1704,15 +1704,15 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm12, %ymm4 @@ -1726,25 +1726,25 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2],ymm4[3],ymm15[4],ymm4[5],ymm15[6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4],ymm7[5],ymm13[6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2],ymm6[3],ymm13[4],ymm6[5],ymm13[6],ymm6[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] @@ -1754,7 +1754,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = 
xmm10[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 @@ -1763,15 +1763,15 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm8[0],zero,xmm8[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] ; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] @@ -1844,10 +1844,10 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm12, 448(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 416(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%r8) @@ -1868,7 +1868,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: subq $40, %rsp ; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm8 @@ -1880,9 +1880,9 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = 
xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -1902,8 +1902,8 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 @@ -1913,15 +1913,15 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 80(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdx), %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm12, %ymm4 @@ -1935,25 +1935,25 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2],ymm4[3],ymm15[4],ymm4[5],ymm15[6],ymm4[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; 
AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm7[0],zero,xmm7[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm6[0],zero,xmm6[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4],ymm7[5],ymm13[6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2],ymm6[3],ymm13[4],ymm6[5],ymm13[6],ymm6[7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] @@ -1963,7 +1963,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 @@ -1972,15 +1972,15 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: 
vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm8[0],zero,xmm8[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] ; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm12 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] @@ -2053,10 +2053,10 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa %ymm12, 448(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 416(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 384(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 224(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 224(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, 192(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 128(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 128(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 352(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%r8) @@ -2077,7 +2077,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: subq $40, %rsp ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 @@ -2108,8 +2108,8 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm13[1],ymm3[2],ymm13[3],ymm3[4],ymm13[5],ymm3[6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2117,13 +2117,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm5[0],zero,xmm5[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5 @@ -2134,21 +2134,21 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm13[1],ymm5[2],ymm13[3],ymm5[4],ymm13[5],ymm5[6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm13 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm14, %ymm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4],ymm14[5],ymm8[6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4],ymm14[5],ymm7[6],ymm14[7] ; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm14 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero @@ -2157,20 +2157,20 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2],ymm0[3],ymm10[4],ymm0[5],ymm10[6],ymm0[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm11[1],ymm0[2],ymm11[3],ymm0[4],ymm11[5],ymm0[6],ymm11[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] @@ -2190,26 +2190,26 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4],ymm12[5],ymm0[6],ymm12[7] ; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm13[0],zero,xmm13[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vmovdqa 112(%rcx), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 112(%rdx), %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm3[1],ymm13[2],ymm3[3],ymm13[4],ymm3[5],ymm13[6],ymm3[7] -; 
AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm3[1],ymm14[2],ymm3[3],ymm14[4],ymm3[5],ymm14[6],ymm3[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 @@ -2218,23 +2218,23 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[2],ymm5[3],ymm14[4],ymm5[5],ymm14[6],ymm5[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 480(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 448(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 480(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 448(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 416(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm9, 384(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 224(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 224(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm11, 192(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm10, 160(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 128(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm6, 352(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%r8) @@ -2255,7 +2255,7 @@ define void @store_i16_stride4_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm8 @@ -2267,9 +2267,9 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -2289,8 +2289,8 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 @@ -2300,15 +2300,15 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rcx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm12, %ymm4 @@ -2322,25 +2322,25 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2],ymm4[3],ymm15[4],ymm4[5],ymm15[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm7[0],zero,xmm7[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4],ymm7[5],ymm13[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2],ymm6[3],ymm13[4],ymm6[5],ymm13[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] @@ -2350,7 +2350,7 @@ define void 
@store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 @@ -2359,15 +2359,15 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm8[0],zero,xmm8[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] @@ -2440,10 +2440,10 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 448(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 416(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 384(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 224(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 192(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 128(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 352(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%r8) @@ -2588,31 +2588,31 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: 
vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] @@ 
-2650,8 +2650,8 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 320(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 448(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%r8) @@ -2661,55 +2661,55 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-FAST-LABEL: store_i16_stride4_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %xmm19 -; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %xmm20 +; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %xmm21 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm17 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm7 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm8 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm9 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm10 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm5, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 
{{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm11 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm7, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm5, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm8 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm7, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm9 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] ; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm0[0],zero,xmm0[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm0, %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm8[0],zero,xmm8[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm9[0],zero,xmm9[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm9, %ymm15, %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm19 ; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm15 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm7, %zmm19 {%k1} ; AVX512F-FAST-NEXT: vmovdqa 112(%rsi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -2726,7 +2726,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm7, %zmm15 {%k1} ; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -2743,73 +2743,73 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm7, %zmm16 {%k1} ; AVX512F-FAST-NEXT: vmovdqa 80(%rsi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm18 = xmm2[0],zero,xmm2[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa 80(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 80(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa 80(%rcx), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 80(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm7, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm5, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm1[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm2 -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm5, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm9, %zmm2 +; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm7, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm7, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm9 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} 
xmm6 = xmm6[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 320(%r8) +; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm7, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 320(%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 256(%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 448(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 384(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 384(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 64(%r8) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -2822,46 +2822,46 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,1,1] -; 
AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 ; AVX512DQ-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm6, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm7, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm18 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm13 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm15 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] @@ -2886,116 +2886,116 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rcx), %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rdx), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm6[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm6, %ymm16, %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm16, %ymm7 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm19 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; 
AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm13[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm16, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm0, %zmm15 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm13[0],zero,xmm13[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm16, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm16 ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 
80(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdx), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm13[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm20, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm0, %zmm16 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm11, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm6, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = 
xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm10, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm6, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm7, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; 
AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 128(%r8) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 320(%r8) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 256(%r8) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 448(%r8) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index d821b370f78a69..730c5172b095a6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -166,52 +166,52 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,7] +; SSE-NEXT: pand %xmm7, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand 
%xmm4, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm5, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movq %xmm3, 32(%r9) -; SSE-NEXT: movdqa %xmm2, (%r9) -; SSE-NEXT: movdqa %xmm4, 16(%r9) +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movq %xmm4, 32(%r9) +; SSE-NEXT: movdqa %xmm3, (%r9) +; SSE-NEXT: movdqa %xmm5, 16(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf4: @@ -413,23 +413,23 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm5 ; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm1 +; SSE-NEXT: movdqa (%rcx), %xmm2 ; SSE-NEXT: movdqa (%r8), %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,0] +; 
SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 ; SSE-NEXT: por %xmm9, %xmm11 ; SSE-NEXT: pand %xmm8, %xmm11 @@ -439,8 +439,8 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm4 ; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] @@ -462,16 +462,16 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] ; SSE-NEXT: pand %xmm7, %xmm12 ; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: pand %xmm3, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: por %xmm12, %xmm3 ; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] @@ -484,18 +484,18 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm7, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, 16(%r9) +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, 16(%r9) ; SSE-NEXT: movdqa %xmm5, 48(%r9) -; SSE-NEXT: movdqa %xmm1, 64(%r9) +; SSE-NEXT: movdqa %xmm3, 64(%r9) ; SSE-NEXT: movdqa %xmm0, 
(%r9) ; SSE-NEXT: movdqa %xmm4, 32(%r9) ; SSE-NEXT: retq @@ -556,12 +556,12 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-LABEL: store_i16_stride5_vf8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,2,4,6,7,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] @@ -585,9 +585,9 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpsrlq $48, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] @@ -684,13 +684,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,1,2,3,5,5,6,7] @@ -715,12 +715,12 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] ; AVX512F-SLOW-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 64(%r9) @@ -800,161 +800,162 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa 16(%rsi), %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm11 -; SSE-NEXT: movdqa 16(%r8), %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm0 +; SSE-NEXT: movdqa 16(%rdx), %xmm11 +; SSE-NEXT: movdqa (%rcx), %xmm5 +; SSE-NEXT: movdqa 16(%rcx), %xmm12 +; SSE-NEXT: movdqa 16(%r8), %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: pandn %xmm7, %xmm13 ; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm13 ; SSE-NEXT: por %xmm6, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; 
SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm6 ; SSE-NEXT: por %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm15, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: pand %xmm2, %xmm13 ; SSE-NEXT: por %xmm7, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm15 ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm10 ; SSE-NEXT: movdqa (%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm13 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm10[2,1,3,3] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm15, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: pand %xmm14, %xmm13 ; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 ; SSE-NEXT: pand %xmm15, %xmm13 -; SSE-NEXT: por %xmm13, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm11, %xmm12 ; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm6, 
%xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm12[1] +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] +; SSE-NEXT: pand %xmm14, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pand %xmm3, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] +; SSE-NEXT: pandn %xmm12, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: por %xmm14, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklwd 
{{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -963,25 +964,24 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] ; SSE-NEXT: pandn %xmm0, %xmm13 ; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,2,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%r9) +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: pand %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm2, (%r9) ; SSE-NEXT: movdqa %xmm13, 16(%r9) ; SSE-NEXT: movdqa %xmm15, 48(%r9) -; SSE-NEXT: movdqa %xmm9, 64(%r9) -; SSE-NEXT: movdqa %xmm7, 80(%r9) -; SSE-NEXT: movdqa %xmm12, 96(%r9) -; SSE-NEXT: movdqa %xmm14, 128(%r9) +; SSE-NEXT: movdqa %xmm3, 64(%r9) +; SSE-NEXT: movdqa %xmm8, 80(%r9) +; SSE-NEXT: movdqa %xmm6, 96(%r9) +; SSE-NEXT: movdqa %xmm10, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -994,7 +994,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -1003,32 +1003,32 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2,3,4],xmm7[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1,2,3,4],xmm8[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm4[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm14 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] @@ -1040,16 +1040,16 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm13[2],xmm8[3,4,5,6],xmm13[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm13[2],xmm7[3,4,5,6],xmm13[7] ; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm9 @@ -1061,57 +1061,57 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm11[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm12 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3,4,5],xmm13[6],xmm11[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = 
xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm3[2],xmm6[3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm11[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm12[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm9, 96(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 80(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 128(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) @@ -1364,11 +1364,11 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf16: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] @@ -1394,23 +1394,23 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 
= ymm4[0,1,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[3,0,3,0,7,4,7,4] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] ; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm1, %ymm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 @@ -1419,35 +1419,35 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm7 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512F-SLOW-NEXT: vpandn %ymm7, %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = 
zero,zero,zero,zero,ymm2[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3,4],ymm1[5,6,7,8],ymm4[9],ymm1[10],ymm4[11,12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm1, %ymm9, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512F-SLOW-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm2 ; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm5 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] @@ -1472,23 +1472,23 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,1,1,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, 
%zmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,1,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[3,0,3,0,7,4,7,4] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] ; AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm1, %ymm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm0, %ymm10 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 @@ -1497,22 +1497,22 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm7 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512F-FAST-NEXT: vpandn %ymm7, %ymm8, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; 
AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3,4],ymm1[5,6,7,8],ymm4[9],ymm1[10],ymm4[11,12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vpternlogq $226, %ymm1, %ymm9, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512F-FAST-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1560,396 +1560,386 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i16_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $248, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm13 +; SSE-NEXT: movdqa (%rdx), %xmm11 +; SSE-NEXT: movdqa (%rcx), %xmm0 +; SSE-NEXT: movdqa 16(%rcx), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa (%r8), %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = 
[65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa 16(%rdx), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,2,2] -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: por %xmm11, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa 16(%r8), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm14, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; 
SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa 16(%r8), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: movdqa 32(%rsi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: movdqa 32(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: movdqa 32(%rdx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,2,2] -; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: movdqa 32(%rdx), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,2,2] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 ; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: movdqa 32(%r8), %xmm7 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: por %xmm5, %xmm12 +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: movdqa 32(%r8), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: por %xmm12, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: movdqa 48(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movdqa 48(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: pshuflw 
{{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm12 ; SSE-NEXT: movdqa 48(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm5, %xmm12 ; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: movdqa 48(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,3,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] ; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,1,0,1] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: 
pand %xmm9, %xmm12 -; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm12 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm4[1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,0,1] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm7 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm13 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm5, %xmm12 -; SSE-NEXT: por %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,1,0,1] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pandn %xmm6, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] -; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por 
%xmm5, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa 
%xmm10, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] ; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm12, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; SSE-NEXT: punpckldq 
{{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, 304(%r9) -; SSE-NEXT: movdqa %xmm10, 288(%r9) -; SSE-NEXT: movdqa %xmm1, 256(%r9) -; SSE-NEXT: movdqa %xmm15, 240(%r9) -; SSE-NEXT: movdqa %xmm14, 224(%r9) -; SSE-NEXT: movdqa %xmm9, 208(%r9) -; SSE-NEXT: movdqa %xmm8, 176(%r9) +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: psrlq $48, %xmm14 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm14[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm8, 304(%r9) +; SSE-NEXT: movdqa %xmm5, 288(%r9) +; SSE-NEXT: movdqa %xmm15, 256(%r9) +; SSE-NEXT: movdqa %xmm10, 240(%r9) +; SSE-NEXT: movdqa %xmm2, 224(%r9) +; SSE-NEXT: movdqa %xmm6, 208(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload +; SSE-NEXT: movaps %xmm0, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1982,14 +1972,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i16_stride5_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $56, %rsp -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -1997,19 +1987,19 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm4 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] @@ -2018,62 +2008,62 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm7 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm4[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm5[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1,2,3,4],xmm7[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2,3,4],xmm7[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm15[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm15[1],xmm6[1] +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4],xmm3[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2086,15 +2076,15 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm0[3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 @@ -2105,7 +2095,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] @@ -2121,32 +2111,32 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm5 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm2[1,2,3,4],xmm7[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm15[1],xmm2[1] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm7 ; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm12, %ymm7 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] @@ -2155,9 +2145,9 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 @@ -2165,63 +2155,63 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm12[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm5[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw 
{{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm12[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm15[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2,3,4],xmm5[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%r9) +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm15[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 
{{.*#+}} ymm9 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2],xmm7[3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1,2,3,4],xmm4[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 64(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 80(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm11, 128(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2252,332 +2242,338 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $72, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $104, %rsp +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, 
%ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm4[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7,8],ymm7[9],ymm11[10],ymm7[11],ymm11[12,13],ymm7[14],ymm11[15] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm9 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3,4],ymm14[5,6,7,8],ymm13[9],ymm14[10],ymm13[11,12],ymm14[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: 
vpblendvb %ymm14, %ymm12, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3,4],ymm13[5,6,7,8],ymm12[9],ymm13[10],ymm12[11,12],ymm13[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm12, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3,4],ymm0[5,6,7,8],ymm5[9],ymm0[10],ymm5[11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm12[2,3,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm12 ; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = 
[18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3],ymm5[4],ymm2[5,6],ymm5[7],ymm2[8,9],ymm5[10],ymm2[11],ymm5[12],ymm2[13,14],ymm5[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; 
AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,0,3,0,7,4,7,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: 
vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 256(%r9) +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 224(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 288(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 256(%r9) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) -; AVX2-SLOW-NEXT: addq $72, %rsp +; AVX2-SLOW-NEXT: addq $104, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $40, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-NEXT: subq $72, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm3 ; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3],xmm9[4],xmm3[5],xmm9[6],xmm3[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3],xmm13[4],xmm7[5],xmm13[6],xmm7[7] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2],xmm13[3],xmm10[4,5],xmm13[6],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm9 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = 
ymm2[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm9, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm11, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm11 -; AVX2-FAST-NEXT: vpshufd 
{{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm11, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm12 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3,4],ymm12[5,6,7,8],ymm3[9],ymm12[10],ymm3[11,12],ymm12[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm13, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm13 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm8[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm7[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) @@ -2586,168 +2582,173 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 160(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $40, %rsp +; AVX2-FAST-NEXT: addq $72, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3],xmm9[4],xmm3[5],xmm9[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm12, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3],xmm13[4],xmm7[5],xmm13[6],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2],xmm13[3],xmm10[4,5],xmm13[6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, 
%ymm7, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm11, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = 
ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm13, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3,4],ymm12[5,6,7,8],ymm3[9],ymm12[10],ymm3[11,12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm13, %ymm1, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = 
[22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, 
%ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload ; 
AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) @@ -2756,27 +2757,28 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $72, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm20 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,2,3,3,7,6,7,7] ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] @@ -2785,14 +2787,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm6 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] @@ -2813,25 +2815,25 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: 
vpermq {{.*#+}} ymm7 = ymm18[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13],ymm0[14],ymm7[15] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm11 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7,8],ymm2[9],ymm9[10],ymm2[11],ymm9[12,13],ymm2[14],ymm9[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm8 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm4 @@ -2844,9 +2846,9 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm9 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2],xmm2[3],xmm9[4,5],xmm2[6],xmm9[7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[0,1,0,0] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm4 @@ -2860,41 +2862,41 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm7, %zmm4 ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm2 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm18[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[3,2,3,3,7,6,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3,4],ymm0[5,6,7,8],ymm5[9],ymm0[10],ymm5[11,12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2],ymm12[3,4],ymm0[5,6,7,8],ymm12[9],ymm0[10],ymm12[11,12],ymm0[13,14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm3 +; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm20[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5],ymm0[6],ymm5[7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13],ymm0[14],ymm5[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm5[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13],ymm0[14],ymm7[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,2,1,4,5,6,5] ; AVX512F-SLOW-NEXT: vprolq $16, %ymm6, %ymm6 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] @@ -2903,20 +2905,20 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[3,0,3,0,7,4,7,4] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm1 ; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm9, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = 
[65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm4, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 256(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 192(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 128(%r9) @@ -2925,158 +2927,159 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm19 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm19[1,1,2,2] +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm17 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm17[1,1,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm16 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[1,2,2,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = 
ymm9[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[1,2,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2],xmm14[3],xmm13[4,5],xmm14[6],xmm13[7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm14 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm14 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandnq %ymm14, %ymm16, %ymm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm17 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm25 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandnq %ymm3, %ymm18, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm19 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm12 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm14 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2],xmm15[3],xmm3[4,5],xmm15[6],xmm3[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6],xmm12[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[0,1,0,1,8,9,8,8] -; AVX512F-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm23 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm14, %zmm16, %zmm1 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm14[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1],ymm3[2],ymm13[3],ymm3[4],ymm13[5,6],ymm3[7],ymm13[8,9],ymm3[10],ymm13[11],ymm3[12],ymm13[13,14],ymm3[15] +; AVX512F-FAST-NEXT: vpermi2q %zmm12, %zmm5, %zmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5],xmm10[6],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %xmm22 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm14, %zmm18, %zmm2 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm15[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm2, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3],ymm14[4],ymm5[5,6],ymm14[7],ymm5[8,9],ymm14[10],ymm5[11],ymm14[12],ymm5[13,14],ymm14[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm13, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm13 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5],ymm11[6],ymm3[7,8],ymm11[9],ymm3[10,11],ymm11[12],ymm3[13],ymm11[14],ymm3[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7,8],ymm1[9],ymm13[10],ymm1[11],ymm13[12,13],ymm1[14],ymm13[15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm7 -; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm11, %zmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm5 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm3, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = 
[30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8],ymm7[9],ymm3[10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3],ymm7[4],ymm5[5,6],ymm7[7],ymm5[8,9],ymm7[10],ymm5[11],ymm7[12],ymm5[13,14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,3,2,3,10,11,10,10] -; AVX512F-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm7 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2],ymm11[3,4],ymm5[5,6,7,8],ymm11[9],ymm5[10],ymm11[11,12],ymm5[13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] -; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpandn %ymm5, %ymm13, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm17[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10],ymm13[11],ymm5[12,13],ymm13[14],ymm5[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm17[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm13[2],ymm4[3],ymm13[4],ymm4[5,6],ymm13[7],ymm4[8,9],ymm13[10],ymm4[11],ymm13[12],ymm4[13,14],ymm13[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,3,2,3,10,11,10,10] +; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm13 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm5 +; AVX512F-FAST-NEXT: vpshufd 
{{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,3,2,3,10,10,11,10] +; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm13, %zmm18, %zmm9 +; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3,4],ymm1[5,6,7,8],ymm4[9],ymm1[10],ymm4[11,12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] +; AVX512F-FAST-NEXT: vpermi2q %zmm6, %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,3,2,8,9,8,9] -; AVX512F-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm4 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] -; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm5 -; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2 -; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 256(%r9) +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,3,2,2,8,9,8,9] +; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm2, 
%zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm11, %zmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm1 +; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 192(%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -3153,813 +3156,808 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $616, %rsp # imm = 0x268 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm12 -; SSE-NEXT: movdqa 16(%rsi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: subq $632, %rsp # imm = 0x278 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm7 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm6 ; SSE-NEXT: movdqa (%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = 
[65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm10 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa 16(%rdx), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,2,2] +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 ; SSE-NEXT: movdqa 16(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: movdqa 32(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 ; SSE-NEXT: movdqa 32(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 
-; SSE-NEXT: movdqa 32(%rdx), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,2,2] -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa 32(%rdx), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[1,1,2,2] +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: pand %xmm2, %xmm10 ; SSE-NEXT: movdqa 32(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 ; SSE-NEXT: movdqa 48(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm14 ; SSE-NEXT: movdqa 48(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: movdqa 48(%rdx), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: por %xmm15, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pand %xmm2, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: movdqa 48(%rdx), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,2,2] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 ; SSE-NEXT: movdqa 48(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 64(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn 
%xmm11, %xmm15 -; SSE-NEXT: movdqa 64(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: movdqa 64(%rdx), %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm15, %xmm11 -; SSE-NEXT: pand %xmm2, %xmm11 +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa 64(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa 64(%rcx), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm14 +; SSE-NEXT: movdqa 64(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,2,2] +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm14 ; SSE-NEXT: movdqa 64(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: movdqa 80(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: movdqa 80(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm14 ; SSE-NEXT: movdqa 80(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: por %xmm11, %xmm15 -; SSE-NEXT: pand %xmm1, 
%xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,2,2] +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm14 ; SSE-NEXT: movdqa 80(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: movdqa 96(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: movdqa 96(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm14 ; SSE-NEXT: movdqa 96(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: por %xmm11, %xmm15 -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,2,2] +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm14 ; SSE-NEXT: movdqa 96(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa 112(%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa 112(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = 
xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa 112(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa 112(%r8), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,1,1] -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm11, %xmm15 -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm0[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm14 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: pand %xmm11, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = 
xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm12 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm12[1] +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm7 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm7[1] ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
%xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: 
pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; 
SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; 
SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] ; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm4 +; 
SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,1] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: 
movdqa %xmm15, %xmm13 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa %xmm5, 
%xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,1,1] -; SSE-NEXT: pand %xmm4, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: pand %xmm2, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: por %xmm14, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pandn %xmm4, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] ; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; 
SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,1,1] -; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm13, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm6[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,7,6] +; SSE-NEXT: 
por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 ; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 -; SSE-NEXT: movdqa %xmm10, 624(%r9) -; SSE-NEXT: movdqa %xmm0, 608(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 576(%r9) -; SSE-NEXT: movdqa %xmm2, 560(%r9) -; SSE-NEXT: movdqa %xmm12, 544(%r9) +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm15, 624(%r9) +; SSE-NEXT: movdqa %xmm14, 608(%r9) +; SSE-NEXT: movdqa %xmm2, 576(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 560(%r9) +; SSE-NEXT: movdqa %xmm0, 544(%r9) ; SSE-NEXT: movdqa %xmm11, 528(%r9) -; SSE-NEXT: movdqa %xmm14, 496(%r9) -; SSE-NEXT: movdqa %xmm15, 480(%r9) +; SSE-NEXT: movdqa %xmm13, 496(%r9) +; SSE-NEXT: movdqa %xmm12, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4024,26 +4022,26 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) -; SSE-NEXT: addq $616, %rsp # imm = 0x268 +; SSE-NEXT: addq $632, %rsp # imm = 0x278 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm10[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, 
%ymm1 @@ -4060,16 +4058,16 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 @@ -4077,79 +4075,79 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm13 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1],xmm11[2],xmm10[3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4,5,6],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: 
vandnps %ymm8, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm5 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm14[1],xmm4[2,3,4,5],xmm14[6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm14[1],xmm4[2,3,4,5],xmm14[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm8[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm4[2],xmm7[3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm1 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4],xmm5[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4],xmm4[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 @@ -4187,8 +4185,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -4198,25 +4196,25 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4,5,6],xmm5[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm9 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm9[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm10[1],xmm13[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 @@ -4230,17 +4228,17 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = 
xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2,3,4],xmm7[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] @@ -4252,7 +4250,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] @@ -4261,7 +4259,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] @@ -4322,19 +4320,19 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 ; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm11[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm10[1],xmm13[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 @@ -4370,7 +4368,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] @@ -4387,9 +4385,9 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] @@ -4405,8 +4403,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -4419,30 +4417,30 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, 
%ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -4450,11 +4448,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] @@ -4577,145 +4575,145 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i16_stride5_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 
96(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm15, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = 
xmm12[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm11 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 ; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb 
%ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm7[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm11[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4723,36 +4721,37 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 ; 
AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,1,2,1,4,5,6,5] @@ -4760,151 +4759,150 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb 
%ymm9, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3],ymm14[4],ymm0[5,6],ymm14[7],ymm0[8,9],ymm14[10],ymm0[11],ymm14[12],ymm0[13,14],ymm14[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[3,2,3,3,7,6,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[3,2,3,3,7,6,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3,4],ymm5[5,6,7,8],ymm9[9],ymm5[10],ymm9[11,12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2],ymm15[3,4],ymm5[5,6,7,8],ymm15[9],ymm5[10],ymm15[11,12],ymm5[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10],ymm9[11],ymm5[12,13],ymm9[14],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] +; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2],ymm14[3,4],ymm9[5,6,7,8],ymm14[9],ymm9[10],ymm14[11,12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3,4],ymm15[5,6,7,8],ymm14[9],ymm15[10],ymm14[11,12],ymm15[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm9 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm14, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[3,0,3,0,7,4,7,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13],ymm9[14],ymm15[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm12 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm14 +; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm14 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm12, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm8 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,1,2,2] +; AVX2-SLOW-NEXT: vpermq $165, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm12, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm7, %ymm4, %ymm4 @@ -4915,8 +4913,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 
384(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 608(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 608(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4933,7 +4930,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm13, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4952,338 +4950,338 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride5_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX2-FAST-NEXT: subq $968, %rsp # imm = 0x3C8 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; 
AVX2-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 64(%r8), %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 64(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 96(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX2-FAST-NEXT: vmovdqu 
%ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm14, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm9 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,1,4,5,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX2-FAST-NEXT: 
vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3],ymm12[4],ymm3[5,6],ymm12[7],ymm3[8,9],ymm12[10],ymm3[11],ymm12[12],ymm3[13,14],ymm12[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3],ymm15[4],ymm4[5,6],ymm15[7],ymm4[8,9],ymm15[10],ymm4[11],ymm15[12],ymm4[13,14],ymm15[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm15 +; 
AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7,8],ymm14[9],ymm1[10,11],ymm14[12],ymm1[13],ymm14[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3],ymm11[4],ymm0[5,6],ymm11[7],ymm0[8,9],ymm11[10],ymm0[11],ymm11[12],ymm0[13,14],ymm11[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm14[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 80(%r8), %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq 80(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm15, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = 
[26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3,4],ymm11[5,6,7,8],ymm14[9],ymm11[10],ymm14[11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7,8],ymm15[9],ymm11[10],ymm15[11],ymm11[12,13],ymm15[14],ymm11[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm15 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm8[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2],ymm12[3,4],ymm15[5,6,7,8],ymm12[9],ymm15[10],ymm12[11,12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm15 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] -; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm15 -; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3,4],ymm15[5,6,7,8],ymm9[9],ymm15[10],ymm9[11,12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2],ymm10[3,4],ymm15[5,6,7,8],ymm10[9],ymm15[10],ymm10[11,12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 88(%r8), %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 120(%r8), %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] -; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 88(%r8), %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 120(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = 
mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13],ymm12[14],ymm14[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10,11],ymm15[12],ymm0[13],ymm15[14],ymm0[15] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm3, 
%ymm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm13[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm10, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 544(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 384(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 224(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 608(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 608(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5314,344 +5312,344 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: addq $936, %rsp # imm = 0x3A8 +; AVX2-FAST-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX2-FAST-PERLANE-NEXT: subq $968, %rsp # imm = 0x3C8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm8 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = 
ymm7[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 64(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), 
%ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 64(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 96(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm2 ; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 104(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 104(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] 
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm14, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,1,4,5,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 64(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3],ymm12[4],ymm3[5,6],ymm12[7],ymm3[8,9],ymm12[10],ymm3[11],ymm12[12],ymm3[13,14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[0,1,2,1,4,5,6,5] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3],ymm15[4],ymm4[5,6],ymm15[7],ymm4[8,9],ymm15[10],ymm4[11],ymm15[12],ymm4[13,14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7,8],ymm14[9],ymm1[10,11],ymm14[12],ymm1[13],ymm14[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3],ymm11[4],ymm0[5,6],ymm11[7],ymm0[8,9],ymm11[10],ymm0[11],ymm11[12],ymm0[13,14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm15, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3,4],ymm11[5,6,7,8],ymm14[9],ymm11[10],ymm14[11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7,8],ymm15[9],ymm11[10],ymm15[11],ymm11[12,13],ymm15[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm8[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2],ymm12[3,4],ymm15[5,6,7,8],ymm12[9],ymm15[10],ymm12[11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3,4],ymm15[5,6,7,8],ymm9[9],ymm15[10],ymm9[11,12],ymm15[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2],ymm10[3,4],ymm15[5,6,7,8],ymm10[9],ymm15[10],ymm10[11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, 
%ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 88(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 120(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm6, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 88(%r8), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 120(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13],ymm12[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm9[1,1,2,2] +; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10,11],ymm15[12],ymm0[13],ymm15[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm13[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm10, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,2,2] +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 544(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 384(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 608(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 608(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5682,344 +5680,353 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-PERLANE-NEXT: addq $936, %rsp # imm = 0x3A8 +; AVX2-FAST-PERLANE-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm15 +; AVX512F-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm14 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdx), %ymm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,2,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 -; 
AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm18 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm18[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm5 -; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpandnq %ymm4, %ymm17, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm21 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm21[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = 
[65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpandnq %ymm0, %ymm27, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm31 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3,4],ymm4[5,6,7,8],ymm3[9],ymm4[10],ymm3[11,12],ymm4[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3,4],ymm3[5,6,7,8],ymm1[9],ymm3[10],ymm1[11,12],ymm3[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm10 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[0,1,2,3],zmm0[0,1,0,1] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[2,3,2,2] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm2[0,1,2,3],zmm0[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm19 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm23 +; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3,4],ymm0[5,6,7,8],ymm3[9],ymm0[10],ymm3[11,12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm8[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm30 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm0[0,1,2,3],zmm1[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8],ymm0[9],ymm8[10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6],xmm7[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm3[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5],xmm8[6],xmm4[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] -; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm17, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm8 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm17 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm2, %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13],ymm3[14],ymm0[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10],ymm12[11],ymm2[12,13],ymm12[14],ymm2[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm2[0,1,2,3],zmm6[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2],ymm6[3],ymm13[4,5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10],ymm6[11],ymm13[12,13],ymm6[14],ymm13[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3],xmm12[4],xmm6[5],xmm12[6],xmm6[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[0,1,1,1] +; AVX512F-SLOW-NEXT: vpandnq %ymm6, %ymm27, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm12 +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm7, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm14, %ymm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm12[1],ymm7[2],ymm12[3,4],ymm7[5,6,7,8],ymm12[9],ymm7[10],ymm12[11,12],ymm7[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm4, %ymm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,2,6,7,6,6] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm30[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5],ymm1[6],ymm14[7,8],ymm1[9],ymm14[10,11],ymm1[12],ymm14[13],ymm1[14],ymm14[15] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] -; AVX512F-SLOW-NEXT: vpermq 
{{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7,8],ymm1[9],ymm9[10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5],ymm9[6],ymm6[7,8],ymm9[9],ymm6[10,11],ymm9[12],ymm6[13],ymm9[14],ymm6[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm30[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm8, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3],ymm7[4],ymm1[5,6],ymm7[7],ymm1[8,9],ymm7[10],ymm1[11],ymm7[12],ymm1[13,14],ymm7[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 ; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandnq %ymm1, %ymm18, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm13 -; 
AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm19[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5],ymm1[6],ymm14[7,8],ymm1[9],ymm14[10,11],ymm1[12],ymm14[13],ymm1[14],ymm14[15] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm12, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3],ymm14[4],ymm12[5,6],ymm14[7],ymm12[8,9],ymm14[10],ymm12[11],ymm14[12],ymm12[13,14],ymm14[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8],ymm1[9],ymm14[10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm31 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandnq %ymm1, %ymm17, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm16[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm10, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm16[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm9[2],ymm3[3],ymm9[4],ymm3[5,6],ymm9[7],ymm3[8,9],ymm9[10],ymm3[11],ymm9[12],ymm3[13,14],ymm9[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm31[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7,8],ymm1[9],ymm9[10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm31[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10,11],ymm9[12],ymm1[13],ymm9[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm1 -; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm19, %zmm29 -; AVX512F-SLOW-NEXT: vpternlogq $226, (%rsp), %zmm16, %zmm28 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm14 -; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm22 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm19, %zmm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm24[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[0,1,0,0] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vpandnq %ymm2, %ymm18, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm27[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,0,0] +; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm13, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $226, (%rsp), %zmm30, %zmm25 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm23 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm13, %zmm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm22[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[0,1,0,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm10 +; AVX512F-SLOW-NEXT: vpandnq %ymm10, %ymm17, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm21[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm29[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm28[0,1,0,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = 
ymm26[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm19[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm20[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm10[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm21 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm25 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm21, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm25, %zmm17 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm19, %zmm2 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm21, %zmm19 -; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm12, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm18, %zmm18 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm21, %zmm19 -; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm18 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm18, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm12, %zmm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm28, %zmm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm11, %zmm21, %zmm8 -; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm11 -; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm4 -; 
AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm12, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm16, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm30, %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm6, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 576(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, 448(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 512(%r9) -; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm18[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm21[2,2,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm18 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm24 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm10, %zmm18, %zmm19 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm10, %zmm24, %zmm27 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm13, %zmm10 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm13 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm18, %zmm13 +; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm17, %zmm13 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm17 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm18, %zmm17 +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm13 +; 
AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm10, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm26, %zmm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm18, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm10 +; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm13, %zmm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm18, %zmm4 +; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm13, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm30, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm30, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm4, %zmm2, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm4, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 576(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 192(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 320(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 448(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 512(%r9) +; AVX512F-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdx), %ymm18 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 96(%rdx), %ymm21 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[3,0,3,0,7,4,7,4] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6035,284 +6042,281 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm23[1,1,2,2] +; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm28 +; AVX512F-FAST-NEXT: vpermq 
{{.*#+}} ymm3 = ymm28[1,1,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm5, %ymm3, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm27 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa %ymm6, %ymm11 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm3, %ymm7, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm16 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX512F-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: 
vmovdqa 32(%rdx), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm5 -; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm6 ; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5],xmm9[6],xmm7[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm30 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm30[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3],xmm9[4],xmm1[5],xmm9[6],xmm1[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, 
%ymm6, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm29 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm29[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5],xmm10[6],xmm1[7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm16 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm31 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpandn %ymm7, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm29 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm12 +; AVX512F-FAST-NEXT: vpandn %ymm9, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm27 ; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,0,1,8,9,8,8] -; AVX512F-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm19 = zmm4[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm19 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm9 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2],xmm10[3],xmm7[4,5],xmm10[6],xmm7[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,0,1,8,9,8,8] +; AVX512F-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm18 = zmm3[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm26, %zmm18 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, 
%xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6],xmm0[7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm21 = zmm5[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm28[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm17 = zmm5[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm26, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm25 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm25[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm9, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, 
%ymm1, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm30 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm5 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm23 ; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm0, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandnq %ymm0, %ymm19, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm22 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-FAST-NEXT: vprolq $16, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,3,2,3,10,11,10,10] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512F-FAST-NEXT: vprolq $16, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3],ymm5[4],ymm2[5,6],ymm5[7],ymm2[8,9],ymm5[10],ymm2[11],ymm5[12],ymm2[13,14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,3,2,3,10,11,10,10] +; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2],ymm12[3,4],ymm0[5,6,7,8],ymm12[9],ymm0[10],ymm12[11,12],ymm0[13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5],ymm12[6],ymm4[7,8],ymm12[9],ymm4[10,11],ymm12[12],ymm4[13],ymm12[14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,3,2,3,10,10,11,10] +; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm26, %zmm4 ; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm23[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5],ymm15[6],ymm8[7,8],ymm15[9],ymm8[10,11],ymm15[12],ymm8[13],ymm15[14],ymm8[15] +; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
[65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm20, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm4, %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3],ymm2[4],ymm8[5,6],ymm2[7],ymm8[8,9],ymm2[10],ymm8[11],ymm2[12],ymm8[13,14],ymm2[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8],ymm0[9],ymm8[10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm8 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq 80(%r8), %ymm2 -; AVX512F-FAST-NEXT: vpandn %ymm2, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm15 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7,8],ymm14[9],ymm9[10],ymm14[11],ymm9[12,13],ymm14[14],ymm9[15] -; AVX512F-FAST-NEXT: vprolq $16, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15] -; AVX512F-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm13 -; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2],ymm14[3,4],ymm9[5,6,7,8],ymm14[9],ymm9[10],ymm14[11,12],ymm9[13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm7[0],ymm14[1],ymm7[2,3],ymm14[4],ymm7[5],ymm14[6],ymm7[7,8],ymm14[9],ymm7[10,11],ymm14[12],ymm7[13],ymm14[14],ymm7[15] -; 
AVX512F-FAST-NEXT: vpermt2q %zmm9, %zmm18, %zmm14 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm13, %zmm31, %zmm14 -; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm7 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm14, %zmm17, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,2,3,2,8,9,8,9] -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vpbroadcastq 80(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpandnq %ymm0, %ymm19, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm19 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm29[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10],ymm13[11],ymm10[12,13],ymm13[14],ymm10[15] +; AVX512F-FAST-NEXT: vprolq $16, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm29[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3],ymm13[4],ymm6[5,6],ymm13[7],ymm6[8,9],ymm13[10],ymm6[11],ymm13[12],ymm6[13,14],ymm13[15] +; AVX512F-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm6 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2],ymm13[3,4],ymm10[5,6,7,8],ymm13[9],ymm10[10],ymm13[11,12],ymm10[13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5],ymm13[6],ymm3[7,8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13],ymm13[14],ymm3[15] +; AVX512F-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm6 +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm20, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 
= ymm12[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3,4],ymm2[5,6,7,8],ymm10[9],ymm2[10],ymm10[11,12],ymm2[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,3,2,8,9,8,9] +; AVX512F-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm15 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,3,2,2,8,9,8,9] +; AVX512F-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm5 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2],ymm11[3,4],ymm5[5,6,7,8],ymm11[9],ymm5[10],ymm11[11,12],ymm5[13,14,15] +; AVX512F-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm2 +; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm10, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,3,2,2,8,9,8,9] -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm24[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm14[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[2,3,2,3,6,7,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] -; AVX512F-FAST-NEXT: vpermt2q %zmm6, %zmm13, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm9 -; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm9, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm3, %zmm9 +; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm1 +; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm4 +; 
AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm9, %zmm10, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm31 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm5, %zmm3, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm5, %zmm16, %zmm29 -; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm19, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm21, %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm25 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm22, %zmm1, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm1, %zmm25, %zmm26 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm1, %zmm8, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 448(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 384(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 576(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 192(%r9) +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm4, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm4, %zmm31, %zmm27 +; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm17, %zmm4, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm23 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm3, %zmm23, %zmm22 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm3, %zmm8, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 448(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 384(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 576(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 320(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 192(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 512(%r9) ; AVX512F-FAST-NEXT: addq $360, %rsp # imm = 0x168 ; AVX512F-FAST-NEXT: vzeroupper @@ -6323,93 +6327,93 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38> +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm3 ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 +; 
AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u> +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm15, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u> +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm20, %zmm21 ; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 -; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm19, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm25 +; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm25 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm23, %zmm25 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm26, %zmm27 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 +; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k3} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = 
[0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm8 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm20, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm10 {%k3} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm22, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm25 {%k2} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm27, %zmm29 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm0 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm5, %zmm16 +; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm16 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm2, %zmm18 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm20 +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm20 {%k2} +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm20 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm2, %zmm22 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm24 {%k3} +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm23, %zmm24 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k3} +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 576(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 4522be032322c9..bd864d15cfdc44 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -490,23 +490,23 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%rcx), %xmm9 ; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: movdqa (%r9), %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: movdqa (%r9), %xmm4 
+; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm2[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm2, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andnps %xmm11, %xmm3 -; SSE-NEXT: orps %xmm10, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,1,2,3] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: andnps %xmm11, %xmm5 +; SSE-NEXT: orps %xmm10, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: movdqa %xmm0, %xmm8 @@ -515,7 +515,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm9[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] ; SSE-NEXT: andps %xmm2, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; SSE-NEXT: andnps %xmm9, %xmm2 ; SSE-NEXT: orps %xmm8, %xmm2 @@ -526,44 +526,44 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: andps %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] ; SSE-NEXT: movaps %xmm8, %xmm9 ; SSE-NEXT: andnps %xmm11, %xmm9 ; SSE-NEXT: orps %xmm10, %xmm9 ; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm10[0,2] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm3[1] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm10[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: andnps %xmm13, %xmm11 -; SSE-NEXT: orps %xmm12, %xmm11 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: andps %xmm10, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,2,3,3] +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: andnps %xmm12, %xmm13 +; 
SSE-NEXT: orps %xmm11, %xmm13 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[1,1,1,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[0,2] -; SSE-NEXT: andps %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: andnps %xmm5, %xmm8 -; SSE-NEXT: orps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[0,2] +; SSE-NEXT: andps %xmm8, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: andnps %xmm4, %xmm8 +; SSE-NEXT: orps %xmm3, %xmm8 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] -; SSE-NEXT: andps %xmm10, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[0,2] +; SSE-NEXT: andps %xmm10, %xmm11 ; SSE-NEXT: andnps %xmm6, %xmm10 -; SSE-NEXT: orps %xmm12, %xmm10 +; SSE-NEXT: orps %xmm11, %xmm10 ; SSE-NEXT: movaps %xmm10, 16(%rax) ; SSE-NEXT: movaps %xmm8, 48(%rax) -; SSE-NEXT: movaps %xmm11, 64(%rax) +; SSE-NEXT: movaps %xmm13, 64(%rax) ; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps %xmm3, 80(%rax) +; SSE-NEXT: movaps %xmm5, 80(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf8: @@ -865,178 +865,180 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp ; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movdqa (%rdx), %xmm14 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa (%rcx), %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm2 -; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm8 -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm6[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm0[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm9, %xmm0 -; SSE-NEXT: orps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} 
xmm13 = xmm13[2,3],xmm5[3,3] -; SSE-NEXT: movdqa (%r8), %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm9[2,3] -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: movdqa (%rdx), %xmm13 +; SSE-NEXT: movdqa 16(%rdx), %xmm6 +; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm5 +; SSE-NEXT: movdqa 16(%r8), %xmm1 +; SSE-NEXT: movdqa 16(%r9), %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm9[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: andnps %xmm11, %xmm1 +; SSE-NEXT: orps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm7[3,3] +; SSE-NEXT: movdqa (%r8), %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,2],xmm12[2,3] +; SSE-NEXT: movdqa (%r9), %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: andnps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3] -; SSE-NEXT: andps %xmm7, %xmm13 -; SSE-NEXT: orps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0,1,3] +; SSE-NEXT: andps %xmm2, %xmm14 +; SSE-NEXT: orps %xmm14, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm12[3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,6,5,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm6[3,3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm10[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: andnps %xmm4, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3] ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm13[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: andnps %xmm3, %xmm7 -; SSE-NEXT: orps %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm11[0] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm11[1,3] +; SSE-NEXT: andnps %xmm3, %xmm2 +; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm8[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] -; SSE-NEXT: pslld $16, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pslld $16, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 ; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: por %xmm3, %xmm14 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[0,2] ; SSE-NEXT: movdqa {{.*#+}} xmm3 
= [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: andps %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 ; SSE-NEXT: andps %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,0,1,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: andps %xmm3, %xmm5 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm15[0] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm5[0,2] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: andps %xmm1, %xmm14 -; SSE-NEXT: por %xmm14, %xmm15 -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: andps %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: andps %xmm1, %xmm13 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: andps %xmm3, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; 
SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: andps %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[0,2] +; SSE-NEXT: andps %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1] +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm10[0,2] -; SSE-NEXT: andps %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] +; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm13, %xmm5 +; SSE-NEXT: andps %xmm1, %xmm9 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm11[0,2] +; SSE-NEXT: andps %xmm1, %xmm7 +; SSE-NEXT: pslld $16, %xmm12 +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm1, 48(%rax) -; SSE-NEXT: movdqa %xmm0, 96(%rax) +; SSE-NEXT: movdqa %xmm5, 96(%rax) ; SSE-NEXT: movdqa %xmm3, 112(%rax) -; SSE-NEXT: movdqa %xmm11, 160(%rax) +; SSE-NEXT: movdqa %xmm8, 160(%rax) ; SSE-NEXT: movdqa %xmm15, (%rax) -; SSE-NEXT: movdqa %xmm8, 16(%rax) -; SSE-NEXT: movdqa %xmm9, 64(%rax) -; SSE-NEXT: movdqa %xmm13, 144(%rax) -; SSE-NEXT: movaps %xmm7, 32(%rax) +; SSE-NEXT: movdqa %xmm4, 16(%rax) +; SSE-NEXT: movdqa %xmm10, 64(%rax) +; SSE-NEXT: movdqa %xmm14, 144(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1059,27 +1061,27 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd 
{{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm11 -; AVX1-ONLY-NEXT: vpslld $16, %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm12 +; AVX1-ONLY-NEXT: vpslld $16, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm10[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -1087,36 +1089,36 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm14 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm13 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1],xmm10[2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1],xmm12[2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm2[1],xmm10[2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0],xmm2[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3],xmm2[4,5,6,7] @@ -1126,45 +1128,45 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm2[0,1],xmm10[0],xmm2[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm2[0,1],xmm11[0],xmm2[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,0,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm14[0],xmm3[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm13[0],xmm3[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm7[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] ; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1175,8 +1177,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm15, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 80(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 176(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 80(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 176(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1189,8 +1191,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i16_stride6_vf16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 @@ -1220,9 +1222,9 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] @@ -1253,9 +1255,9 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = 
ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] @@ -1279,8 +1281,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] @@ -1319,50 +1321,50 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm11 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: 
vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1,2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm9, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm9[0],ymm15[1],ymm9[1],ymm15[2],ymm9[2],ymm15[3],ymm9[3],ymm15[8],ymm9[8],ymm15[9],ymm9[9],ymm15[10],ymm9[10],ymm15[11],ymm9[11] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2] ; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; 
AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 @@ -1376,20 +1378,20 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,0,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 @@ -1403,7 +1405,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1420,117 +1422,110 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $24, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm7[4],ymm4[4],ymm7[5],ymm4[5],ymm7[6],ymm4[6],ymm7[7],ymm4[7],ymm7[12],ymm4[12],ymm7[13],ymm4[13],ymm7[14],ymm4[14],ymm7[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm15, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm14, %ymm15, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} 
ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4],ymm15[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm6 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm10 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 96(%rax) @@ -1541,7 +1536,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $24, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1549,15 +1543,15 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm3 ; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 
= ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,1,2,3,6,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] @@ -1565,30 +1559,30 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX512F-SLOW-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,2,2,3,5,6,6,7] ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <5,u,14,6,u,15,7,u> ; AVX512F-SLOW-NEXT: vpermi2d %ymm7, %ymm8, %ymm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm9, %zmm7, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,8,u,1,9,u,2,10> -; AVX512F-SLOW-NEXT: vpermi2d %ymm9, %ymm11, %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; 
AVX512F-SLOW-NEXT: vpermi2d %ymm10, %ymm12, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512F-SLOW-NEXT: vpbroadcastq %xmm14, %ymm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -1596,34 +1590,34 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,1,2,1] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,2,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-SLOW-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512F-SLOW-NEXT: vpermi2d %ymm14, %ymm9, %ymm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <4,12,u,5,13,u,6,14> -; AVX512F-SLOW-NEXT: vpermi2d %ymm4, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <4,12,u,5,13,u,6,14> +; AVX512F-SLOW-NEXT: vpermi2d %ymm4, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,u,10,2,u,11,3,u> ; AVX512F-SLOW-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] ; AVX512F-SLOW-NEXT: vpermi2d %ymm1, %ymm3, %ymm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -1639,19 +1633,19 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm9 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,8,u,1,9,u,2,10> -; AVX512F-FAST-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm9 +; AVX512F-FAST-NEXT: vpermi2d %ymm8, %ymm11, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm8 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] ; AVX512F-FAST-NEXT: vpermi2d %ymm13, %ymm12, %ymm14 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm13 ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm12 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -1661,16 +1655,16 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512F-FAST-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,2,2,3] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm12, %zmm13 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,10,2,u,11,3,u> -; AVX512F-FAST-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-FAST-NEXT: vpermi2d %ymm9, %ymm6, %ymm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] ; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] @@ -1758,360 +1752,361 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $312, %rsp # imm = 0x138 -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: subq $280, %rsp # imm = 0x118 +; SSE-NEXT: movdqa (%rdi), %xmm5 ; SSE-NEXT: movdqa 16(%rdi), %xmm13 ; SSE-NEXT: movdqa (%rsi), %xmm3 ; SSE-NEXT: movdqa 16(%rsi), %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm5 +; SSE-NEXT: movdqa (%rdx), %xmm6 ; SSE-NEXT: movdqa 16(%rdx), %xmm14 -; SSE-NEXT: movdqa (%rcx), %xmm4 -; SSE-NEXT: movdqa 16(%rcx), %xmm10 -; SSE-NEXT: movdqa (%r8), %xmm8 -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa (%rcx), %xmm9 +; SSE-NEXT: movdqa 16(%rcx), %xmm7 +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: movdqa (%r9), %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm4[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm4[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm7[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm6, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: andnps %xmm11, %xmm0 -; SSE-NEXT: orps %xmm9, %xmm0 +; SSE-NEXT: orps %xmm10, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm4[2,3] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm6[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm9[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: andnps %xmm4, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: andnps %xmm9, %xmm0 ; SSE-NEXT: orps %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 16(%r8), %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm9[0,1] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 16(%r8), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm9[0,1] ; SSE-NEXT: movdqa 16(%r9), %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: andnps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm11 -; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: andnps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] +; SSE-NEXT: andps %xmm12, %xmm10 +; SSE-NEXT: orps %xmm10, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm7[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: andnps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm5 +; SSE-NEXT: movdqa 32(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rsi), %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 32(%r8), %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm15[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm13[0,1] +; SSE-NEXT: movdqa 32(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: andnps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] +; SSE-NEXT: andps %xmm12, %xmm11 +; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm5[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm10[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: andnps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm2 -; SSE-NEXT: movdqa 32(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 32(%r8), %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm13[0,1] -; SSE-NEXT: movdqa 32(%r9), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa 48(%rdx), %xmm5 +; SSE-NEXT: movdqa 48(%rcx), %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: movaps %xmm6, %xmm13 -; SSE-NEXT: andnps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm12 -; SSE-NEXT: orps %xmm12, %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: movaps %xmm6, %xmm11 -; SSE-NEXT: andnps %xmm1, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm3 -; SSE-NEXT: movdqa 48(%rcx), %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 48(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rsi), %xmm0 +; SSE-NEXT: 
movdqa %xmm1, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[3,3] -; SSE-NEXT: movdqa 48(%r8), %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm13[0,1] -; SSE-NEXT: movdqa 48(%r9), %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: andnps %xmm13, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm4[3,3] +; SSE-NEXT: movdqa 48(%r8), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm10[0,1] +; SSE-NEXT: movdqa 48(%r9), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: andnps %xmm10, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm11 -; SSE-NEXT: orps %xmm11, %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,6,5,7,7] +; SSE-NEXT: andps %xmm12, %xmm11 +; SSE-NEXT: orps %xmm11, %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: andnps %xmm1, %xmm6 -; SSE-NEXT: orps %xmm0, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm12 +; SSE-NEXT: orps %xmm0, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,1],xmm8[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: andps %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: andps %xmm5, %xmm11 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: andps %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[0,2] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: andps %xmm5, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[0,2] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn 
%xmm5, %xmm6 +; SSE-NEXT: andps %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,1],xmm6[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm10[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: andps %xmm14, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[0,2] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: andps %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2] ; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: andps %xmm1, %xmm6 +; SSE-NEXT: andps %xmm14, %xmm6 ; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: andps %xmm5, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[0,2] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: andps %xmm0, %xmm8 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm10 = 
xmm10[0],xmm1[0] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: andps %xmm1, %xmm8 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm11, %xmm8 -; SSE-NEXT: andps %xmm5, %xmm15 -; SSE-NEXT: por %xmm15, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm10[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: andps %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm3[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm7[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm11, %xmm6 -; SSE-NEXT: andps %xmm1, %xmm9 -; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm10[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm13, %xmm9 -; SSE-NEXT: andps %xmm5, %xmm11 -; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: andps %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: movaps %xmm3, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps 
{{.*#+}} xmm11 = xmm11[0],xmm2[0] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: andps %xmm1, %xmm11 -; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm11[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm2[0,2] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: pandn %xmm13, %xmm10 -; SSE-NEXT: andps %xmm5, %xmm14 -; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm14, %xmm13 -; SSE-NEXT: andps %xmm1, %xmm11 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: andps %xmm5, %xmm14 -; SSE-NEXT: por %xmm14, %xmm11 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm2[0,2] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm7 +; SSE-NEXT: andps %xmm0, %xmm15 +; SSE-NEXT: por %xmm15, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm14[0,2] -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] -; SSE-NEXT: andps %xmm5, %xmm12 -; SSE-NEXT: pandn 
%xmm14, %xmm5 -; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm11[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: andps %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm5[0] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm5[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm11[0,2] +; SSE-NEXT: andps %xmm14, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm5[0,2] +; SSE-NEXT: andps %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: por %xmm13, %xmm0 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm5, 352(%rax) -; SSE-NEXT: movdqa %xmm1, 336(%rax) -; SSE-NEXT: movdqa %xmm11, 304(%rax) -; SSE-NEXT: movdqa %xmm13, 288(%rax) -; SSE-NEXT: movdqa %xmm10, 256(%rax) -; SSE-NEXT: movdqa %xmm15, 240(%rax) -; SSE-NEXT: movdqa %xmm9, 208(%rax) -; SSE-NEXT: movdqa %xmm6, 192(%rax) -; SSE-NEXT: movdqa %xmm8, 160(%rax) -; SSE-NEXT: movdqa %xmm7, 144(%rax) +; SSE-NEXT: movdqa %xmm0, 352(%rax) +; SSE-NEXT: movdqa %xmm14, 336(%rax) +; SSE-NEXT: movdqa %xmm2, 304(%rax) +; SSE-NEXT: movdqa %xmm15, 288(%rax) +; SSE-NEXT: movdqa %xmm7, 256(%rax) +; SSE-NEXT: movdqa %xmm4, 240(%rax) +; SSE-NEXT: movdqa %xmm6, 208(%rax) +; SSE-NEXT: movdqa %xmm8, 192(%rax) +; SSE-NEXT: movdqa %xmm9, 160(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2124,8 +2119,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 368(%rax) +; SSE-NEXT: movaps %xmm12, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
SSE-NEXT: movaps %xmm0, 320(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2140,76 +2134,76 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $312, %rsp # imm = 0x138 +; SSE-NEXT: addq $280, %rsp # imm = 0x118 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $120, %rsp ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm2 -; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm3 +; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: 
vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} 
xmm12 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm13[2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm5 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7] @@ -2217,24 +2211,24 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -2242,25 +2236,25 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm7[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 @@ -2439,19 +2433,21 @@ 
define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill @@ -2463,203 +2459,203 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm14 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = 
ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; 
AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm0[1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm12, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm12, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 
16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm8[4],ymm11[4],ymm8[5],ymm11[5],ymm8[6],ymm11[6],ymm8[7],ymm11[7],ymm8[12],ymm11[12],ymm8[13],ymm11[13],ymm8[14],ymm11[14],ymm8[15],ymm11[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 
= ymm12[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm14[4],ymm7[5],ymm14[5],ymm7[6],ymm14[6],ymm7[7],ymm14[7],ymm7[12],ymm14[12],ymm7[13],ymm14[13],ymm7[14],ymm14[14],ymm7[15],ymm14[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm14[4],ymm7[4],ymm14[5],ymm7[5],ymm14[6],ymm7[6],ymm14[7],ymm7[7],ymm14[12],ymm7[12],ymm14[13],ymm7[13],ymm14[14],ymm7[14],ymm14[15],ymm7[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm10, %ymm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm12, %ymm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm12, %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 @@ -2667,48 +2663,48 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = 
ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[8],ymm11[8],ymm8[9],ymm11[9],ymm8[10],ymm11[10],ymm8[11],ymm11[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm7[0],ymm14[1],ymm7[1],ymm14[2],ymm7[2],ymm14[3],ymm7[3],ymm14[8],ymm7[8],ymm14[9],ymm7[9],ymm14[10],ymm7[10],ymm14[11],ymm7[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} ymm7 = ymm7[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 288(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 352(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2718,18 +2714,18 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $616, %rsp # imm = 0x268 +; AVX2-SLOW-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; 
AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm13 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 @@ -2741,14 +2737,14 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] @@ -2757,14 +2753,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm12 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -2777,16 +2771,16 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm9 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; 
AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm11 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2796,42 +2790,44 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: 
vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2842,145 +2838,148 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,2,1,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,2,1,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; 
AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm4[4],ymm10[5],ymm4[5],ymm10[6],ymm4[6],ymm10[7],ymm4[7],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm13, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb 
%ymm14, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: 
# ymm11 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm15, %ymm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [5,4,2,2,5,4,6,6] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = 
ymm4[0],mem[0],ymm4[1],mem[1],ymm4[2],mem[2],ymm4[3],mem[3],ymm4[8],mem[8],ymm4[9],mem[9],ymm4[10],mem[10],ymm4[11],mem[11] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[2],mem[2],ymm5[3],mem[3],ymm5[8],mem[8],ymm5[9],mem[9],ymm5[10],mem[10],ymm5[11],mem[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 288(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 288(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2992,33 +2991,33 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: addq $696, %rsp # imm = 0x2B8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $632, %rsp # imm = 0x278 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm13 ; 
AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm7 @@ -3030,54 +3029,55 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
%xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = 
ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -3087,28 +3087,27 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = 
ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -3119,148 +3118,153 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm8[1],ymm0[2,3],ymm8[4],ymm0[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; 
AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[12],ymm10[12],ymm12[13],ymm10[13],ymm12[14],ymm10[14],ymm12[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm12, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[4],mem[4],ymm13[5],mem[5],ymm13[6],mem[6],ymm13[7],mem[7],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = 
ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw 
{{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: 
vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[8],ymm14[8],ymm2[9],ymm14[9],ymm2[10],ymm14[10],ymm2[11],ymm14[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[2],ymm15[2],ymm3[3],ymm15[3],ymm3[8],ymm15[8],ymm3[9],ymm15[9],ymm3[10],ymm15[10],ymm3[11],ymm15[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, 
%ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[2],mem[2],ymm13[3],mem[3],ymm13[8],mem[8],ymm13[9],mem[9],ymm13[10],mem[10],ymm13[11],mem[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3269,451 +3273,451 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: addq $632, %rsp # imm = 0x278 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = 
ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm2[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm5 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: 
vpshuflw {{.*#+}} ymm7 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm7[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm7[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = 
ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm24, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 
{%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3],ymm7[4],ymm2[5,6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm4[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm9[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm13, %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm3[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 +; 
AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm5[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[0,1,2,3],zmm0[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 -; 
AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm12 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = 
ymm11[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, 
%zmm10, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm10, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm16, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm12, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: pushq %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 
{{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,1,2,3,11,11,11,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,1,2,3,11,11,11,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [5,6,5,6,5,6,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm17, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm4[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm5, %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm3[0,1,2,3],zmm8[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <2,2,u,3,10,u,10,11> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm9[4],ymm2[5],ymm9[5],ymm2[6],ymm9[6],ymm2[7],ymm9[7],ymm2[12],ymm9[12],ymm2[13],ymm9[13],ymm2[14],ymm9[14],ymm2[15],ymm9[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <2,2,u,3,10,u,10,11> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, 
%ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm18, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm10[0],ymm4[1],ymm10[1],ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[8],ymm10[8],ymm4[9],ymm10[9],ymm4[10],ymm10[10],ymm4[11],ymm10[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm11 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[1],ymm11[1],ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[8],ymm11[8],ymm4[9],ymm11[9],ymm4[10],ymm11[10],ymm4[11],ymm11[11] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm17, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm11[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm0, %zmm12 +; 
AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm19, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm0[0,1,2,3],zmm12[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm14, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,2,1,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [1,0,2,2,1,0,2,2] -; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm23, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [1,0,2,2,1,0,2,2] +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm8, %ymm22, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm12[0,1,2,3],zmm1[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [0,1,8,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm5, %ymm23, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm6[0,1,2,3],zmm0[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm6 +; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm23, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm14, %ymm22, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm2[0,1,2,3],zmm5[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[8],ymm9[8],ymm2[9],ymm9[9],ymm2[10],ymm9[10],ymm2[11],ymm9[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,1,1,1,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm7, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15] -; 
AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm21, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,u,0,1,u,10,10,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm3, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm21, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm23, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm0[0,1,2,3],zmm1[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm15[0],ymm0[1],ymm15[1],ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[8],ymm15[8],ymm0[9],ymm15[9],ymm0[10],ymm15[10],ymm0[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,1,1,1,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,20,11,12,21,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,9,2,3,8,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm22, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,u,0,1,u,10,10,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm5, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm5, %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm10[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; 
AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm16[0,1,2,3],zmm21[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm20, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm17, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: popq %rax +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm18, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm17, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -3722,12 +3726,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 @@ -3735,7 +3739,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512DQ-SLOW-NEXT: 
vmovdqa (%rdi), %xmm14 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] @@ -3756,31 +3760,32 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm30 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm15 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm26 +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm27 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm0[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm17 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm7, %xmm16 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} @@ -3790,385 +3795,385 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; 
AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm22 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm21 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[12],ymm9[12],ymm6[13],ymm9[13],ymm6[14],ymm9[14],ymm6[15],ymm9[15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm13[4],ymm7[4],ymm13[5],ymm7[5],ymm13[6],ymm7[6],ymm13[7],ymm7[7],ymm13[12],ymm7[12],ymm13[13],ymm7[13],ymm13[14],ymm7[14],ymm13[15],ymm7[15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[8],ymm0[8],ymm8[9],ymm0[9],ymm8[10],ymm0[10],ymm8[11],ymm0[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; 
AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm24, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm9 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm8[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm1[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm0[4],ymm8[4],ymm0[5],ymm8[5],ymm0[6],ymm8[6],ymm0[7],ymm8[7],ymm0[12],ymm8[12],ymm0[13],ymm8[13],ymm0[14],ymm8[14],ymm0[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,2,3,3,5,6,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm0[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm16[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm4 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm4 -; 
AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm25, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6],ymm4[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm30[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm1[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm11 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[8],ymm8[8],ymm0[9],ymm8[9],ymm0[10],ymm8[10],ymm0[11],ymm8[11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm12 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq 
{{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm11, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[8],ymm7[8],ymm13[9],ymm7[9],ymm13[10],ymm7[10],ymm13[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[8],ymm9[8],ymm6[9],ymm9[9],ymm6[10],ymm9[10],ymm6[11],ymm9[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm8, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm31[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] ; AVX512DQ-SLOW-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm26, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm22, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm7 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm21, %zmm9 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq 
; ; AVX512DQ-FAST-LABEL: store_i16_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[8],ymm14[8],ymm1[9],ymm14[9],ymm1[10],ymm14[10],ymm1[11],ymm14[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 ; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [1,1,1,1,10,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm21, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm20, %zmm19 ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm1, %zmm19 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm0, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm17, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm13 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm16 ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm6, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,u,0,1,u,10,10,u> -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm21, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm8, %xmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm30 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm18, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,u,0,1,u,10,10,u> +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 
= [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm22, %zmm11 +; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm20, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm6, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,2,3,11,11,11,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm7, %zmm20, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm11, %ymm18, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm14[0],ymm2[0],ymm14[1],ymm2[1],ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[8],ymm2[8],ymm14[9],ymm2[9],ymm14[10],ymm2[10],ymm14[11],ymm2[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,2,3,11,11,11,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: 
vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm4, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm14, %ymm21, %ymm14 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm5, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm18 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm0[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,2,u,3,10,u,10,11> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm1[4],ymm14[4],ymm1[5],ymm14[5],ymm1[6],ymm14[6],ymm1[7],ymm14[7],ymm1[12],ymm14[12],ymm1[13],ymm14[13],ymm1[14],ymm14[14],ymm1[15],ymm14[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm21, %ymm6 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm14, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm22, %ymm2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm6[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm4 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,2,1,8,9,8,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[12],ymm6[12],ymm3[13],ymm6[13],ymm3[14],ymm6[14],ymm3[15],ymm6[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm6 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = 
[16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,2,u,3,10,u,10,11> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm14[4],ymm3[4],ymm14[5],ymm3[5],ymm14[6],ymm3[6],ymm14[7],ymm3[7],ymm14[12],ymm3[12],ymm14[13],ymm3[13],ymm14[14],ymm3[14],ymm14[15],ymm3[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm21, %ymm6 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[1],ymm3[1],ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[8],ymm3[8],ymm14[9],ymm3[9],ymm14[10],ymm3[10],ymm14[11],ymm3[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm3, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm3, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,2,1,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm6 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm8 ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; 
AVX512DQ-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 ; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm6, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm10 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm8, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm4, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm5, %ymm10 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm10[0,1,2,3],zmm6[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm13, 
%xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm7, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm10[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm5, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm5 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm5, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm1, %ymm9 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm9[0,1,2,3],zmm8[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm7 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm18, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm17[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm23[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[0,1,2,3] ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -4177,88 +4182,88 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-LABEL: store_i16_stride6_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 +; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm6 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm9, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> -; AVX512BW-NEXT: vpermi2w 
%zmm1, %zmm10, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm10, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm11, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm1 ; 
AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64 @@ -4284,746 +4289,731 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $808, %rsp # imm = 0x328 ; SSE-NEXT: movdqa (%rdi), %xmm10 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm12 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm1 +; SSE-NEXT: movdqa 16(%rdx), %xmm12 +; SSE-NEXT: movdqa (%rcx), %xmm5 ; SSE-NEXT: movdqa 16(%rcx), %xmm3 -; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm5[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm7, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm12[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm6, %xmm0 -; SSE-NEXT: orps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: movdqa (%r9), %xmm13 +; SSE-NEXT: movdqa %xmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[0,1] -; SSE-NEXT: movdqa 16(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm4[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm4[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm4, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: andnps %xmm7, %xmm9 +; SSE-NEXT: orps %xmm6, %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: andnps %xmm5, %xmm6 +; SSE-NEXT: orps %xmm2, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm1[3,3] +; SSE-NEXT: movdqa 16(%r8), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm6[0,1] +; SSE-NEXT: movdqa 16(%r9), %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: andnps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm12[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm2 -; SSE-NEXT: movdqa 32(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: andnps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm1 +; SSE-NEXT: movdqa 32(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[3,3] +; SSE-NEXT: movdqa 32(%r8), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm7[0,1] +; SSE-NEXT: movdqa 32(%r9), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: andnps %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm6 +; SSE-NEXT: orps %xmm6, %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm5[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: andnps %xmm5, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm1 +; SSE-NEXT: movdqa 48(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm12 +; SSE-NEXT: movdqa 48(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 32(%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm8[0,1] -; SSE-NEXT: movdqa 32(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm7 -; SSE-NEXT: orps %xmm7, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[3,3] +; SSE-NEXT: movdqa 48(%r8), %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm9[0,1] +; SSE-NEXT: movdqa 48(%r9), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: andnps %xmm9, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm7 +; SSE-NEXT: orps %xmm7, %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = 
xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm6[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm2 -; SSE-NEXT: movdqa 48(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 48(%r8), %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: andnps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdx), %xmm1 +; SSE-NEXT: movdqa 64(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rsi), %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[3,3] +; SSE-NEXT: movdqa 64(%r8), %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm11[0,1] -; SSE-NEXT: movdqa 48(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm8 -; SSE-NEXT: orps %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm11[0,1] +; SSE-NEXT: movdqa 64(%r9), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: andnps %xmm11, 
%xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm9 +; SSE-NEXT: orps %xmm9, %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm7[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdx), %xmm2 -; SSE-NEXT: movdqa 64(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: andnps %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdx), %xmm1 +; SSE-NEXT: movdqa 80(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm13 +; SSE-NEXT: movdqa 80(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 64(%r8), %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm12[0,1] -; SSE-NEXT: movdqa 64(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm11 -; SSE-NEXT: orps %xmm11, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[3,3] +; SSE-NEXT: movdqa 80(%r8), %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm12[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps 
{{.*#+}} xmm11 = xmm11[1,2],xmm15[0,1] +; SSE-NEXT: movdqa 80(%r9), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: andnps %xmm15, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm11 +; SSE-NEXT: orps %xmm11, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm8[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdx), %xmm2 -; SSE-NEXT: movdqa 80(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa 80(%rsi), %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 80(%r8), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm8[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm15[0,1] -; SSE-NEXT: movdqa 80(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm12 -; SSE-NEXT: orps %xmm12, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm9[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: andnps %xmm9, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdx), %xmm1 +; SSE-NEXT: movdqa 96(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm11[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: andnps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm2 -; SSE-NEXT: movdqa 96(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa 96(%rsi), %xmm12 -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm13 +; SSE-NEXT: movdqa 96(%rsi), %xmm11 +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 96(%r8), %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm0[0,1] -; SSE-NEXT: movdqa 96(%r9), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm13 -; SSE-NEXT: andnps %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[3,3] +; SSE-NEXT: movdqa 96(%r8), %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm9[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[0,1] +; SSE-NEXT: movdqa 96(%r9), %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: andnps %xmm14, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm15 -; SSE-NEXT: orps %xmm15, %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: andps %xmm4, %xmm15 +; SSE-NEXT: orps %xmm15, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 
= xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: andnps %xmm1, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm11[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: andnps %xmm11, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdx), %xmm4 -; SSE-NEXT: movdqa 112(%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdx), %xmm11 +; SSE-NEXT: movdqa 112(%rcx), %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa 112(%rsi), %xmm1 +; SSE-NEXT: movdqa 112(%rsi), %xmm14 ; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[3,3] -; SSE-NEXT: movdqa 112(%r8), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm1[3,3] +; SSE-NEXT: movdqa 112(%r8), %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm13[0,1] ; SSE-NEXT: movdqa 112(%r9), %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: andnps %xmm13, %xmm12 +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: andnps %xmm13, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm15 -; SSE-NEXT: orps %xmm15, %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: andps %xmm4, %xmm15 +; SSE-NEXT: orps %xmm15, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} 
xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm4[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm11[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm13[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: andnps %xmm1, %xmm14 -; SSE-NEXT: orps %xmm0, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; SSE-NEXT: andnps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm1[0] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: andps %xmm7, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm2[0,2] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: andps %xmm15, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm14[0,2] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: andps %xmm7, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[0,2] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: andps %xmm15, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm4[1,1] -; SSE-NEXT: shufps 
{{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: andps %xmm10, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[0,2] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: andps %xmm15, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm3, 
%xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps %xmm3, 
%xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: andps %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 ; 
SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm10, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: andps %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; 
SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm10, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: andps %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa 
%xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm10, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: andps %xmm15, %xmm12 +; SSE-NEXT: por %xmm12, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: andps %xmm10, %xmm11 -; SSE-NEXT: por %xmm11, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm9 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm9[0,2] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: andps %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,1,1,1,4,5,6,7] +; SSE-NEXT: movaps %xmm10, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm11[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: andps %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] -; SSE-NEXT: andps %xmm12, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[0,2] -; SSE-NEXT: andps %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: andps %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por 
%xmm0, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2] +; SSE-NEXT: andps %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: por %xmm6, %xmm15 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, 736(%rax) -; SSE-NEXT: movdqa %xmm12, 720(%rax) -; SSE-NEXT: movdqa %xmm15, 688(%rax) -; SSE-NEXT: movdqa %xmm11, 672(%rax) -; SSE-NEXT: movdqa %xmm4, 640(%rax) -; SSE-NEXT: movdqa %xmm6, 624(%rax) -; SSE-NEXT: movdqa %xmm9, 592(%rax) -; SSE-NEXT: movdqa %xmm13, 576(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 544(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 528(%rax) +; SSE-NEXT: movdqa %xmm15, 736(%rax) +; SSE-NEXT: movdqa %xmm7, 720(%rax) +; SSE-NEXT: movdqa %xmm10, 688(%rax) +; SSE-NEXT: movdqa %xmm9, 672(%rax) +; SSE-NEXT: movdqa %xmm3, 640(%rax) +; SSE-NEXT: movdqa %xmm4, 624(%rax) +; SSE-NEXT: movdqa %xmm8, 592(%rax) +; SSE-NEXT: movdqa %xmm12, 576(%rax) +; SSE-NEXT: movdqa %xmm13, 544(%rax) +; SSE-NEXT: movdqa %xmm14, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5068,7 +5058,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps %xmm14, 752(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 704(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5702,36 +5693,35 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: subq $1480, %rsp # imm = 0x5C8 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm9 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm12 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), 
%xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm14 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] @@ -5743,16 +5733,16 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 @@ -5760,7 +5750,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] @@ -5775,12 +5765,15 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm11 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 @@ -5818,7 +5811,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] @@ -5867,14 +5860,13 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,2,3,6,5,6,7] -; 
AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -5923,16 +5915,16 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5942,9 +5934,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm2 @@ -5954,52 +5946,52 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; 
AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -6017,7 +6009,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpmovzxwd (%rsp), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] @@ -6045,7 +6037,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[8],ymm7[8],ymm15[9],ymm7[9],ymm15[10],ymm7[10],ymm15[11],ymm7[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = 
ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -6062,187 +6056,186 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[2,3,2,3] -; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = 
xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[2,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm15, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = 
ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 736(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -6250,33 +6243,30 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 544(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 640(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 640(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 512(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 448(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6287,57 +6277,58 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX2-SLOW-NEXT: addq $1480, %rsp # imm = 0x5C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FAST-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX2-FAST-NEXT: 
vmovdqa (%r9), %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm4 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 @@ -6351,21 +6342,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = 
xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 @@ -6379,43 +6371,43 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm12 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} 
ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6431,144 +6423,143 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = 
ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = 
ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = 
ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm6 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm12, %xmm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpmovzxwd (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-FAST-NEXT: vmovdqa %xmm14, %xmm9 +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm12 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,0,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -6580,7 +6571,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -6615,7 +6608,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -6631,16 +6626,17 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -6648,64 +6644,56 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 
32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,2,1,2,u,u,3,3> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: 
vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] @@ -6716,8 +6704,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -6734,7 +6721,7 @@ define void 
@store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm7 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] @@ -6773,30 +6760,37 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] -; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm10[4],mem[4],ymm10[5],mem[5],ymm10[6],mem[6],ymm10[7],mem[7],ymm10[12],mem[12],ymm10[13],mem[13],ymm10[14],mem[14],ymm10[15],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm10[4],mem[4],ymm10[5],mem[5],ymm10[6],mem[6],ymm10[7],mem[7],ymm10[12],mem[12],ymm10[13],mem[13],ymm10[14],mem[14],ymm10[15],mem[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -6814,25 +6808,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 704(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 640(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 640(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 512(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 448(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6843,388 +6834,381 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX2-FAST-NEXT: addq $1464, %rsp # imm = 0x5B8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1544, %rsp # imm = 0x608 +; AVX2-FAST-PERLANE-NEXT: subq $1448, %rsp # imm = 0x5A8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = 
[255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), 
%xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw 
{{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm4[0],ymm15[1],ymm4[1],ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[8],ymm4[8],ymm15[9],ymm4[9],ymm15[10],ymm4[10],ymm15[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = 
ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[8],ymm7[8],ymm0[9],ymm7[9],ymm0[10],ymm7[10],ymm0[11],ymm7[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[1],ymm12[1],ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[8],ymm12[8],ymm1[9],ymm12[9],ymm1[10],ymm12[10],ymm1[11],ymm12[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm2[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm3 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 
16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[2,1,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 
= ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd 
{{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -7235,140 +7219,140 @@ 
define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm6[4],mem[4],ymm6[5],mem[5],ymm6[6],mem[6],ymm6[7],mem[7],ymm6[12],mem[12],ymm6[13],mem[13],ymm6[14],mem[14],ymm6[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 
# 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[4],mem[4],ymm15[5],mem[5],ymm15[6],mem[6],ymm15[7],mem[7],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm15, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[4],mem[4],ymm13[5],mem[5],ymm13[6],mem[6],ymm13[7],mem[7],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm14, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 
32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm10[4],mem[4],ymm10[5],mem[5],ymm10[6],mem[6],ymm10[7],mem[7],ymm10[12],mem[12],ymm10[13],mem[13],ymm10[14],mem[14],ymm10[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = 
ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm13, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm14, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[12],ymm9[12],ymm6[13],ymm9[13],ymm6[14],ymm9[14],ymm6[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm13[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm7, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2,3],ymm2[4],ymm10[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 736(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7414,907 +7398,901 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX2-FAST-PERLANE-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $600, %rsp # imm = 0x258 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = 
ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[12],ymm9[12],ymm6[13],ymm9[13],ymm6[14],ymm9[14],ymm6[15],ymm9[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; 
AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm4, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = 
ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; 
AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm8[4],ymm1[4],ymm8[5],ymm1[5],ymm8[6],ymm1[6],ymm8[7],ymm1[7],ymm8[12],ymm1[12],ymm8[13],ymm1[13],ymm8[14],ymm1[14],ymm8[15],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm4[4],ymm14[5],ymm4[5],ymm14[6],ymm4[6],ymm14[7],ymm4[7],ymm14[12],ymm4[12],ymm14[13],ymm4[13],ymm14[14],ymm4[14],ymm14[15],ymm4[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: 
vpunpckhwd {{.*#+}} ymm14 = ymm8[4],ymm1[4],ymm8[5],ymm1[5],ymm8[6],ymm1[6],ymm8[7],ymm1[7],ymm8[12],ymm1[12],ymm8[13],ymm1[13],ymm8[14],ymm1[14],ymm8[15],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3],ymm0[4],ymm14[5,6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1,2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm0[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[2,3,2,3,6,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: 
vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm3[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm15 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm25, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm20, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm15, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3],ymm7[4],ymm14[5,6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm14[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm10[4],ymm14[5],ymm10[5],ymm14[6],ymm10[6],ymm14[7],ymm10[7],ymm14[12],ymm10[12],ymm14[13],ymm10[13],ymm14[14],ymm10[14],ymm14[15],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: 
vpshufd {{.*#+}} ymm16 = ymm0[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm13[0],ymm9[1],ymm13[1],ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[8],ymm13[8],ymm9[9],ymm13[9],ymm9[10],ymm13[10],ymm9[11],ymm13[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm15[4],ymm1[4],ymm15[5],ymm1[5],ymm15[6],ymm1[6],ymm15[7],ymm1[7],ymm15[12],ymm1[12],ymm15[13],ymm1[13],ymm15[14],ymm1[14],ymm15[15],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm18, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm14[0],ymm10[0],ymm14[1],ymm10[1],ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[8],ymm10[8],ymm14[9],ymm10[9],ymm14[10],ymm10[10],ymm14[11],ymm10[11] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm29, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm14, %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm13, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[8],ymm9[8],ymm6[9],ymm9[9],ymm6[10],ymm9[10],ymm6[11],ymm9[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm29, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb 
%xmm14, %xmm2, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm3[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm1[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm16, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = 
xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm12[0,1,2,3],zmm5[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm5[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm29, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm2[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm2[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm29, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] 
-; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm9[0,1,2,3],zmm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm31, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm14[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm14[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm9, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = 
zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm24 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm24 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm5[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm16, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm13[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm13[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm12, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm25[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $96, (%rsp), %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm12[0,1,2,3],zmm1[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm26 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm26, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm0[1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm26 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm26, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; 
AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm24, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm10, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm10, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm21, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm11, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm11, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $600, %rsp # imm = 0x258 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 
; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[2],ymm14[2],ymm11[3],ymm14[3],ymm11[8],ymm14[8],ymm11[9],ymm14[9],ymm11[10],ymm14[10],ymm11[11],ymm14[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[8],ymm0[8],ymm8[9],ymm0[9],ymm8[10],ymm0[10],ymm8[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm26, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = 
ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm12, %ymm26, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 ; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm25, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm25, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm29, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm10, %ymm29, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <2,2,u,3,10,u,10,11> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm25, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm29, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[12],ymm0[12],ymm11[13],ymm0[13],ymm11[14],ymm0[14],ymm11[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm8[4],ymm1[4],ymm8[5],ymm1[5],ymm8[6],ymm1[6],ymm8[7],ymm1[7],ymm8[12],ymm1[12],ymm8[13],ymm1[13],ymm8[14],ymm1[14],ymm8[15],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm10, %ymm26, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm8, %ymm29, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm1[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,2,1,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm17, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [1,0,2,2,1,0,2,2] ; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm31, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm31, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm9 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm30, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm16, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm2[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm7, %ymm16, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm4[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm31, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm16, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm2[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm30, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm5, %ymm16, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm4[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = 
ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm26, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm25, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm29, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm3[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm29, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm2[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm1, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm1[0,1,2,3],zmm25[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm26, %ymm2 +; 
AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm3, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm29, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm0[0,1,2,3],zmm25[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm31, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm15 = 
xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm15, %ymm16, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm7[0,1,2,3],zmm1[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm31, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm13, %ymm16, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm0[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm31, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm31, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm1, %zmm12, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm16, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,2,3],zmm30[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,9,2,3,8,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,u,0,1,u,10,10,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = 
ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm18, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm14, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm17, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm16, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm30[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,20,11,12,21,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm13, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,9,2,3,8,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm12, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,u,0,1,u,10,10,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm14, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm6, %ymm14, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm18[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm13, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm12, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm13, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm24, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm5, %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = 
mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm13[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm5 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm20[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm28[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm17[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm28[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm20[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm17[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm25, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm25, %zmm1, %zmm10 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) @@ -8330,13 +8308,13 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX512F-ONLY-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX512DQ-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2 @@ -8348,7 +8326,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 @@ -8383,9 +8361,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8394,27 +8372,27 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 ; AVX512DQ-SLOW-NEXT: 
vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] @@ -8428,12 +8406,12 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8469,24 +8447,24 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; 
AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm3 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} @@ -8497,45 +8475,47 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = 
ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm4 ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm5 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm4[4],ymm11[5],ymm4[5],ymm11[6],ymm4[6],ymm11[7],ymm4[7],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] @@ -8549,305 +8529,301 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm10 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[2,1,2,3,6,5,6,7] ; 
AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm2[4],ymm14[5],ymm2[5],ymm14[6],ymm2[6],ymm14[7],ymm2[7],ymm14[12],ymm2[12],ymm14[13],ymm2[13],ymm14[14],ymm2[14],ymm14[15],ymm2[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm12, %zmm6, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 ; 
AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm0[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm6[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm15 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm15[0],ymm6[0],ymm15[1],ymm6[1],ymm15[2],ymm6[2],ymm15[3],ymm6[3],ymm15[8],ymm6[8],ymm15[9],ymm6[9],ymm15[10],ymm6[10],ymm15[11],ymm6[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm6[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm18, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm15, %ymm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm6[1,2],ymm14[3],ymm6[4,5],ymm14[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm6[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm13, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm14[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm11 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm11[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} 
ymm2 = ymm4[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm13[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm2, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm8 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm13[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm15 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm11 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; 
AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm13[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm16 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm5[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm15 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm5[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = 
ymm13[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm11[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm11, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm11 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm4, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm11 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[8],ymm11[8],ymm0[9],ymm11[9],ymm0[10],ymm11[10],ymm0[11],ymm11[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm9, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm11 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm13 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; 
AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm14[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm10[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm13[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512DQ-SLOW-NEXT: 
vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm8 = mem[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm0 = mem[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm14 = mem[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm15 = mem[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm7 = mem[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm13 = mem[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm25 # 64-byte Folded Reload -; 
AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm30, %zmm19 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm19 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm31, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm17, %zmm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm7 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm16, %zmm16 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm16 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm18, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm15 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm15 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm18, %zmm11 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm6, %zmm2 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-SLOW-NEXT: addq $936, %rsp # imm = 0x3A8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQ-SLOW-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 +; AVX512DQ-FAST-NEXT: subq $1208, %rsp # imm = 0x4B8 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm0 @@ -8857,8 +8833,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 @@ -8868,572 +8844,570 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm19 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 ; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512DQ-FAST-NEXT: 
vinserti64x4 $1, %ymm1, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm28 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
<1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm20, %zmm22 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm18, %zmm21 ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm3, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm3, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm23, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11] ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,u,0,1,u,10,10,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm24, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,u,0,1,u,10,10,u> ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] ; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm20, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm18, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm4, %zmm26 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm11 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm19, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm30 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm24, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm12 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm6 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm6, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm16, %zmm5 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm5, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm23, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm7, %ymm19, %ymm17 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm24, %ymm17 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm7 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm20, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm18, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm20, %zmm25 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm20, %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm24, %ymm20 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm0 ; 
AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [2,1,2,3,11,11,11,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm27, %ymm0 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm28, %ymm1 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; 
AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm6, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm30, %ymm8 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm19, %ymm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <2,2,u,3,10,u,10,11> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm27, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm28, %ymm1 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm30, %ymm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm2, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm19, %ymm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-FAST-NEXT: 
vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,2,1,8,9,8,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm28, %zmm4 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm24[0],zero,xmm24[1],zero,xmm24[2],zero,xmm24[3],zero -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm16, %ymm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm8 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = 
xmm5[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm4 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm8 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm30[0],zero,xmm30[1],zero,xmm30[2],zero,xmm30[3],zero +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm30, %ymm5 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm5[0,1,2,3],zmm6[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm5, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm30, %ymm6 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 = 
zmm6[0,1,2,3],zmm5[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm16, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] +; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm28, %ymm9 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm10, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm24, %zmm9 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm10, %ymm19, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm29, %zmm10 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm6[0,1,2,3],zmm9[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = 
[16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm31, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm28, %ymm12 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm13, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm10, %zmm24 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm12, %ymm19, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm31, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm14 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm18, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm3, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm16, %ymm4 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm4[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm8, %xmm21 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm27, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm6, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm30, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = 
ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm10[4],ymm13[4],ymm10[5],ymm13[5],ymm10[6],ymm13[6],ymm10[7],ymm13[7],ymm10[12],ymm13[12],ymm10[13],ymm13[13],ymm10[14],ymm13[14],ymm10[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm5 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm6[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm6[4],ymm11[5],ymm6[5],ymm11[6],ymm6[6],ymm11[7],ymm6[7],ymm11[12],ymm6[12],ymm11[13],ymm6[13],ymm11[14],ymm6[14],ymm11[15],ymm6[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm27, %ymm8 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm11 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm10, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm23 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm30, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm31, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm11 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm19, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = 
xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm13, %ymm13 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm13, %zmm11, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm13, %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm13 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm13, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm13, %zmm1, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm14, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm14, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm13 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm13, %zmm28 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm16, %ymm10 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm16, %ymm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm23[0,1,2,3] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm11[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = 
xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm15, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm2, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm30, %ymm14 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm30, %ymm13 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm24[0,1,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm14[0,1,2,3],zmm5[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm5 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm13[0,1,2,3],zmm1[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm3 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm28[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm11, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm10 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm29, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512DQ-FAST-NEXT: 
vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm25[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm0 = zmm17[0,1,2,3],mem[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 448(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 576(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 512(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm29, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm23[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm22[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 448(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm27, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm0 = zmm26[0,1,2,3],mem[0,1,2,3] ; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax) -; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm0 = zmm22[0,1,2,3],mem[0,1,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm0 = zmm21[0,1,2,3],mem[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512DQ-FAST-NEXT: addq $1208, %rsp # imm = 0x4B8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = 
[0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm18, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm14, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm4, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm22, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm7, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm24, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm10, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm20, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm20, %zmm26 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm6, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm6, %zmm22 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm15, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm18, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm18, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm2, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm2, %zmm24 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm2, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm18, %zmm2 ; AVX512BW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm16 ; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm20 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm13, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm18, %zmm12 +; AVX512BW-NEXT: 
vmovdqa32 %zmm14, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm7 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm20, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm21, %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm13, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm18, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm18, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm15, %zmm2 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 473ac8a546f904..a5f9d5079b1ee6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -172,76 +172,76 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movq 
{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,3] +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: por %xmm9, %xmm5 ; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,65535] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 +; 
SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; SSE-NEXT: pandn %xmm10, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,1,3,4,5,6,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 ; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 ; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: andps %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: andnps %xmm2, %xmm3 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: andnps %xmm3, %xmm2 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movq %xmm7, 48(%rax) -; SSE-NEXT: movdqa %xmm6, 32(%rax) +; SSE-NEXT: movdqa %xmm8, 32(%rax) ; SSE-NEXT: movdqa %xmm5, 16(%rax) ; SSE-NEXT: retq ; @@ -583,146 +583,148 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i16_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm5 -; SSE-NEXT: movdqa (%rcx), %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdx), %xmm0 +; SSE-NEXT: movdqa (%rcx), %xmm10 ; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa (%r9), %xmm10 -; SSE-NEXT: movdqa (%rax), %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa (%r9), %xmm11 +; SSE-NEXT: movdqa (%rax), %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE-NEXT: 
movdqa %xmm5, %xmm7 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, %xmm13 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 ; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm12[0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm12[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm9, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] +; SSE-NEXT: por %xmm8, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm9, %xmm14 -; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm8, %xmm14 +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm10[3,3,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm15[0,2] -; SSE-NEXT: andps %xmm9, %xmm14 -; SSE-NEXT: andnps %xmm13, %xmm9 -; SSE-NEXT: orps %xmm14, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,1,0,1] +; SSE-NEXT: andps %xmm8, %xmm14 +; SSE-NEXT: andnps %xmm13, %xmm8 +; SSE-NEXT: orps %xmm14, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm14, %xmm12 ; SSE-NEXT: pandn %xmm13, %xmm14 ; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm10, %xmm12 ; SSE-NEXT: psrld $16, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: movdqa 
{{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm11[1] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] -; SSE-NEXT: pandn %xmm11, %xmm10 -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE-NEXT: psrld $16, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm12, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,2],xmm2[1,1] -; SSE-NEXT: pandn %xmm13, %xmm8 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = 
xmm6[2,3],xmm14[0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: por %xmm14, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm11, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[3,3,3,3] +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,2],xmm3[1,1] +; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm13 +; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm12[0,1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,1] ; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,0,65535] ; SSE-NEXT: andps %xmm4, %xmm6 -; SSE-NEXT: andnps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm13, %xmm4 ; SSE-NEXT: orps %xmm6, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm11[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[2,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: andps %xmm2, %xmm5 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: orps %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm3, %xmm0 +; 
SSE-NEXT: andnps %xmm2, %xmm3 +; SSE-NEXT: orps %xmm0, %xmm3 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, (%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movaps %xmm4, 64(%rax) -; SSE-NEXT: movdqa %xmm15, 16(%rax) -; SSE-NEXT: movdqa %xmm8, 32(%rax) -; SSE-NEXT: movaps %xmm9, 48(%rax) -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movdqa %xmm10, 96(%rax) +; SSE-NEXT: movdqa %xmm1, 16(%rax) +; SSE-NEXT: movdqa %xmm15, 32(%rax) +; SSE-NEXT: movaps %xmm8, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movdqa %xmm11, 96(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf8: @@ -1027,22 +1029,22 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm6 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,2,0,2] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,2,4,5,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11,12,13],ymm11[14],ymm10[15] @@ -1062,27 +1064,27 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 ; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] -; AVX512F-SLOW-NEXT: vpsrld $16, %xmm6, 
%xmm1 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] +; AVX512F-SLOW-NEXT: vpsrld $16, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpbroadcastd 12(%r10), %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6],xmm3[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6],xmm3[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u],zero,zero,zero,zero,ymm0[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[20,21,28,29,u,u,u,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512F-SLOW-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -1192,227 +1194,225 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $216, %rsp +; SSE-NEXT: subq $200, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: movdqa 16(%rdx), %xmm15 -; SSE-NEXT: movdqa 16(%rcx), %xmm1 -; SSE-NEXT: movdqa 16(%r8), %xmm8 +; SSE-NEXT: movdqa 16(%rcx), %xmm11 +; SSE-NEXT: movdqa 16(%r8), %xmm4 ; SSE-NEXT: movdqa 16(%r9), %xmm7 -; SSE-NEXT: movdqa 16(%rax), %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm8 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: andnps %xmm4, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; 
SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] -; SSE-NEXT: andnps %xmm1, %xmm6 -; SSE-NEXT: orps %xmm0, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: movdqa (%r9), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa (%rax), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: 
movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa (%r8), %xmm6 +; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 ; SSE-NEXT: movdqa (%rdx), %xmm13 -; SSE-NEXT: movdqa (%rcx), %xmm5 +; SSE-NEXT: movdqa (%rcx), %xmm9 ; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm14[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[0,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm7[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2] -; SSE-NEXT: andps %xmm8, %xmm1 -; SSE-NEXT: orps %xmm6, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movaps (%rsp), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[3,3] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = 
xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: orps %xmm7, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm14, %xmm7 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[2,2,2,2] -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: andps %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: por %xmm5, %xmm14 ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; 
SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm10[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm12[2,0] +; SSE-NEXT: movaps %xmm10, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,6,7] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: andps %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm8 +; SSE-NEXT: orps %xmm1, %xmm8 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: andps %xmm1, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: orps %xmm8, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1428,100 +1428,102 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm7[1,1] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm5, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 -; SSE-NEXT: movaps (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,2],xmm11[1,1] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: andnps %xmm9, %xmm6 -; SSE-NEXT: orps %xmm1, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm12, %xmm9 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm12[2,1] -; SSE-NEXT: andps %xmm1, %xmm11 -; SSE-NEXT: orps %xmm9, %xmm11 -; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,2],xmm11[1,1] +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm7, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,1] -; SSE-NEXT: andps %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm15[0] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,2],xmm10[1,1] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: andnps %xmm7, %xmm1 +; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm6[2,1] +; SSE-NEXT: andps %xmm11, %xmm10 +; SSE-NEXT: orps %xmm3, %xmm10 +; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm9[0,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,1] +; SSE-NEXT: andps %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[0],mem[0] ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: andps %xmm4, %xmm13 -; SSE-NEXT: por %xmm13, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: andps %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: andps %xmm3, %xmm13 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: andps %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm15, %xmm3 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm4, 112(%rax) -; SSE-NEXT: movdqa %xmm5, (%rax) -; SSE-NEXT: movdqa %xmm1, 176(%rax) -; SSE-NEXT: movaps %xmm11, 64(%rax) -; SSE-NEXT: movaps %xmm6, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movdqa %xmm3, 112(%rax) +; SSE-NEXT: movdqa %xmm4, (%rax) +; SSE-NEXT: movdqa %xmm11, 176(%rax) +; SSE-NEXT: movaps %xmm10, 64(%rax) +; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movdqa %xmm14, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1531,13 +1533,13 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps %xmm3, 144(%rax) +; SSE-NEXT: movaps %xmm2, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movdqa %xmm8, 96(%rax) +; SSE-NEXT: movdqa %xmm5, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: addq $216, %rsp +; SSE-NEXT: addq $200, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf16: @@ -1568,48 +1570,48 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm10 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm13 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1,2,3,4,5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm11[1,2,3,4,5,6],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3,4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm2[6],xmm10[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm13[3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm0 @@ -1621,27 +1623,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5],xmm10[6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,5],xmm11[6],xmm3[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,2],xmm12[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[0,2],xmm12[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm1 @@ -1650,32 +1652,32 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd 
{{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm11[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[0,2],xmm13[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm14 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm4[0,2],xmm13[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] @@ -1689,10 +1691,10 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1700,7 +1702,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -1710,12 +1713,12 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] @@ -1725,8 +1728,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm2[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm10[1],xmm2[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -1740,11 +1743,11 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,u,u,u,u,u,u,u,u,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 @@ -1774,25 +1777,24 @@ define void @store_i16_stride7_vf16(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: subq $40, %rsp ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 @@ -1800,7 +1802,6 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = @@ -1817,17 +1818,17 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm12 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = 
ymm8[0,0,1,1] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 @@ -1841,52 +1842,50 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm14 +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7,8,9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] @@ -1903,51 +1902,46 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm3[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6,7,8],ymm11[9],ymm1[10,11],ymm11[12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = 
<255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 @@ -1957,11 +1951,10 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 192(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: addq $40, %rsp @@ -2145,20 +2138,18 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = @@ -2166,6 +2157,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm2[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> @@ -2176,20 +2168,20 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm0 @@ -2198,103 +2190,102 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3],ymm11[4,5],ymm1[6],ymm11[7,8,9,10],ymm1[11],ymm11[12,13],ymm1[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] 
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7,8,9],ymm11[10],ymm15[11,12],ymm11[13],ymm15[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,3,3,3,6,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm10, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb 
%ymm11, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6,7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6,7,8],ymm1[9],ymm12[10,11],ymm1[12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7,8,9],ymm1[10],ymm11[11,12],ymm1[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm8[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7,8,9],ymm1[10],ymm12[11,12],ymm1[13],ymm12[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = 
[255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 @@ -2304,9 +2295,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2318,534 +2309,532 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride7_vf16: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm4, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[12,13,14,15],zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm8[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm4, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[16,17,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm5, %ymm10, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = 
ymm8[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm4, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,0,1,17,17,2,0,0,16,0,1,17,17,2,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u],zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[16,17,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,14,15],zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm7[u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm2, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm10[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[12,13,14,15],zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm9[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm6, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm11, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[16,17,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8,9,10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7,8,9],ymm11[10],ymm15[11,12],ymm11[13],ymm15[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm12, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,16,0,1,17,17,2,0,0,16,0,1,17,17,2,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm11, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7,8],ymm13[9],ymm11[10,11],ymm13[12],ymm11[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm4, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw 
{{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7,8,9,10],ymm9[11],ymm6[12,13],ymm9[14],ymm6[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[2,1,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm22[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm12[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm13, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = 
ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm1, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,3,3,6,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm10, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm19[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, 
%ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm0, 192(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf16: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[12,13,14,15],zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm7[u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm3, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm3, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,u,3,2,u,10,10,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u],zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[16,17,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vporq %ymm3, %ymm4, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6,7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm5[2],ymm14[3,4],ymm5[5],ymm14[6,7,8,9],ymm5[10],ymm14[11,12],ymm5[13],ymm14[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,u,3,2,u,10,10,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd 
{{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm0, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm14, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = 
xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm10, %ymm20, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm13, %ymm20, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm8, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,u,u,u,7,u,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[16,17,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7,8,9,10],ymm9[11],ymm7[12,13],ymm9[14],ymm7[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <6,u,u,u,7,u,u,7> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm0 = 
zmm19[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm13, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm10[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, 192(%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rcx) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride7_vf16: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm13 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] -; 
AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm4, %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[12,13,14,15],zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm8[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm4, %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[16,17,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpandn %ymm5, %ymm10, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm20 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm4, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm18 = [0,16,0,1,17,17,2,0,0,16,0,1,17,17,2,0] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, 
%zmm11, %zmm18 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u],zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[16,17,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,14,15],zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm7[u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm2, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm10[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[12,13,14,15],zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm9[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm6, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u] +; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm11, %ymm18 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpandn %ymm0, %ymm11, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[16,17,u,u] +; AVX512DQ-SLOW-NEXT: 
vinserti64x4 $1, %ymm11, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8,9,10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm3, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7,8,9],ymm11[10],ymm15[11,12],ymm11[13],ymm15[14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm19 +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm12, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm20 = [0,16,0,1,17,17,2,0,0,16,0,1,17,17,2,0] +; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm11, %zmm20 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7,8],ymm13[9],ymm11[10,11],ymm13[12],ymm11[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} 
xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm4, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7,8,9,10],ymm9[11],ymm6[12,13],ymm9[14],ymm6[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[2,1,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; 
AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm22[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm12[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm3 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermd %zmm13, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm4 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermd %zmm1, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,3,3,6,7,7,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-SLOW-NEXT: vpternlogq 
$216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm6 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm10, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm21 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm21 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm19[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5] +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 192(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rcx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf16: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm7[12,13,14,15],zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm7[u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm3, %ymm16 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm11 ; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm3, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,u,3,2,u,10,10,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u],zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[16,17,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq 
%ymm3, %ymm4, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6,7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm5[2],ymm14[3,4],ymm5[5],ymm14[6,7,8,9],ymm5[10],ymm14[11,12],ymm5[13],ymm14[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,u,3,2,u,10,10,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm0, %zmm18 ; AVX512DQ-FAST-NEXT: vprold $16, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm15 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm14 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm14, %zmm19 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm12 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandnq %ymm10, %ymm20, %ymm10 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpandnq %ymm13, %ymm20, %ymm13 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermd %zmm8, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,u,u,u,7,u,u,7> -; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[16,17,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7,8,9,10],ymm9[11],ymm7[12,13],ymm9[14],ymm7[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <6,u,u,u,7,u,u,7> +; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm10[2,2,2,3,6,6,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5] +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm0 ; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 192(%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 64(%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -2917,705 +2906,695 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $680, %rsp # imm = 0x2A8 +; SSE-NEXT: subq $664, %rsp # imm = 0x298 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm1 -; SSE-NEXT: movdqa 48(%rcx), %xmm5 -; SSE-NEXT: movdqa 48(%r8), %xmm9 +; SSE-NEXT: movdqa 48(%rcx), %xmm6 +; SSE-NEXT: movdqa 48(%r8), %xmm8 ; SSE-NEXT: movdqa 48(%r9), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rax), %xmm7 +; SSE-NEXT: movaps 48(%rax), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: andnps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: orps 
%xmm2, %xmm1 +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: andnps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: orps %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: andps %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: movdqa (%r9), %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm0, %xmm3 -; 
SSE-NEXT: orps %xmm2, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 16(%r8), %xmm10 -; SSE-NEXT: movdqa 16(%r9), %xmm8 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa 16(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] +; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 16(%r8), %xmm5 +; SSE-NEXT: movdqa 16(%r9), %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm0, %xmm3 -; SSE-NEXT: orps %xmm2, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = 
xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm10 -; SSE-NEXT: movdqa 32(%r9), %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 32(%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = 
xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE-NEXT: movdqa %xmm8, %xmm6 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm5, %xmm4 -; SSE-NEXT: andnps %xmm2, %xmm5 -; SSE-NEXT: orps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0,2] +; SSE-NEXT: andps %xmm15, %xmm2 +; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 
= xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1] -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 32(%r8), %xmm14 +; SSE-NEXT: movdqa 32(%r9), %xmm13 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 32(%rcx), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa 32(%rdx), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa 32(%rsi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: 
movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,2,2,2] +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm10[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm5, %xmm9 +; SSE-NEXT: andnps %xmm3, %xmm5 +; SSE-NEXT: orps %xmm9, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm6[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: 
pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; SSE-NEXT: psrlq $48, %xmm15 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 
# 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm14 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: por %xmm9, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm12 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,2,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: psrlq $48, %xmm7 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm5[1,1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: andnps %xmm2, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; SSE-NEXT: psrlq $48, %xmm15 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm15[1] +; SSE-NEXT: movdqa 
%xmm12, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,5,4] +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm9[1,1] +; SSE-NEXT: movaps %xmm9, %xmm15 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: andnps %xmm3, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: orps %xmm2, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,2],xmm1[1,1] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: andnps %xmm9, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: orps %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm13[1,1] +; SSE-NEXT: movaps %xmm13, %xmm3 +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: andnps %xmm0, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: orps %xmm2, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm10[1,1] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: andnps %xmm2, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: movaps %xmm12, %xmm10 -; SSE-NEXT: andnps %xmm1, %xmm10 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] -; SSE-NEXT: andnps %xmm1, %xmm12 -; SSE-NEXT: orps %xmm0, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2,2],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[1,1] +; SSE-NEXT: andnps %xmm2, %xmm4 +; SSE-NEXT: orps %xmm0, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,2],mem[2,0] ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm13 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm13, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: andnps %xmm0, %xmm13 +; SSE-NEXT: orps %xmm2, %xmm13 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm2, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: orps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm1[2,1] +; SSE-NEXT: andps %xmm13, %xmm15 +; SSE-NEXT: orps %xmm0, 
%xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm11 -; SSE-NEXT: orps %xmm1, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movapd %xmm7, %xmm13 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm13 -; SSE-NEXT: orps %xmm1, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1],mem[0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,1] -; SSE-NEXT: andps %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm1[2,1] +; SSE-NEXT: andps %xmm13, %xmm14 +; SSE-NEXT: orps %xmm0, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; 
SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] +; SSE-NEXT: andps %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[0],mem[0] ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: andps %xmm6, %xmm8 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: andps %xmm2, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: andps %xmm2, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0],mem[0] +; SSE-NEXT: shufps $98, (%rsp), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: andps %xmm2, %xmm11 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: andps %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm11, %xmm2 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm6, 336(%rax) -; SSE-NEXT: movdqa %xmm9, 224(%rax) +; SSE-NEXT: movdqa %xmm2, 336(%rax) +; SSE-NEXT: movdqa %xmm10, 224(%rax) ; SSE-NEXT: movdqa %xmm8, 112(%rax) -; SSE-NEXT: movdqa %xmm1, (%rax) -; SSE-NEXT: movdqa %xmm2, 288(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm11, 64(%rax) +; SSE-NEXT: movdqa %xmm0, (%rax) +; SSE-NEXT: movdqa %xmm13, 288(%rax) +; SSE-NEXT: movaps %xmm14, 176(%rax) +; SSE-NEXT: movaps %xmm15, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) -; SSE-NEXT: movaps %xmm12, 368(%rax) +; SSE-NEXT: movaps %xmm4, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3623,7 +3602,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) -; SSE-NEXT: movaps %xmm10, 256(%rax) +; SSE-NEXT: movaps %xmm9, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3631,8 +3610,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm6, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3640,12 +3618,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps %xmm12, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm15, 320(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, 320(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) @@ -3655,192 +3632,194 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 400(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rax) -; SSE-NEXT: addq $680, %rsp # imm = 0x2A8 +; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $584, %rsp # imm = 0x248 +; AVX1-ONLY-NEXT: subq $568, %rsp # imm = 0x238 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd 
{{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm13 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm9 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1,2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,5,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm6[6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm1 -; 
AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm6[6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm6[3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,2],xmm6[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm6[3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm7, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm14, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm5[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; 
AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] @@ -3864,77 +3843,76 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} 
xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm12[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[0,1,0,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; 
AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm2[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm15[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -3942,34 +3920,34 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm13[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm5 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[0,2],xmm13[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm10[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 @@ -3978,149 +3956,151 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; 
AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm2[3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm1[3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $68, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = 
xmm11[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 ; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps 
%ymm6, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4135,56 +4115,60 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) -; AVX1-ONLY-NEXT: addq $584, %rsp # imm = 0x248 +; AVX1-ONLY-NEXT: addq $568, %rsp # imm = 0x238 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-SLOW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm14 ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm3[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm12 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,0,3,4,5,4,7] @@ -4215,22 +4199,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm9[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4248,9 +4232,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] @@ -4270,115 +4254,112 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm15, %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = 
[255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm15, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm3[2],xmm12[3,4],xmm3[5],xmm12[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm9 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2],xmm14[3,4],xmm3[5],xmm14[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1],xmm4[2],xmm9[3,4],xmm4[5],xmm9[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1],xmm4[2],xmm14[3,4],xmm4[5],xmm14[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm14 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; 
AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -4396,167 +4377,166 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = 
ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpshufd $212, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7,8,9],ymm5[10],ymm7[11,12],ymm5[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8,9,10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8,9,10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8,9,10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 320(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 288(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm14, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 
384(%rax) -; AVX2-SLOW-NEXT: addq $616, %rsp # imm = 0x268 +; AVX2-SLOW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $312, %rsp # imm = 0x138 +; AVX2-FAST-NEXT: subq $248, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> @@ -4565,26 +4545,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; 
AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] @@ -4593,128 +4569,129 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; 
AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7,8,9],ymm2[10],ymm10[11,12],ymm2[13],ymm10[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7,8,9,10],ymm13[11],ymm1[12,13],ymm13[14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm11 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7,8,9,10],ymm9[11],ymm5[12,13],ymm9[14],ymm5[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8,9,10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm2 
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, 
%ymm9, %ymm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4723,141 +4700,133 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm11 -; 
AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm10, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1],xmm4[2],xmm11[3,4],xmm4[5],xmm11[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; 
AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3,4],xmm6[5],xmm0[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 -; 
AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 
8(%rax), %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) @@ -4872,56 +4841,59 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 192(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 256(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-NEXT: addq $312, %rsp # imm = 0x138 +; AVX2-FAST-NEXT: addq $248, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = ; 
AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm12, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm15, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 @@ -4930,7 +4902,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 @@ -4956,15 +4928,15 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4974,8 +4946,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4985,11 +4957,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4998,7 +4970,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = 
ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -5007,97 +4979,96 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: 
vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm9, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, 
%ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2],xmm9[3,4],xmm5[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = 
[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] @@ -5107,7 +5078,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] @@ -5121,1372 +5092,1334 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 40(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: 
vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,3,3,3,6,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7,8,9],ymm5[10],ymm7[11,12],ymm5[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm13, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb 
%ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 320(%rax) +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $616, %rsp # imm = 0x268 +; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride7_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm9, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm1, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm15, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm1[2],ymm8[3,4],ymm1[5],ymm8[6,7,8,9],ymm1[10],ymm8[11,12],ymm1[13],ymm8[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = 
ymm8[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm10, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7,8,9],ymm4[10],ymm10[11,12],ymm4[13],ymm10[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm4, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2,3],xmm7[4],xmm10[5,6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm10, %zmm7, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm0, %zmm11, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} 
ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,1,3,4,5,5,7] +; 
AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8,9,10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm3, %ymm8, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = 
ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6,7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6,7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8,9,10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm1, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm1, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3],xmm0[4],xmm6[5,6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm4, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7,8,9],ymm3[10],ymm12[11,12],ymm3[13],ymm12[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7,8,9],ymm10[10],ymm13[11,12],ymm10[13],ymm13[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7,8],ymm14[9],ymm10[10,11],ymm14[12],ymm10[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm22[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm19[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm31, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm27[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm24[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm23[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; 
AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[2,3,3,3,6,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm21[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm19[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm8[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm17[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm22, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: 
vinserti64x4 $1, %ymm7, %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm22[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd $254, (%rsp), %ymm22 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm22 = mem[2,3,3,3,6,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm21[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm17[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm10[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm15, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm22[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermd (%rax), %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm30, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermd (%rax), %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-ONLY-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $232, %rsp -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: subq $152, %rsp +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm7, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm7, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa 32(%rcx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm2, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8,9,10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] -; 
AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,3,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm27 -; 
AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm13, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,0,1,1,12,13,u,15> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7,8,9,10],ymm13[11],ymm15[12,13],ymm13[14],ymm15[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm13, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,0,1,1,12,13,u,15> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm26 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8,9,10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm1, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,2,3,8,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm3, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7,8,9],ymm12[10],ymm4[11,12],ymm12[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,1,3,2,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm22, %zmm17, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,1,u,3,10,10,11,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm29, %zmm27 
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [2,2,2,3,8,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm7, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,3,2,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm24, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm7[1],xmm11[2,3],xmm7[4],xmm11[5,6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,8,8,10,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm6 = 
xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm13, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2,3],xmm6[4],xmm12[5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3],xmm3[4],xmm11[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm26, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7,8,9],ymm5[10],ymm10[11,12],ymm5[13],ymm10[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm25, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw 
{{.*#+}} xmm7 = xmm11[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7,8],ymm2[9],ymm12[10,11],ymm2[12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm31[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2,3],ymm2[4],ymm10[5,6,7,8],ymm2[9],ymm10[10,11],ymm2[12],ymm10[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm31[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <6,u,u,u,7,u,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm16, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm23, %zmm17, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm24, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm6, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $232, %rsp +; 
AVX512F-ONLY-FAST-NEXT: addq $152, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride7_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm14 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm7 -; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm3 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = 
[128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm8 +; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 -; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm9, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX512DQ-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm8, %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm15, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm1[2],ymm8[3,4],ymm1[5],ymm8[6,7,8,9],ymm1[10],ymm8[11,12],ymm1[13],ymm8[14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm10 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm10, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7,8,9],ymm4[10],ymm10[11,12],ymm4[13],ymm10[14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm4, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm10[0],xmm7[1],xmm10[2,3],xmm7[4],xmm10[5,6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm10, %zmm7, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm0, %zmm11, %zmm27 +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm28 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = 
ymm15[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpandn %ymm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8,9,10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpandn %ymm3, %ymm8, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw 
{{.*#+}} ymm2 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6,7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6,7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8,9,10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm1, %xmm3 -; 
AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm1, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3],xmm0[4],xmm6[5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm4, %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = 
ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7,8,9],ymm3[10],ymm12[11,12],ymm3[13],ymm12[14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7,8,9],ymm10[10],ymm13[11,12],ymm10[13],ymm13[14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7,8],ymm14[9],ymm10[10,11],ymm14[12],ymm10[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm22[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; 
AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm19[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm31, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm27[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm24[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm23[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm17 = mem[2,3,3,3,6,7,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm21[0,0,2,1] 
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm19[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm8[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm17[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm22, %zmm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm31, %zmm5 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm4 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm31, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq $234, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm22[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd $254, (%rsp), %ymm22 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm22 = mem[2,3,3,3,6,7,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm21[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm17[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm7 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm10[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm15, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm22[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm8 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: 
vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm5 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermd (%rax), %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm30, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm6 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermd (%rax), %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512DQ-SLOW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $232, %rsp -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: subq $152, %rsp +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 
%ymm1, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm7 -; AVX512DQ-FAST-NEXT: vporq %ymm6, %ymm7, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm7 +; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm7, %ymm26 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm1 ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm1 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; 
AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8,9,10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm24 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm28 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9] ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm22 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512DQ-FAST-NEXT: vprold $16, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm8, %xmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,0,1,1,12,13,u,15> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = 
[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7,8,9,10],ymm13[11],ymm15[12,13],ymm13[14],ymm15[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm13, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,0,1,1,12,13,u,15> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm26 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm15 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[0,1,1,3,4,5,5,7] -; 
AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8,9,10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm1, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm13 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vprold $16, %ymm3, %ymm12 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7,8,9],ymm12[10],ymm4[11,12],ymm12[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm22, %zmm17, %zmm13 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm4 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} 
xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm30 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,1,u,3,10,10,11,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm29, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vprold $16, %ymm7, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogq $228, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm24 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm24, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm7[1],xmm11[2,3],xmm7[4],xmm11[5,6],xmm7[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,8,8,10,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm13, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm12[0],xmm6[1],xmm12[2,3],xmm6[4],xmm12[5,6],xmm6[7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3],xmm3[4],xmm11[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm26, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7,8,9],ymm5[10],ymm10[11,12],ymm5[13],ymm10[14,15] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm12 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm25, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7,8],ymm2[9],ymm12[10,11],ymm2[12],ymm12[13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm12 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm31[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2,3],ymm2[4],ymm10[5,6,7,8],ymm2[9],ymm10[10,11],ymm2[12],ymm10[13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm31[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa 
{{.*#+}} ymm2 = <6,u,u,u,7,u,u,7> -; AVX512DQ-FAST-NEXT: vpermd %ymm16, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm5 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm23, %zmm17, %zmm19 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm30 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm14 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm14 -; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm24, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm28 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm30 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm14 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; 
AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm2 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $232, %rsp +; AVX512DQ-FAST-NEXT: addq $152, %rsp ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -6494,16 +6427,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 @@ -6512,26 +6445,26 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm0, %zmm5 ; AVX512BW-NEXT: movl $1623294726, %ecx # imm = 0x60C18306 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm8 ; AVX512BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm10 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} @@ -6540,86 +6473,86 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 ; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm11 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm9 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 ; AVX512BW-NEXT: movl $405823681, %ecx # imm = 0x183060C1 ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm11 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm12 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm10 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = 
[0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm11 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm13 ; AVX512BW-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm12 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm14 ; AVX512BW-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm6 ; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm6 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm6 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax) @@ -6648,77 +6581,79 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1640, %rsp # imm = 0x668 +; SSE-NEXT: subq $1608, %rsp # imm = 0x648 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 112(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdx), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm1 ; SSE-NEXT: movdqa 96(%rcx), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rcx), %xmm6 -; SSE-NEXT: movdqa 112(%r8), %xmm4 +; SSE-NEXT: movdqa 112(%rcx), %xmm2 +; SSE-NEXT: movdqa 112(%r8), %xmm7 ; SSE-NEXT: movdqa 112(%r9), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rax), %xmm7 +; SSE-NEXT: movaps 112(%rax), %xmm6 ; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm4 -; SSE-NEXT: andnps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: orps %xmm4, %xmm1 +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 96(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = 
xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa 96(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa 96(%r9), %xmm1 @@ -6728,1384 +6663,1369 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movdqa 96(%rax), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm5, %xmm3 +; SSE-NEXT: andps %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; SSE-NEXT: movdqa %xmm14, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm5, %xmm3 +; SSE-NEXT: andps %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1] +; SSE-NEXT: movdqa (%rax), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: movdqa (%r8), %xmm11 ; SSE-NEXT: movdqa (%r9), %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd 
{{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa (%rcx), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm12, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%r8), %xmm14 -; SSE-NEXT: movdqa 16(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 
+; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: andps %xmm10, %xmm3 ; SSE-NEXT: orps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] 
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm14 -; SSE-NEXT: movdqa 32(%r9), %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa 16(%r8), %xmm7 +; SSE-NEXT: movdqa 16(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%rcx), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 16(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 32(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 32(%rsi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm7, 
%xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: punpckhwd 
{{.*#+}} xmm5 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 48(%r8), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%r9), %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa 32(%r8), %xmm0 +; SSE-NEXT: movdqa 32(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 48(%rcx), %xmm0 +; SSE-NEXT: movdqa 32(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: movdqa 32(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rsi), %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: 
movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm5, %xmm15 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm15, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm2, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm10, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 
= xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 64(%r8), %xmm1 -; SSE-NEXT: movdqa 64(%r9), %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa 48(%r8), %xmm0 +; SSE-NEXT: movdqa 48(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 64(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa 48(%rcx), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 64(%rdx), %xmm0 +; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm8 -; SSE-NEXT: movdqa 64(%rsi), %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 48(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm9, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm8, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; 
SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rax), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 80(%r8), %xmm1 -; SSE-NEXT: movdqa 80(%r9), %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa 64(%r8), %xmm0 +; SSE-NEXT: movdqa 64(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 80(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa 64(%rcx), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 80(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa 64(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa 64(%rdi), 
%xmm13 +; SSE-NEXT: movdqa 64(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 -; SSE-NEXT: movdqa 80(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; SSE-NEXT: andps %xmm9, %xmm4 +; SSE-NEXT: orps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm8 -; 
SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rax), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa 80(%r8), %xmm10 +; SSE-NEXT: movdqa 80(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa 80(%rcx), %xmm9 ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa 80(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm3 +; SSE-NEXT: movdqa 80(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,4,4,4] +; 
SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0,2] +; SSE-NEXT: andps %xmm6, %xmm0 ; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd (%rsp), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[3,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movaps %xmm13, %xmm3 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte 
Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: psrld $16, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: por %xmm5, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[3,3] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm10, 
%xmm1 +; SSE-NEXT: andnps %xmm4, %xmm10 +; SSE-NEXT: orps %xmm1, %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm5[3,3] +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: por %xmm5, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: andps %xmm5, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = 
[65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: andps %xmm2, %xmm5 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,2,2] ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[1,1,1,1,4,5,6,7] +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; 
SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: andps %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,1,3] +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,5,4] +; SSE-NEXT: 
movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa 
%xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: 
psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,5,4] +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[1,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm8[1,1] ; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: andnps %xmm3, %xmm4 ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] +; SSE-NEXT: orps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufhw $164, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] -; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm7 ; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm3, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: orps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] -; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm6[1,1] ; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm3, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: 
orps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm14[1,1] ; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm3, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: orps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, 
%xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[1,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm15[1,1] +; SSE-NEXT: movaps %xmm15, %xmm8 ; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm3, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: orps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm15[1,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm9[1,1] ; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm3, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: orps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1] -; SSE-NEXT: andnps %xmm1, %xmm5 -; SSE-NEXT: orps %xmm0, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0],mem[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm5[1,1] +; SSE-NEXT: andnps %xmm3, %xmm2 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[0],mem[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: andps %xmm2, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: orps %xmm9, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: andps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: andnps %xmm1, %xmm5 -; SSE-NEXT: orps %xmm7, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2,2],mem[2,0] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm1, %xmm5 +; SSE-NEXT: andps %xmm2, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,5,6,6,7] +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; SSE-NEXT: andnps %xmm3, %xmm2 +; SSE-NEXT: orps %xmm10, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: andps %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE-NEXT: andnps %xmm3, %xmm10 +; SSE-NEXT: orps %xmm2, %xmm10 +; 
SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,2],mem[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm5, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE-NEXT: andnps %xmm0, %xmm5 -; SSE-NEXT: orps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,0,65535] ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movapd %xmm2, %xmm10 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm3[2,1] +; SSE-NEXT: andps %xmm4, %xmm10 +; SSE-NEXT: orps %xmm0, %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movapd %xmm10, %xmm5 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm5 -; SSE-NEXT: orps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm12 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm12 -; SSE-NEXT: orps %xmm1, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm11 -; SSE-NEXT: orps %xmm1, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movaps %xmm14, %xmm9 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm9 -; SSE-NEXT: orps %xmm1, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, 
%xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm7[2,1] +; SSE-NEXT: andps %xmm4, %xmm12 +; SSE-NEXT: orps %xmm0, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movapd %xmm15, %xmm10 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm7[2,1] +; SSE-NEXT: andps %xmm4, %xmm10 +; SSE-NEXT: orps %xmm0, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm7 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm7 -; SSE-NEXT: orps %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movapd %xmm14, %xmm5 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm5 -; SSE-NEXT: orps %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,1] +; SSE-NEXT: andps 
%xmm4, %xmm7 +; SSE-NEXT: orps %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm6[2,1] +; SSE-NEXT: andps %xmm4, %xmm13 +; SSE-NEXT: orps %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm8 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[2,1] +; SSE-NEXT: andps %xmm4, %xmm8 +; SSE-NEXT: orps %xmm0, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] +; SSE-NEXT: andps %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm10[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: andps %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: andps %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: andps %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: andps %xmm4, %xmm13 -; SSE-NEXT: por %xmm13, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: movaps (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] -; SSE-NEXT: pshufd $80, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: andps %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: andps %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0] +; SSE-NEXT: shufps $98, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: andps %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm4, 672(%rax) -; SSE-NEXT: movdqa %xmm15, 560(%rax) -; SSE-NEXT: movdqa %xmm13, 448(%rax) +; SSE-NEXT: movdqa %xmm1, 672(%rax) +; SSE-NEXT: movdqa %xmm14, 560(%rax) +; SSE-NEXT: movdqa %xmm15, 448(%rax) ; SSE-NEXT: movdqa %xmm3, 336(%rax) -; SSE-NEXT: movdqa %xmm6, 224(%rax) -; SSE-NEXT: movdqa %xmm8, 112(%rax) -; SSE-NEXT: movdqa %xmm10, (%rax) -; SSE-NEXT: movdqa %xmm2, 736(%rax) -; SSE-NEXT: movaps %xmm5, 624(%rax) -; SSE-NEXT: movaps %xmm7, 512(%rax) -; SSE-NEXT: movaps %xmm9, 400(%rax) -; SSE-NEXT: movaps %xmm11, 288(%rax) +; SSE-NEXT: movdqa %xmm5, 224(%rax) +; SSE-NEXT: movdqa %xmm6, 112(%rax) +; SSE-NEXT: movdqa %xmm9, (%rax) +; SSE-NEXT: movdqa %xmm4, 736(%rax) +; SSE-NEXT: movaps %xmm8, 624(%rax) +; SSE-NEXT: movaps %xmm13, 512(%rax) +; SSE-NEXT: movaps %xmm7, 400(%rax) +; SSE-NEXT: movaps %xmm10, 288(%rax) ; SSE-NEXT: movaps %xmm12, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -8200,99 +8120,99 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: addq $1640, %rsp # imm = 0x668 +; SSE-NEXT: addq $1608, %rsp # imm = 0x648 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] 
-; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm2[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm0[1],xmm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm7[1,2,3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2,3,4,5,6],xmm11[7] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm11[2],xmm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm11[2],xmm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm6[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 ; AVX1-ONLY-NEXT: vandps %ymm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm15 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5],xmm12[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,3],xmm11[4,5,6,7] ; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm8 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,5,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2,3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3,4,5,6],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8 @@ -8301,27 +8221,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm8[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; 
AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4],xmm4[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 @@ -8331,15 +8251,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6],xmm5[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm3[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 @@ -8351,73 +8271,74 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] 
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7] ; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm4[1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,2],xmm4[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 @@ -8432,22 +8353,23 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 ; 
AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[0,2],xmm3[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,2],xmm3[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 @@ -8469,9 +8391,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] @@ -8479,7 +8400,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] @@ -8492,9 +8413,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 
%xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -8507,50 +8428,52 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm11[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm3, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6],xmm7[7] +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6],xmm2[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[0,2],xmm5[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm6 @@ -8568,38 +8491,36 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 +; 
AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,2],xmm11[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -8614,17 +8535,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = 
[65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 @@ -8632,68 +8553,70 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa 64(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm12[5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm13, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6],xmm9[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq 
{{.*#+}} xmm7 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 @@ -8703,47 +8626,47 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4],xmm6[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,2],xmm2[1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 80(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4],xmm6[5],xmm13[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,2],xmm0[1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -8761,78 +8684,77 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm3[3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 ; 
AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3,4],xmm13[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm14[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4],xmm12[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm3, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6],xmm13[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm10, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm0[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[1,3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm13[6],xmm12[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 
+; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm14[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 @@ -8936,8 +8858,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vorps %ymm13, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -8947,15 +8869,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm10, %ymm12 ; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3] @@ -8965,31 +8887,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm12 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm3[1],xmm12[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm8, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 @@ -9009,78 +8932,75 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: 
vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm10, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = 
xmm13[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm3[1],xmm11[1] -; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm3, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm12 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm5[1],xmm12[1] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm4, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, 
%ymm1, %ymm1 @@ -9232,76 +9152,75 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,3,u,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm11, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm10, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,u,u,u,4,u,u,4> +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm5, 
%ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9315,12 +9234,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] @@ -9345,7 +9264,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -9416,37 +9335,39 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX2-SLOW-NEXT: 
vmovdqa %xmm4, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] @@ -9461,14 +9382,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 @@ -9476,160 +9396,156 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; 
AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm15, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm15, %ymm14, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd $165, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb 
%ymm0, %ymm6, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2],xmm14[3,4],xmm6[5],xmm14[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3],xmm1[4],xmm14[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] 
+; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3],xmm4[4],xmm14[5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2],xmm14[3,4],xmm5[5],xmm14[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2],xmm14[3,4],xmm3[5],xmm14[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm8, %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; 
AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] @@ -9640,7 +9556,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] @@ -9650,17 +9567,16 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] @@ -9708,50 +9624,66 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,3,u,u,u,4> -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <3,u,u,3,u,u,u,4> +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte 
Folded Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -9760,159 +9692,158 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} 
ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] -; 
AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = 
[255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = 
mem[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm9 -; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm9, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7,8,9],ymm8[10],ymm10[11,12],ymm8[13],ymm10[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7,8,9],ymm9[10],ymm11[11,12],ymm9[13],ymm11[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm10, %ymm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm15[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm15 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm12, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} 
ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 +; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] ; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6,7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] ; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7] @@ -9922,10 +9853,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = 
ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8,9,10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] @@ -9937,55 +9866,40 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8,9,10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15] -; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7,8,9],ymm11[10],ymm13[11,12],ymm11[13],ymm13[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm11, %ymm13, %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7,8,9],ymm13[10],ymm14[11,12],ymm13[13],ymm14[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,3,3,3,6,7,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rax) @@ -9993,20 +9907,21 @@ 
define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 640(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 608(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 640(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 608(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 416(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 384(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 416(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 768(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 736(%rax) @@ -10044,7 +9959,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-NEXT: subq $1288, %rsp # imm = 0x508 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 @@ -10084,300 +9999,296 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} ymm2 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm4, %ymm5 ; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8,9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7,8,9],ymm9[10],ymm1[11,12],ymm9[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7,8,9],ymm6[10],ymm15[11,12],ymm6[13],ymm15[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm14 -; 
AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7,8,9],ymm14[10],ymm4[11,12],ymm14[13],ymm4[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm15, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7,8],ymm5[9],ymm15[10,11],ymm5[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; 
AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7,8,9],ymm14[10],ymm0[11,12],ymm14[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7,8,9],ymm14[10],ymm2[11,12],ymm14[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7,8,9],ymm14[10],ymm6[11,12],ymm14[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 
32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7,8,9,10],ymm4[11],ymm14[12,13],ymm4[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7,8,9],ymm14[10],ymm0[11,12],ymm14[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8,9,10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm10[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7,8,9,10],ymm0[11],ymm5[12,13],ymm0[14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7,8],ymm5[9],ymm15[10,11],ymm5[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7,8,9,10],ymm0[11],ymm5[12,13],ymm0[14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm0 
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; 
AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] @@ -10405,68 +10316,68 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = 
ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -10475,75 +10386,75 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 124(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm15 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm13 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm9 -; AVX2-FAST-NEXT: vmovdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm11 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm12 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm13 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 @@ -10551,182 +10462,185 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, 
%xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 64(%rax), %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX2-FAST-NEXT: vpbroadcastd 64(%rax), %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 96(%rax), %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm10, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastd 96(%rax), %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: 
vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm7, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm9, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm0 -; AVX2-FAST-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-FAST-NEXT: vpshufd $229, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpshufd $165, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2],xmm14[3,4],xmm2[5],xmm14[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[1,1,2,2] -; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3,4],xmm14[5],xmm11[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1],xmm4[2],xmm9[3,4],xmm4[5],xmm9[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm11 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3],xmm1[4],xmm11[5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1],xmm6[2],xmm11[3,4],xmm6[5],xmm11[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1],xmm9[2],xmm15[3,4],xmm9[5],xmm15[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; 
AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 68(%rax), %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 100(%rax), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpbroadcastd 68(%rax), %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX2-FAST-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm1 # 
16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -10750,10 +10664,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 104(%rax), %ymm10 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 544(%rax) @@ -10781,16 +10695,19 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 768(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 736(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 704(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 736(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 672(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 512(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 480(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 512(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%rax) @@ -10804,86 +10721,87 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 832(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1544, %rsp # imm = 0x608 +; AVX2-FAST-PERLANE-NEXT: subq $1560, %rsp # imm = 0x618 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = 
[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm5, %ymm7, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: 
vpshufb %ymm15, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm8, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm7, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm14, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10892,63 +10810,62 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: 
vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm8, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm2 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
<255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> @@ -10956,45 +10873,45 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7,8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
<255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -11007,69 +10924,70 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
<0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 @@ -11077,183 +10995,186 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rax), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rax), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm13, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm15, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm13, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[1,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd $165, (%rsp), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1],xmm4[2],xmm9[3,4],xmm4[5],xmm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1],xmm9[2],xmm15[3,4],xmm9[5],xmm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3],xmm1[4],xmm13[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm13[0,1],xmm5[2],xmm13[3,4],xmm5[5],xmm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%rax), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, 
%ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] @@ -11277,267 +11198,269 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 104(%rax), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: 
vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7,8,9,10],ymm8[11],ymm4[12,13],ymm8[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm15, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = 
<255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] ; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7,8,9],ymm9[10],ymm11[11,12],ymm9[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm7 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7,8,9,10],ymm11[11],ymm7[12,13],ymm11[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7] +; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8,9,10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8,9,10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte 
Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: 
vpshufd {{.*#+}} ymm14 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm14[2],ymm10[3,4],ymm14[5],ymm10[6,7,8,9],ymm14[10],ymm10[11,12],ymm14[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm11, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7,8],ymm12[9],ymm13[10,11],ymm12[12],ymm13[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8,9,10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7,8,9],ymm1[10],ymm15[11,12],ymm1[13],ymm15[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[2,3,3,3,6,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,3,3,3,6,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm13, %ymm15, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 544(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 640(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 608(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 640(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 608(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 416(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 384(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 768(%rax) +; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 768(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 736(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11550,13 +11473,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) @@ -11568,621 +11491,625 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 832(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX2-FAST-PERLANE-NEXT: addq $1560, %rsp # imm = 0x618 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2168, %rsp # imm = 0x878 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm20 
-; AVX512F-ONLY-SLOW-NEXT: vpor %ymm15, %ymm14, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,1,3,2,10,10,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm14, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8,9,10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX512F-ONLY-SLOW-NEXT: subq $2136, %rsp # imm = 0x858 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm4, %ymm5, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm4, %ymm7, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm8, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm13, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7,8,9],ymm12[10],ymm6[11,12],ymm12[13],ymm6[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, 
%ymm7, %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm12[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,ymm12[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero,ymm12[u,u],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm9, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm15, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm14[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm3, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7,8,9],ymm5[10],ymm7[11,12],ymm5[13],ymm7[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm14[2],ymm7[3,4],ymm14[5],ymm7[6,7,8,9],ymm14[10],ymm7[11,12],ymm14[13],ymm7[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm5, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7,8,9],ymm7[10],ymm5[11,12],ymm7[13],ymm5[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8,9,10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15],zero,zero,ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17],zero,zero,ymm7[u,u],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm14, %ymm9, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 72(%rax), 
%ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpandnq %ymm6, %ymm17, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rax), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpandnq %ymm12, %ymm17, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm7, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rax), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm5, %ymm2, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm0, %ymm11, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rax), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm5, %ymm11, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm5 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0,1],ymm5[2],ymm14[3,4],ymm5[5],ymm14[6,7,8,9],ymm5[10],ymm14[11,12],ymm5[13],ymm14[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7,8,9],ymm14[10],ymm5[11,12],ymm14[13],ymm5[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,1,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm14, %ymm9, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm12, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm14, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm5, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: 
vpermd %zmm12, %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm21, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm24, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermd 64(%rax), %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm20, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermd 64(%rax), %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm23[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm4, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2,3],xmm6[4],xmm9[5,6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm4 -; 
AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,3,2,4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm9, %xmm9 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm10, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2],xmm5[3,4],xmm10[5],xmm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpermd (%rax), %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm6 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm6, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5,6,7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpermd (%rax), %zmm24, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm20, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5,6,7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7,8,9,10],ymm13[11],ymm5[12,13],ymm13[14],ymm5[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd 
{{.*#+}} xmm5 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[2,3,3,3,6,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm2, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm12[2],xmm15[3,4],xmm12[5],xmm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7,8,9],ymm5[10],ymm13[11,12],ymm5[13],ymm13[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[2,3,3,3,6,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm28, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm13, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm0[2],xmm11[3,4],xmm0[5],xmm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm24 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm28 = mem[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1],xmm13[2,3],xmm2[4],xmm13[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm23 = mem[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm13[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm1[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm13 = xmm1[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm8[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm29 = mem[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm26 = mem[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm21 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm18[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm27, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm23[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = 
[65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm23[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm23 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm8, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm24, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm23 = mem[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm22 = mem[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm18[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: 
vinserti64x4 $1, %ymm11, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm1[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm28, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm11, %zmm5, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} 
zmm23 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm23 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm28[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm28 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm25 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte 
Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm22, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm24 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm31, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -12192,1320 +12119,1307 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} 
xmm6 = xmm6[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm26 = mem[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm12 = mem[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm5 = mem[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm21 = mem[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm13 = mem[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[2,1,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm2 = mem[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm4 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, 
%zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm31, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm31, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm28, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm30, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2168, %rsp # imm = 0x878 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $2136, %rsp # imm = 0x858 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1432, %rsp # imm = 0x598 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: subq $1336, %rsp # imm = 0x538 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = 
[128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm15, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm9, %ymm13, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm15 +; 
AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm15, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm15, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm13, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm15, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm13, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm9, %ymm8, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] -; 
AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[14,15],zero,zero,ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[16,17],zero,zero,ymm8[u,u],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm13, %ymm9, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm13, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7,8,9],ymm9[10],ymm4[11,12],ymm9[13],ymm4[14,15] +; 
AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[14,15],zero,zero,ymm14[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[16,17],zero,zero,ymm14[u,u],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm1, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm10, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,2,2,3,10,9,11,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm4, %ymm16, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7,8,9,10],ymm4[11],ymm9[12,13],ymm4[14],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,2,2,3,10,9,11,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7,8,9],ymm7[10],ymm5[11,12],ymm7[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm9, %zmm24, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,u,u,6,u,u,6> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] ; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,1,1,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm4, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm7, 
%ymm16, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm4, %zmm14, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7,8,9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm18, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,1,1,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm0, %zmm5, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7,8,9,10],ymm4[11],ymm7[12,13],ymm4[14],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7,8],ymm7[9],ymm13[10,11],ymm7[12],ymm13[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm25, 
%zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,1,3,2,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm20, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm5, %ymm11, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm31 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm0, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8,9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7,8,9,10],ymm0[11],ymm5[12,13],ymm0[14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm8[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7,8],ymm5[9],ymm9[10,11],ymm5[12],ymm9[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3],ymm0[4],ymm9[5,6,7,8],ymm0[9],ymm9[10,11],ymm0[12],ymm9[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm21[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[2,1,3,2,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7,8,9],ymm7[10],ymm4[11,12],ymm7[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm6, %zmm23, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8,9,10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; 
AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm13, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm29, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm26, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,2,3,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm30[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,2,3,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,1,1,8,8,10,9] -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 -; 
AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,1,1,8,8,10,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm7, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,3] +; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm11, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm8, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm8, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm8, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm24, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm15, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm6, %zmm23, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm8, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm6, %zmm29, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm28, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [6,7,3,3,7,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm27, %ymm28, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = 
[0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm9, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6,7,8],ymm1[9],ymm9[10,11],ymm1[12],ymm9[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,2,3,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm24 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %ymm16, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8,9,10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm23, %ymm28, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm20 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm31, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm30, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: 
vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,1,1,8,8,10,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7,8,9],ymm13[10],ymm14[11,12],ymm13[13],ymm14[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7,8,9,10],ymm10[11],ymm14[12,13],ymm10[14],ymm14[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm21, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm13, %zmm26, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,3,3,7,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm22, %ymm4, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm31, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = 
xmm8[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm11[2],xmm8[3,4],xmm11[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm28[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,2,3,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,2,1,8,8,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm28, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm26 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm11, %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: 
vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,1,3,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm19, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7,8,9],ymm11[10],ymm7[11,12],ymm11[13],ymm7[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,3,3,7,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm16, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2,3],xmm5[4],xmm9[5,6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm5 +; 
AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm5, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm15[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7,8,9],ymm7[10],ymm5[11,12],ymm7[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm7, %zmm21, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm9, %ymm10, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm8 +; 
AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm12, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm12, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm13[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm12 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm6, %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm6, %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 512(%rax) +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1336, %rsp # imm = 0x538 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2168, %rsp # imm = 0x878 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm2 -; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm2 -; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-SLOW-NEXT: subq $2136, %rsp # imm = 0x858 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; 
AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vporq %ymm4, %ymm5, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm7 +; AVX512DQ-SLOW-NEXT: vporq %ymm4, %ymm7, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm8, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm13, %ymm8 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm12 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm30 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), 
%ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm7 +; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm3, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7,8,9],ymm5[10],ymm7[11,12],ymm5[13],ymm7[14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm14[2],ymm7[3,4],ymm14[5],ymm7[6,7,8,9],ymm14[10],ymm7[11,12],ymm14[13],ymm7[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm20 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7,8,9],ymm7[10],ymm5[11,12],ymm7[13],ymm5[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8,9,10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] +; AVX512DQ-SLOW-NEXT: vmovdqu 
%ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15],zero,zero,ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17],zero,zero,ymm7[u,u],zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm11, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rax), %ymm0 +; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpandn %ymm5, %ymm2, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: 
vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm0 +; AVX512DQ-SLOW-NEXT: vpandn %ymm0, %ymm11, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rax), %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512DQ-SLOW-NEXT: vpandn %ymm5, %ymm11, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm14 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0,1],ymm5[2],ymm14[3,4],ymm5[5],ymm14[6,7,8,9],ymm5[10],ymm14[11,12],ymm5[13],ymm14[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7,8,9],ymm14[10],ymm5[11,12],ymm14[13],ymm5[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpandn %ymm14, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm7 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufhw 
{{.*#+}} ymm2 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermd %zmm5, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512DQ-SLOW-NEXT: 
vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm24, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,2,3,3,10,9,11,10] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm24 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermd 64(%rax), %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm20, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm13 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm20 -; AVX512DQ-SLOW-NEXT: vpor %ymm15, %ymm14, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm0, %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,1,3,2,10,10,10,11] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm14, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm25 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8,9,10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7,8,9],ymm12[10],ymm6[11,12],ymm12[13],ymm6[14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: 
vmovdqa 96(%r8), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm12[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,ymm12[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero,ymm12[u,u],zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm9, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm14 -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm15, %ymm6 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm14[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm18 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512DQ-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpandn %ymm14, %ymm9, %ymm14 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm6 -; AVX512DQ-SLOW-NEXT: vpandnq %ymm6, %ymm17, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rax), %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm12 -; AVX512DQ-SLOW-NEXT: vpandnq %ymm12, %ymm17, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm7, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpandn %ymm14, %ymm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm12, %ymm14 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpermd %zmm12, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd 
{{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9 -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,0,2,1,4,4,6,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,3,3,10,9,11,10] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm15 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermd 64(%rax), %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm6, %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm23[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm0, %xmm6 +; AVX512DQ-SLOW-NEXT: 
vpshufd {{.*#+}} xmm7 = xmm1[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2,3],xmm6[4],xmm9[5,6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm7, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm30, %zmm2 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; 
AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512DQ-SLOW-NEXT: vprold $16, 
%xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm6, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm9, %xmm9 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq 
$226, %zmm2, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm4 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm10, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2],xmm5[3,4],xmm10[5],xmm5[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512DQ-SLOW-NEXT: 
vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-SLOW-NEXT: vpermd (%rax), %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14,15] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm1, %zmm0 
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm6, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5,6,7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-SLOW-NEXT: vpermd (%rax), %zmm24, %zmm24 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm20, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5,6,7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7,8,9,10],ymm13[11],ymm5[12,13],ymm13[14],ymm5[15] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[2,3,3,3,6,7,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm22 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm2, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm12[2],xmm15[3,4],xmm12[5],xmm15[6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7,8,9],ymm5[10],ymm13[11,12],ymm5[13],ymm13[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm13 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[2,3,3,3,6,7,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm17 +; 
AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm27 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm28, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm13, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm0[2],xmm11[3,4],xmm0[5],xmm11[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm24 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; 
AVX512DQ-SLOW-NEXT: # ymm17 = mem[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm28 = mem[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm31, %zmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm30 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1],xmm13[2,3],xmm2[4],xmm13[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm23 = mem[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm13[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 
# 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm1[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm8[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = mem[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm29 = mem[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm26 = mem[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm21 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm4 = mem[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq 
{{.*#+}} ymm17 = ymm17[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm18[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm24 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm27, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm23[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm28 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm23 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm13 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm8, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm24, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq $234, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm23 = mem[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm22 = mem[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm16 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm18[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm1[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm28, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm11, %zmm5, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm18 ; AVX512DQ-SLOW-NEXT: 
vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm23 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm28[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm28 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm25 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm17 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm22, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm24 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm31, %zmm2 ; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -13515,932 +13429,913 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, 
%ymm5, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm6 = mem[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm26 = mem[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm12 = mem[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm5 = mem[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm21 = mem[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm13 = mem[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm14 = mem[2,1,3,3] ; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = mem[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm4 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm3 -; AVX512DQ-SLOW-NEXT: 
vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm1 = mem[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm31, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm31, %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm26, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm21, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm19, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm28, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm30 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm17 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 256(%rax) -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm22, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQ-SLOW-NEXT: addq $2168, %rsp # imm = 0x878 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512DQ-SLOW-NEXT: addq $2136, %rsp # imm = 0x858 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1432, %rsp # imm = 0x598 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: subq $1336, %rsp # imm = 0x538 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm1 ; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; 
AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm1 +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] ; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm19 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm8 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm15, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vporq %ymm9, %ymm13, %ymm17 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm15 +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm15, %ymm20 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm15, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpor %ymm9, %ymm13, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm15 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm15, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpor %ymm9, %ymm13, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm9 -; AVX512DQ-FAST-NEXT: vporq %ymm9, %ymm8, %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[14,15],zero,zero,ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[16,17],zero,zero,ymm8[u,u],zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm13, %ymm9, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX512DQ-FAST-NEXT: 
vmovdqa 96(%r9), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm10 -; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm12 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm13, %ymm9 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm9 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm4 +; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm0, %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7,8,9],ymm9[10],ymm4[11,12],ymm9[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm17, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[14,15],zero,zero,ymm14[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[16,17],zero,zero,ymm14[u,u],zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm1, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm9 +; AVX512DQ-FAST-NEXT: vpternlogq $248, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512DQ-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm9 -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpandn %ymm10, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm12 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,2,2,3,10,9,11,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm25, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; 
AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vpandnq %ymm4, %ymm16, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7,8,9,10],ymm4[11],ymm9[12,13],ymm4[14],ymm9[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm27 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,2,2,3,10,9,11,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm29 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7,8,9],ymm7[10],ymm5[11,12],ymm7[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm24, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,u,u,6,u,u,6> -; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm4 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] ; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: 
vpermi2d %zmm28, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm0, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm11 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm12 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm13, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,1,1,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm4, %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm7 -; AVX512DQ-FAST-NEXT: vpandnq %ymm7, %ymm16, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm9 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm16 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm4, %zmm14, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7,8,9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm18, %zmm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, 
%ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,1,1,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm0, %zmm5, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7,8,9,10],ymm4[11],ymm7[12,13],ymm4[14],ymm7[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7,8],ymm7[9],ymm13[10,11],ymm7[12],ymm13[13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,1,3,2,10,10,10,11] -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm20, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm31 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vpandn %ymm5, %ymm11, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm13 +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm0, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8,9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7,8,9,10],ymm0[11],ymm5[12,13],ymm0[14],ymm5[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm8[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7,8],ymm5[9],ymm9[10,11],ymm5[12],ymm9[13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3],ymm0[4],ymm9[5,6,7,8],ymm0[9],ymm9[10,11],ymm0[12],ymm9[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm21[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vprold $16, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7,8,9],ymm7[10],ymm4[11,12],ymm7[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm23, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; 
AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8,9,10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm19 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm4 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm29 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm29, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm26 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: 
vpermd %zmm0, %zmm26, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm29 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,2,3,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,2,3,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm15 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,1,1,8,8,10,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vprold $16, %xmm8, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm9, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm3 -; 
AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm8, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,1,1,8,8,10,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; 
AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %xmm7, %xmm15 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm11, %zmm15 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm8, %zmm15 +; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm8, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm11 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm17 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm8, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm14, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, 
%zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm18 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm15, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm23, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vprold $16, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm29, %zmm10 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm28, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm28, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, 
%zmm1, %zmm31, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm12 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vprold $16, %xmm9, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6,7,8],ymm1[9],ymm9[10,11],ymm1[12],ymm9[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,2,3,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm24 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8,9,10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm28, %ymm9 -; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm20 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm20 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm31, %zmm20 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm30, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 
{{.*#+}} zmm8 = [0,0,1,1,8,8,10,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7,8,9],ymm13[10],ymm14[11,12],ymm13[13],ymm14[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7,8,9,10],ymm10[11],ymm14[12,13],ymm10[14],ymm14[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm21, %zmm13 +; AVX512DQ-FAST-NEXT: vpermd %zmm13, %zmm26, %zmm25 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm22, %ymm4, %ymm13 +; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm14 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm24 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm31, %zmm24 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm11, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm11[2],xmm8[3,4],xmm11[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm28[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,2,3,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,2,1,8,8,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm28, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm26 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm11, %zmm30, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm1 +; 
AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7,8,9],ymm11[10],ymm7[11,12],ymm11[13],ymm7[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm16, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm23 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm11, %zmm11 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2,3],xmm5[4],xmm9[5,6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm30, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm6 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm5, %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm15[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm9, %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7,8,9],ymm7[10],ymm5[11,12],ymm7[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm7 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm8 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm7, %zmm21, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm9, %ymm10, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm12, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm12, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm13[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm12 = mem[0,2,3,3,4,5,6,7] 
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm30, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 768(%rax) -; AVX512DQ-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512DQ-FAST-NEXT: addq $1336, %rsp # imm = 0x538 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $136, %rsp +; AVX512BW-NEXT: subq $72, %rsp ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = 
[27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm23 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm30, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm20, %zmm3, %zmm10 ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u> -; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u> +; AVX512BW-NEXT: vpermi2w %zmm23, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm28, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm6, %zmm15 ; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 -; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: kmovd %ecx, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vpermi2w %zmm23, 
%zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm22, %zmm10 ; AVX512BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C -; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm23, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm10 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm31, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm19, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm26 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 -; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} -; AVX512BW-NEXT: kmovd %ecx, %k3 -; 
AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30 -; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3} -; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm0 {%k2} +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm10 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm29, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm31, %zmm23 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm30 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm5, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vpermi2w %zmm9, %zmm21, %zmm23 ; AVX512BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm3 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm21, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm23, %zmm15 ; AVX512BW-NEXT: movl $202911840, %eax # imm = 0xC183060 -; 
AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3} -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm24 -; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm11 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 -; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm15 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm13, %zmm4, %zmm28 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm6 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm9, %zmm2, %zmm28 +; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm6 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm28, %zmm20 ; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} -; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm21 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3} -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2} +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm15 {%k3} +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2w %zmm13, %zmm4, %zmm23 +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm23 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm23 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm2, %zmm27 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm2, %zmm27 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = 
[16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm20, %zmm17 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 -; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm13, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm2, %zmm12 ; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm17 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm11, %zmm29 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm29 +; AVX512BW-NEXT: vmovdqu16 %zmm29, %zmm20 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm2, %zmm18 ; AVX512BW-NEXT: movl $405823681, %eax # imm = 0x183060C1 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm18 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm12, %zmm26 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm7, %zmm24 +; AVX512BW-NEXT: vpermi2w %zmm13, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm21, %zmm26 ; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = 
[0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm18 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm12 +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm2 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm12, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm12, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm19 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm1 {%k2} ; AVX512BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u> +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermi2w %zmm31, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512BW-NEXT: vpermi2w %zmm31, %zmm5, %zmm4 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) -; AVX512BW-NEXT: addq $136, %rsp +; AVX512BW-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%rax) +; AVX512BW-NEXT: addq $72, %rsp ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 1cb76e97601528..6ac65adbd7c685 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -533,78 +533,78 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa (%r10), %xmm3 -; SSE-NEXT: movdqa (%rax), %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; SSE-NEXT: movdqa (%rcx), %xmm10 +; SSE-NEXT: movdqa (%r8), %xmm3 +; SSE-NEXT: movdqa (%r9), %xmm7 +; SSE-NEXT: movdqa (%r10), %xmm2 +; SSE-NEXT: movdqa (%rax), %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpcklwd 
{{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] ; SSE-NEXT: movdqa %xmm15, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm13[0],xmm9[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm8[0],xmm7[1] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm9, 96(%rax) +; SSE-NEXT: movapd %xmm8, 96(%rax) ; SSE-NEXT: movaps %xmm10, 112(%rax) ; SSE-NEXT: movaps %xmm11, 80(%rax) -; SSE-NEXT: movapd %xmm8, 64(%rax) -; SSE-NEXT: movapd %xmm7, 32(%rax) +; SSE-NEXT: movapd %xmm7, 64(%rax) +; SSE-NEXT: movapd %xmm9, 32(%rax) ; SSE-NEXT: movaps %xmm6, 48(%rax) ; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movapd %xmm2, (%rax) +; SSE-NEXT: movapd %xmm4, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf8: @@ -613,20 +613,20 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%r11), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa (%r11), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,0,0] @@ -644,29 +644,29 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) @@ -849,180 +849,177 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride8_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $88, %rsp +; SSE-NEXT: subq $72, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm15 +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa (%rdx), %xmm9 ; SSE-NEXT: movdqa (%rcx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm8 ; SSE-NEXT: movdqa (%r9), %xmm2 -; SSE-NEXT: movdqa (%r10), %xmm11 +; SSE-NEXT: movdqa (%r10), %xmm10 ; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE-NEXT: movdqa %xmm8, %xmm12 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: 
punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE-NEXT: movdqa 16(%rdx), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: movdqa 16(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: movdqa 
%xmm15, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%r10), %xmm15 -; SSE-NEXT: movdqa 16(%rax), %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; SSE-NEXT: movdqa 16(%r8), %xmm4 +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa 16(%r10), %xmm6 +; SSE-NEXT: movdqa 16(%rax), %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: movdqa 16(%r8), %xmm2 ; SSE-NEXT: movdqa 16(%r9), %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} 
xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = 
xmm14[0],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm7[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: 
movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm1, 224(%rax) ; SSE-NEXT: movaps %xmm3, 240(%rax) -; SSE-NEXT: movapd %xmm10, 160(%rax) +; SSE-NEXT: movapd %xmm7, 160(%rax) ; SSE-NEXT: movaps %xmm8, 176(%rax) ; SSE-NEXT: movapd %xmm13, 96(%rax) ; SSE-NEXT: movaps %xmm12, 112(%rax) -; SSE-NEXT: movapd %xmm5, 32(%rax) -; SSE-NEXT: movaps %xmm6, 48(%rax) -; SSE-NEXT: movapd %xmm9, 192(%rax) +; SSE-NEXT: movapd %xmm0, 32(%rax) +; SSE-NEXT: movaps %xmm9, 48(%rax) +; SSE-NEXT: movapd %xmm10, 192(%rax) ; SSE-NEXT: movaps %xmm11, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) @@ -1030,323 +1027,325 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $88, %rsp +; SSE-NEXT: addq $72, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $136, %rsp +; AVX1-ONLY-NEXT: subq $104, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm10[0],zero,xmm10[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm14, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1] 
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm15 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm6, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm3[3],ymm10[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm11[3],ymm8[4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1,2],ymm11[3],ymm8[4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,1] 
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm10[0],zero,xmm10[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $136, %rsp +; AVX1-ONLY-NEXT: addq $104, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride8_vf16: ; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: pushq %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} 
xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0],ymm4[1],ymm15[2,3,4],ymm4[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3],ymm7[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm8 = ymm8[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm15 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = 
ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[8],ymm15[8],ymm9[9],ymm15[9],ymm9[10],ymm15[10],ymm9[11],ymm15[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm15[4],ymm9[5],ymm15[5],ymm9[6],ymm15[6],ymm9[7],ymm15[7],ymm9[12],ymm15[12],ymm9[13],ymm15[13],ymm9[14],ymm15[14],ymm9[15],ymm15[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = 
ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 
= ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 224(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: popq %rax ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1361,117 +1360,117 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: 
vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2,3,4],ymm2[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm15 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[8],ymm12[8],ymm8[9],ymm12[9],ymm8[10],ymm12[10],ymm8[11],ymm12[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,1,6,5,6,5,7,7] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm14[0],ymm6[0],ymm14[1],ymm6[1],ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[8],ymm6[8],ymm14[9],ymm6[9],ymm14[10],ymm6[10],ymm14[11],ymm6[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[12],ymm12[12],ymm8[13],ymm12[13],ymm8[14],ymm12[14],ymm8[15],ymm12[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm6[4],ymm14[5],ymm6[5],ymm14[6],ymm6[6],ymm14[7],ymm6[7],ymm14[12],ymm6[12],ymm14[13],ymm6[13],ymm14[14],ymm6[14],ymm14[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,1,3,5,7,5,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpermd 
%ymm7, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1486,143 +1485,147 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride8_vf16: ; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: pushq %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd 
{{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0],ymm4[1],ymm15[2,3,4],ymm4[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3],ymm7[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[8],ymm15[8],ymm9[9],ymm15[9],ymm9[10],ymm15[10],ymm9[11],ymm15[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm15[4],ymm9[5],ymm15[5],ymm9[6],ymm15[6],ymm9[7],ymm15[7],ymm9[12],ymm15[12],ymm9[13],ymm15[13],ymm9[14],ymm15[14],ymm9[15],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm3[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: popq %rax ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1630,83 +1633,82 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm9 -; AVX512F-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512F-NEXT: vmovdqa (%r8), %ymm15 -; AVX512F-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-NEXT: vmovdqa (%r10), %ymm4 -; AVX512F-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-NEXT: vmovdqa (%rax), %xmm5 -; AVX512F-NEXT: vmovdqa (%r10), %xmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm22 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm13 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-NEXT: vinserti32x4 $2, 
%xmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm19 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] +; AVX512F-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-NEXT: vmovdqa64 (%rcx), %ymm19 +; AVX512F-NEXT: vmovdqa (%r8), %ymm4 +; AVX512F-NEXT: vmovdqa (%r9), %ymm5 +; AVX512F-NEXT: vmovdqa (%r10), %ymm6 +; AVX512F-NEXT: vmovdqa (%rax), %ymm7 +; AVX512F-NEXT: vmovdqa (%rax), %xmm0 +; AVX512F-NEXT: vmovdqa (%r10), %xmm8 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512F-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm16 +; AVX512F-NEXT: vmovdqa (%r9), %xmm10 +; AVX512F-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm20 +; AVX512F-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm17 +; AVX512F-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm22 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm18 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa %ymm2, %ymm1 +; AVX512F-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm19 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-NEXT: 
vmovdqa %ymm3, %ymm12 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] ; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27> -; AVX512F-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27> +; AVX512F-NEXT: vpermt2d %zmm16, %zmm11, %zmm20 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u> -; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm22 ; AVX512F-NEXT: movb $-86, %cl ; AVX512F-NEXT: 
kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] +; AVX512F-NEXT: vpermt2d %zmm18, %zmm12, %zmm21 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] -; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-NEXT: vpermt2d %zmm9, %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm13, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 {%k1} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm13, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1777,148 +1779,148 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $264, %rsp # imm = 0x108 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm9 ; SSE-NEXT: movdqa (%rdx), %xmm4 ; SSE-NEXT: movdqa (%rcx), %xmm10 ; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: movdqa (%r9), %xmm9 +; SSE-NEXT: movdqa (%r9), %xmm8 ; SSE-NEXT: movdqa (%r10), %xmm7 ; SSE-NEXT: movdqa (%rax), %xmm11 ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = 
xmm13[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] ; SSE-NEXT: movdqa 16(%r9), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movdqa 16(%r10), %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm5 +; SSE-NEXT: movdqa 16(%rax), %xmm3 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,3] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movdqa 16(%rdx), %xmm4 ; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = 
xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm6[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; 
SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill @@ -1936,17 +1938,17 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 32(%rdx), %xmm2 ; SSE-NEXT: movdqa 32(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm9 +; SSE-NEXT: movdqa 32(%rsi), %xmm10 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -1957,16 +1959,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = 
xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -1993,17 +1995,17 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] -; SSE-NEXT: movdqa 48(%r10), %xmm9 +; SSE-NEXT: movdqa 48(%r10), %xmm8 ; SSE-NEXT: movdqa 48(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa 48(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa 48(%r8), %xmm5 ; SSE-NEXT: movdqa 48(%r9), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 48(%rdx), %xmm6 @@ -2021,26 +2023,26 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] ; 
SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] @@ -2048,26 +2050,26 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 496(%rax) -; SSE-NEXT: movapd %xmm5, 480(%rax) +; SSE-NEXT: movapd %xmm4, 480(%rax) ; SSE-NEXT: movaps %xmm3, 464(%rax) ; SSE-NEXT: movapd %xmm1, 448(%rax) ; SSE-NEXT: movaps %xmm7, 432(%rax) -; SSE-NEXT: movapd %xmm8, 416(%rax) +; SSE-NEXT: movapd %xmm9, 416(%rax) ; SSE-NEXT: movaps %xmm10, 400(%rax) ; SSE-NEXT: movapd %xmm11, 384(%rax) ; SSE-NEXT: movaps %xmm14, 368(%rax) @@ -2138,14 +2140,14 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovdqa 
(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2155,146 +2157,146 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 
= ymm14[0,1],ymm8[2,3],ymm14[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, 
%ymm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 
= xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] @@ -2314,75 +2316,75 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm13[0],zero,xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -2393,32 +2395,32 @@ define void @store_i16_stride8_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ 
-2426,9 +2428,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 288(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2456,14 +2458,14 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm2 @@ -2471,94 +2473,94 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; 
AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm0[1],ymm14[2,3,4],ymm0[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; 
AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm14 @@ -2573,22 +2575,22 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload -; 
AVX2-SLOW-NEXT: # xmm8 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] @@ -2597,26 +2599,26 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = 
ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] @@ -2624,85 +2626,85 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] -; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX2-SLOW-NEXT: vpunpckhwd 
{{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm13[4],ymm9[5],ymm13[5],ymm9[6],ymm13[6],ymm9[7],ymm13[7],ymm9[12],ymm13[12],ymm9[13],ymm13[13],ymm9[14],ymm13[14],ymm9[15],ymm13[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm4 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm4[4],ymm7[5],ymm4[5],ymm7[6],ymm4[6],ymm7[7],ymm4[7],ymm7[12],ymm4[12],ymm7[13],ymm4[13],ymm7[14],ymm4[14],ymm7[15],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = 
ymm15[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm13[0],ymm9[1],ymm13[1],ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[8],ymm13[8],ymm9[9],ymm13[9],ymm9[10],ymm13[10],ymm9[11],ymm13[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[8],ymm14[8],ymm6[9],ymm14[9],ymm6[10],ymm14[10],ymm6[11],ymm14[11] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] @@ -2710,22 +2712,22 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 192(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) @@ -2758,103 +2760,101 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2],ymm0[3],ymm9[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3,4],ymm4[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
ymm8 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2862,150 +2862,149 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm13 +; 
AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3],ymm0[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3],ymm0[4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3],ymm10[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: 
vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3],ymm13[4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm15, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 +; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[8],ymm5[8],ymm13[9],ymm5[9],ymm13[10],ymm5[10],ymm13[11],ymm5[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3041,14 +3040,14 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 @@ -3056,94 +3055,94 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm0[1],ymm14[2,3,4],ymm0[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm15[0],zero,xmm15[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd 
{{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} 
xmm0 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm14 @@ -3158,22 +3157,22 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] @@ -3182,26 +3181,26 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 
32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] @@ -3209,85 +3208,85 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[0,2,2,3,4,6,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm13[4],ymm9[5],ymm13[5],ymm9[6],ymm13[6],ymm9[7],ymm13[7],ymm9[12],ymm13[12],ymm9[13],ymm13[13],ymm9[14],ymm13[14],ymm9[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm7 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm4[4],ymm7[5],ymm4[5],ymm7[6],ymm4[6],ymm7[7],ymm4[7],ymm7[12],ymm4[12],ymm7[13],ymm4[13],ymm7[14],ymm4[14],ymm7[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm13[0],ymm9[1],ymm13[1],ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[8],ymm13[8],ymm9[9],ymm13[9],ymm9[10],ymm13[10],ymm9[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[8],ymm14[8],ymm6[9],ymm14[9],ymm6[10],ymm14[10],ymm6[11],ymm14[11] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] @@ -3295,22 +3294,22 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) @@ -3347,140 +3346,142 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm17, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm23 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm19, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm19, %zmm31 -; AVX512F-SLOW-NEXT: vmovdqa64 
{{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm20, %zmm25 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm21, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm22, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm15 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm20, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm7 -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm19, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm18, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm16 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm15[0],ymm0[1],ymm15[1],ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[8],ymm15[8],ymm0[9],ymm15[9],ymm0[10],ymm15[10],ymm0[11],ymm15[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm26, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = 
[4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[8],ymm1[8],ymm14[9],ymm1[9],ymm14[10],ymm1[10],ymm14[11],ymm1[11] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm28, %zmm27 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm12 +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm21, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm22, %zmm29 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm1[4],ymm14[5],ymm1[5],ymm14[6],ymm1[6],ymm14[7],ymm1[7],ymm14[12],ymm1[12],ymm14[13],ymm1[13],ymm14[14],ymm1[14],ymm14[15],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm15[4],ymm0[5],ymm15[5],ymm0[6],ymm15[6],ymm0[7],ymm15[7],ymm0[12],ymm15[12],ymm0[13],ymm15[13],ymm0[14],ymm15[14],ymm0[15],ymm15[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm16 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm28, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm22 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm26, %zmm25 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm27, %zmm25 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm28, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm24 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm19, %zmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm18, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm19, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11] -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm21, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm20, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm21, %zmm4 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm28, %zmm16 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm17, %zmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm19, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm20, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm18, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm19, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm20, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: 
vmovdqa (%r10), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm21, %zmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm22, %zmm13 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm21, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm22, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[8],ymm14[8],ymm6[9],ymm14[9],ymm6[10],ymm14[10],ymm6[11],ymm14[11] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm28, %zmm4 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm26, %zmm6 +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm28, %zmm6 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm26, %zmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm28, %zmm7 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm7 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm17, %zmm12 +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm18, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm19, %zmm8 +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm20, %zmm8 {%k2} ; AVX512F-SLOW-NEXT: movb $-86, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $472, %rsp # imm = 0x1D8 +; AVX512F-FAST-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm1 @@ -3490,190 +3491,186 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm27 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm29 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm19 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 -; 
AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm31 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm18 ; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm24 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm22 ; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm23 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512F-FAST-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm21 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd 
{{.*#+}} xmm5 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm30 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm28 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm16 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm26 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; 
AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm31, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm25 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm31, %zmm17 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm0, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm16 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm24 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm28, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm29, %zmm23 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm28, %zmm22 +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm29, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm24 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm25 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm30 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm31 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm31, %zmm6 -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm31, %zmm7 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm3 +; 
AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm28, %zmm14 +; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm29, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm28, %zmm11 ; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm0, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm12, %zmm10 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm0, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm12, %zmm22 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm12, %zmm3 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm12, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm2, %zmm12 -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm5, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm15 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm5, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm5, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm29, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm12, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm28, %zmm21 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm12, %zmm7 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm28, %zmm7 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm12, %zmm2 +; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm28, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm12, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm28, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm4, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm4, %zmm6 +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm10, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm4, %zmm8 +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm10, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm10, %zmm4 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm16 -; AVX512F-FAST-NEXT: vpermd 
%zmm28, %zmm5, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm9, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm5, %zmm14 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm5, %zmm9 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm10, %zmm12 +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm5, %zmm12 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm10, %zmm15 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm5, %zmm15 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm10 {%k2} ; AVX512F-FAST-NEXT: movb $-86, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 384(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-FAST-NEXT: addq $472, %rsp # imm = 0x1D8 +; AVX512F-FAST-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -3723,61 +3720,61 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 
-; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; 
AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> @@ -3788,8 +3785,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -3820,151 +3817,151 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $776, %rsp # imm = 0x308 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: movdqa (%rcx), %xmm9 -; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa (%r9), %xmm10 -; SSE-NEXT: movdqa (%r10), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa (%rcx), %xmm10 +; SSE-NEXT: movdqa (%r8), %xmm6 +; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movdqa (%r10), %xmm7 ; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE-NEXT: movdqa 16(%r9), %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; SSE-NEXT: movdqa 16(%r9), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movdqa 16(%r10), %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa 16(%rax), %xmm3 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: movsd 
{{.*#+}} xmm10 = xmm7[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,3] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = 
xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movdqa 16(%rdx), %xmm4 ; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r10), %xmm0 ; SSE-NEXT: movdqa 32(%rax), %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 @@ -3979,17 +3976,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, %xmm13 ; 
SSE-NEXT: movdqa 32(%rdx), %xmm2 ; SSE-NEXT: movdqa 32(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm9 +; SSE-NEXT: movdqa 32(%rsi), %xmm10 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4000,16 +3997,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4052,17 +4049,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 48(%rdx), %xmm2 ; SSE-NEXT: movdqa 48(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rsi), %xmm9 +; SSE-NEXT: movdqa 48(%rsi), %xmm10 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4073,16 +4070,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4125,17 +4122,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 64(%rdx), %xmm2 ; SSE-NEXT: movdqa 64(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rsi), %xmm9 +; SSE-NEXT: movdqa 64(%rsi), %xmm10 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4146,16 
+4143,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4198,17 +4195,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 80(%rdx), %xmm2 ; SSE-NEXT: movdqa 80(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa 80(%rsi), %xmm9 +; SSE-NEXT: movdqa 80(%rsi), %xmm10 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4219,16 +4216,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = 
xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4271,17 +4268,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 96(%rdx), %xmm2 ; SSE-NEXT: movdqa 96(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa 96(%rsi), %xmm9 +; SSE-NEXT: movdqa 96(%rsi), %xmm10 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4292,16 +4289,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4328,17 +4325,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] -; SSE-NEXT: movdqa 112(%r10), %xmm9 +; SSE-NEXT: movdqa 112(%r10), %xmm8 ; SSE-NEXT: movdqa 112(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa 112(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa 112(%r8), %xmm5 ; SSE-NEXT: movdqa 112(%r9), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 112(%rdx), %xmm6 @@ -4356,26 +4353,26 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] @@ -4383,26 +4380,26 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 1008(%rax) -; SSE-NEXT: movapd %xmm5, 992(%rax) +; SSE-NEXT: movapd %xmm4, 992(%rax) ; SSE-NEXT: movaps %xmm3, 976(%rax) ; SSE-NEXT: movapd %xmm1, 960(%rax) ; SSE-NEXT: movaps %xmm7, 944(%rax) -; SSE-NEXT: movapd %xmm8, 928(%rax) +; SSE-NEXT: movapd %xmm9, 928(%rax) ; SSE-NEXT: movaps %xmm10, 912(%rax) ; SSE-NEXT: movapd %xmm11, 896(%rax) ; SSE-NEXT: movaps %xmm14, 880(%rax) @@ -4531,17 +4528,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] @@ -4549,57 +4546,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm9[0],zero,xmm9[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm6[3],ymm13[4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm10 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3],ymm9[4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -4671,53 +4668,53 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] @@ -4741,55 +4738,55 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = 
xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] @@ -4813,55 +4810,55 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 64(%rax), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] @@ -4885,55 +4882,55 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%rax), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm5, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] @@ -4956,150 +4953,150 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%r10), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 96(%rax), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rax), %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%r10), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm13[0],zero,xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa 112(%r10), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm11[0],zero,xmm11[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3],ymm11[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 960(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 928(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 896(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 864(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 960(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 928(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 896(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 864(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5237,32 +5234,32 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] +; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%r10), %xmm8 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -5271,17 +5268,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] @@ -5295,111 +5292,111 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = 
xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 
96(%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%r10), %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero @@ -5484,8 +5481,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 @@ -5493,19 +5490,19 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] @@ -5519,7 +5516,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] @@ -5527,7 +5524,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] @@ -5547,56 +5544,56 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%r10), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX2-SLOW-NEXT: vmovdqa 64(%r10), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm11[0],ymm4[1],ymm11[1],ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[8],ymm11[8],ymm4[9],ymm11[9],ymm4[10],ymm11[10],ymm4[11],ymm11[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,5,7] ; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3,4],ymm6[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm11[4],ymm4[5],ymm11[5],ymm4[6],ymm11[6],ymm4[7],ymm11[7],ymm4[12],ymm11[12],ymm4[13],ymm11[13],ymm4[14],ymm11[14],ymm4[15],ymm11[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] @@ -5604,78 +5601,78 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r10), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r10), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 96(%rax), %ymm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm2[3],ymm12[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm12 ; 
AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 992(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 960(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 928(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 992(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 
960(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 928(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 896(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 736(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5739,240 +5736,240 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: subq $776, %rsp # imm = 0x308 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm6 +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpermd 
%ymm10, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm2 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 -; 
AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm0[3],ymm10[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, 
%ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 64(%rax), %xmm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%r10), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm13 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2,3,4],ymm14[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] ; 
AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,u,0,u,u,u,1,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rax), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 96(%r10), %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm3 ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm12 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,0,u,u,u,1,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 ; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm10 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] @@ -6002,18 +5999,18 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,4,4,4,4,6,5] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6023,7 +6020,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6031,10 +6028,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} 
ymm8 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 @@ -6043,27 +6040,27 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11] ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,5,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,4,5,4,5,5,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] @@ -6076,7 +6073,7 @@ 
define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm1 @@ -6085,13 +6082,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm7 ; AVX2-FAST-NEXT: vmovdqa 64(%r10), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 @@ -6108,17 +6105,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = 
[0,0,4,4,4,4,6,5] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] @@ -6145,8 +6142,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 96(%r10), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 96(%rax), %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm11 @@ -6159,21 +6156,21 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm8 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,1,6,5,6,5,7,7] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm13, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,4,4,4,4,6,5] @@ -6183,11 +6180,11 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = 
ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm3 @@ -6341,32 +6338,32 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] 
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r10), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -6375,17 +6372,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] @@ -6399,111 +6396,111 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r10), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero @@ -6588,8 +6585,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm4 @@ -6597,19 +6594,19 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] @@ -6623,7 +6620,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] @@ -6631,7 +6628,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] @@ -6651,56 +6648,56 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r10), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r10), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm11[0],ymm4[1],ymm11[1],ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[8],ymm11[8],ymm4[9],ymm11[9],ymm4[10],ymm11[10],ymm4[11],ymm11[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3,4],ymm6[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm11[4],ymm4[5],ymm11[5],ymm4[6],ymm11[6],ymm4[7],ymm11[7],ymm4[12],ymm11[12],ymm4[13],ymm11[13],ymm4[14],ymm11[14],ymm4[15],ymm11[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] @@ -6708,78 +6705,78 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r10), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r10), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm2[3],ymm12[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = 
ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 992(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 960(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 928(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 992(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 960(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 928(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 896(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 736(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6840,295 +6837,300 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm1 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: movw 
$8738, %r11w # imm = 0x2222 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 ; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm19, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm16, %zmm31 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm29, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm19, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm18, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm25, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm1 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm15[0],ymm0[1],ymm15[1],ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[8],ymm15[8],ymm0[9],ymm15[9],ymm0[10],ymm15[10],ymm0[11],ymm15[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm22, %zmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm23, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm25, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm26, %zmm2 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm15[4],ymm0[5],ymm15[5],ymm0[6],ymm15[6],ymm0[7],ymm15[7],ymm0[12],ymm15[12],ymm0[13],ymm15[13],ymm0[14],ymm15[14],ymm0[15],ymm15[15] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm22, %zmm30 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm23, %zmm30 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm13, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm14, %zmm2 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm1 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm25, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm0 {%k2} ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm16, %zmm26 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm17, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm9 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm19, %zmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm5 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 
%zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm22, %zmm19 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm23, %zmm19 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm25, %zmm6 +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm26, %zmm6 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm16, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm30, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm1 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm22, %zmm18 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm23, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm14, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; 
AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm19, %zmm28 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm28 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm16, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm19, %zmm25 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm25 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm16, %zmm21 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm17, %zmm21 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm19, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm18, %zmm27 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm19, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm20 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm16, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm17, %zmm18 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm16, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm30, %zmm19 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm19 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm30, %zmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm29, %zmm10 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm30, %zmm17 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm17 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm30, %zmm8 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 +; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm25, %zmm31 +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm26, %zmm31 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] +; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm22, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[8],ymm11[8],ymm0[9],ymm11[9],ymm0[10],ymm11[10],ymm0[11],ymm11[11] +; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm23, %zmm17 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm25, %zmm21 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm26, %zmm21 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm11[4],ymm0[5],ymm11[5],ymm0[6],ymm11[6],ymm0[7],ymm11[7],ymm0[12],ymm11[12],ymm0[13],ymm11[13],ymm0[14],ymm11[14],ymm0[15],ymm11[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm22, %zmm20 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm23, %zmm20 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm25, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm26, %zmm24 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm25, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm27 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm22, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm23, %zmm25 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm22, %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm23, %zmm22 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm13, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm14, %zmm16 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm14, %zmm23 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm26 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm26 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm14, %zmm6 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm5, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm13, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm30, %zmm6 
{%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm5, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm5, %zmm3 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm13, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm30, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm5, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm5, %zmm2 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm13, %zmm12 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm30, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm5, %zmm12 {%k1} ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} 
xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm2 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm5, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm13, %zmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm5, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: movb $-86, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm21 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 
{%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-SLOW-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 960(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 896(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512F-FAST-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 -; 
AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %ymm1 @@ -7147,20 +7149,19 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm19 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm7[4],ymm11[5],ymm7[5],ymm11[6],ymm7[6],ymm11[7],ymm7[7],ymm11[12],ymm7[12],ymm11[13],ymm7[13],ymm11[14],ymm7[14],ymm11[15],ymm7[15] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %xmm1 @@ -7178,9 +7179,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7189,302 +7190,303 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm8 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[8],ymm8[8],ymm5[9],ymm8[9],ymm5[10],ymm8[10],ymm5[11],ymm8[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm8[4],ymm5[5],ymm8[5],ymm5[6],ymm8[6],ymm5[7],ymm8[7],ymm5[12],ymm8[12],ymm5[13],ymm8[13],ymm5[14],ymm8[14],ymm5[15],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm30 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-FAST-NEXT: 
vmovdqa 32(%rax), %ymm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm17 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm16 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm28 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm26 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd 
{{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[8],ymm5[8],ymm11[9],ymm5[9],ymm11[10],ymm5[10],ymm11[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm26 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm5[4],ymm11[5],ymm5[5],ymm11[6],ymm5[6],ymm11[7],ymm5[7],ymm11[12],ymm5[12],ymm11[13],ymm5[13],ymm11[14],ymm5[14],ymm11[15],ymm5[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm22 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm29 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm27 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm23 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm11 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[8],ymm8[8],ymm1[9],ymm8[9],ymm1[10],ymm8[10],ymm1[11],ymm8[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm9 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[12],ymm6[12],ymm3[13],ymm6[13],ymm3[14],ymm6[14],ymm3[15],ymm6[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm6 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15] -; AVX512F-FAST-NEXT: vpunpckhwd 
{{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm10 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[8],ymm9[8],ymm0[9],ymm9[9],ymm0[10],ymm9[10],ymm0[11],ymm9[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm13 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm11[4],ymm2[5],ymm11[5],ymm2[6],ymm11[6],ymm2[7],ymm11[7],ymm2[12],ymm11[12],ymm2[13],ymm11[13],ymm2[14],ymm11[14],ymm2[15],ymm11[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm8[4],ymm1[5],ymm8[5],ymm1[6],ymm8[6],ymm1[7],ymm8[7],ymm1[12],ymm8[12],ymm1[13],ymm8[13],ymm1[14],ymm8[14],ymm1[15],ymm8[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm9[4],ymm0[5],ymm9[5],ymm0[6],ymm9[6],ymm0[7],ymm9[7],ymm0[12],ymm9[12],ymm0[13],ymm9[13],ymm0[14],ymm9[14],ymm0[15],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm11 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm14 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm12 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm20 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm17 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; 
AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm0, %zmm19 +; AVX512F-FAST-NEXT: vpermd (%rsp), %zmm1, %zmm19 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm4 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm24 ; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm24 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm1, %zmm25 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm5 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm1, %zmm27 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm1, %zmm5 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm19 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm1, %zmm19 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm0, %zmm4 +; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm1, %zmm4 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm1, %zmm25 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm1, %zmm18 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm31 +; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm1, %zmm31 {%k2} ; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 {%k1} # 
64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm1, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm1, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm1, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm5, %zmm6 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm5, %zmm1 +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm0, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm5, %zmm15 +; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm0, %zmm15 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 {%k2} # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 {%k2} # 
64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm28 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm29 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm21 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm22 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm26 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm5, %zmm26 {%k2} ; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm23 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm10, %zmm23 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm0, %zmm20 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm10, %zmm20 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm10, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm12 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm22, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm26 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm22, %zmm26 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm26 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm30 -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm22, %zmm30 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm30 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm31 -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm22, %zmm31 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm31 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, 
%zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm22, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm5, %zmm23 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm14, %zmm5 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm17, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm14 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm27, %zmm20 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm17, %zmm20 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm14 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm28 +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm27, %zmm28 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm17, %zmm28 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm14 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm29 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm27, %zmm29 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm17, %zmm29 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm14 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm30 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm27, %zmm30 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm17, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm14 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm27, %zmm9 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm17, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm14 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm18 -; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm22, %zmm18 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm10 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm22, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; 
AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm11, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm27, %zmm10 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm17, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm14 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm27, %zmm14 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm27 +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm17, %zmm14 {%k1} ; AVX512F-FAST-NEXT: movb $-86, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm26 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm30 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 832(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 960(%rax) 
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 896(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) -; AVX512F-FAST-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 768(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 960(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 896(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-FAST-NEXT: addq $2184, %rsp # imm = 0x888 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -7493,159 +7495,172 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: subq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm29 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm3, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; 
AVX512BW-NEXT: vpermt2w %zmm27, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm3, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm15, %zmm30 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm19, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm19, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm19, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm19, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm19, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm19, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm19, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm31 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm20, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm22, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm22, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm22, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm22, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm22, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm28 -; 
AVX512BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm22, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm4, %zmm6 ; AVX512BW-NEXT: vmovdqu64 
%zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm4, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm29 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm30 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm16 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 @@ -7655,14 +7670,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm23, %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm12 @@ -7670,101 +7685,101 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm23, %zmm1 ; AVX512BW-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm0 
{%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm0 {%k1} ; AVX512BW-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm18 {%k2} ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 
%zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) @@ -7776,7 +7791,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll index 0a59406d865e1e..5d6836ce5869c4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -257,58 +257,58 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps 96(%rdi), %xmm6 ; SSE-NEXT: movaps 80(%rdi), %xmm4 ; SSE-NEXT: movaps 64(%rdi), %xmm3 -; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm5 -; SSE-NEXT: movaps 96(%rsi), %xmm11 -; SSE-NEXT: movaps 80(%rsi), %xmm12 -; SSE-NEXT: movaps 64(%rsi), %xmm13 +; SSE-NEXT: movaps 96(%rsi), %xmm10 +; SSE-NEXT: movaps 80(%rsi), %xmm11 +; SSE-NEXT: movaps 64(%rsi), %xmm12 ; SSE-NEXT: movaps (%rsi), %xmm9 -; SSE-NEXT: movaps 16(%rsi), %xmm10 +; SSE-NEXT: movaps 16(%rsi), %xmm13 ; SSE-NEXT: movaps 32(%rsi), %xmm14 ; SSE-NEXT: movaps 48(%rsi), %xmm15 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movaps %xmm1, 
%xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; SSE-NEXT: movaps %xmm5, %xmm14 ; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] ; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; SSE-NEXT: movaps %xmm4, %xmm12 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: movaps 112(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; SSE-NEXT: movaps %xmm6, %xmm11 +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1] +; SSE-NEXT: movaps 112(%rsi), %xmm10 +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movaps %xmm0, 224(%rdx) -; SSE-NEXT: movaps %xmm7, 240(%rdx) +; SSE-NEXT: movaps %xmm8, 240(%rdx) ; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm12, 208(%rdx) +; SSE-NEXT: movaps %xmm11, 208(%rdx) ; SSE-NEXT: movaps %xmm4, 160(%rdx) -; SSE-NEXT: movaps %xmm13, 176(%rdx) +; SSE-NEXT: movaps %xmm12, 176(%rdx) ; SSE-NEXT: movaps %xmm3, 128(%rdx) ; SSE-NEXT: movaps %xmm15, 144(%rdx) ; SSE-NEXT: movaps %xmm5, 96(%rdx) ; SSE-NEXT: movaps %xmm14, 112(%rdx) ; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm10, 80(%rdx) +; SSE-NEXT: movaps %xmm13, 80(%rdx) ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: movaps %xmm7, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: retq @@ -434,7 +434,7 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-LABEL: store_i32_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 112(%rdi), %xmm14 +; SSE-NEXT: movaps 112(%rdi), %xmm15 ; SSE-NEXT: movaps 96(%rdi), %xmm13 ; SSE-NEXT: movaps 80(%rdi), %xmm11 ; SSE-NEXT: movaps 64(%rdi), %xmm10 @@ -449,9 +449,9 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm5 ; 
SSE-NEXT: movaps 48(%rsi), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm7 @@ -469,27 +469,27 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm15 ; SSE-NEXT: movaps 128(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm15, %xmm1 @@ -612,29 +612,29 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] @@ -649,14 +649,14 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] @@ -669,12 +669,12 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) 
; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 320(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) @@ -690,12 +690,12 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; ; AVX2-ONLY-LABEL: store_i32_stride2_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm10 @@ -705,14 +705,14 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm15[0,1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm13[2],ymm7[3],ymm13[3],ymm7[6],ymm13[6],ymm7[7],ymm13[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm7[0],ymm13[0],ymm7[1],ymm13[1],ymm7[4],ymm13[4],ymm7[5],ymm13[5] @@ -730,19 +730,18 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[0,1],ymm15[0,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = 
ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[6],ymm10[6],ymm4[7],ymm10[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm10[0],ymm4[1],ymm10[1],ymm4[4],ymm10[4],ymm4[5],ymm10[5] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[0,1],ymm15[0,1] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 384(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 320(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) @@ -753,7 +752,7 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vmovaps %ymm13, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index 5ff48a44c347a6..71716798c3c6ce 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -193,38 +193,38 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm5 -; SSE-NEXT: movaps 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps 16(%rdx), %xmm3 +; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps 16(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] ; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = 
xmm6[0,2],xmm7[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm5[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,0] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm5[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: movaps %xmm9, (%rcx) -; SSE-NEXT: movaps %xmm5, 16(%rcx) -; SSE-NEXT: movaps %xmm4, 48(%rcx) -; SSE-NEXT: movaps %xmm6, 64(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm8, 48(%rcx) +; SSE-NEXT: movaps %xmm3, 64(%rcx) ; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: retq @@ -378,71 +378,70 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps 32(%rdi), %xmm4 ; SSE-NEXT: movaps 48(%rdi), %xmm5 -; SSE-NEXT: movaps (%rsi), %xmm7 -; SSE-NEXT: movaps 16(%rsi), %xmm9 -; SSE-NEXT: movaps 32(%rsi), %xmm10 -; SSE-NEXT: movaps 48(%rsi), %xmm11 +; SSE-NEXT: movaps (%rsi), %xmm8 +; SSE-NEXT: movaps 16(%rsi), %xmm12 +; SSE-NEXT: movaps 32(%rsi), %xmm11 +; SSE-NEXT: movaps 48(%rsi), %xmm6 ; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps 48(%rdx), %xmm8 -; SSE-NEXT: movaps %xmm5, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdx), %xmm9 +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] ; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm11[3,3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm8[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm10[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm9[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,0] ; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm11[1] ; SSE-NEXT: movaps %xmm4, %xmm14 -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm10[3,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm13[0,2] +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm13[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] ; SSE-NEXT: movaps %xmm2, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] ; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3] -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm12[3,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm8[1] ; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] ; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] ; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm13[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[2,0] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm9[2,3] ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm8[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm15, (%rcx) -; SSE-NEXT: movaps %xmm7, 16(%rcx) -; SSE-NEXT: movaps %xmm13, 48(%rcx) -; SSE-NEXT: movaps %xmm9, 64(%rcx) -; SSE-NEXT: movaps %xmm12, 96(%rcx) -; SSE-NEXT: movaps %xmm10, 112(%rcx) -; SSE-NEXT: movaps %xmm6, 144(%rcx) -; SSE-NEXT: movaps %xmm11, 160(%rcx) +; SSE-NEXT: movaps %xmm14, (%rcx) +; SSE-NEXT: movaps %xmm8, 16(%rcx) +; SSE-NEXT: movaps %xmm15, 48(%rcx) +; SSE-NEXT: movaps %xmm12, 64(%rcx) +; SSE-NEXT: movaps %xmm10, 96(%rcx) +; SSE-NEXT: movaps %xmm11, 112(%rcx) +; SSE-NEXT: movaps %xmm7, 144(%rcx) +; SSE-NEXT: movaps %xmm6, 160(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] @@ -514,23 +513,23 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i32_stride3_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = mem[1,0,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = mem[1,0,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1] @@ -539,27 +538,27 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = 
ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm6[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -615,23 +614,23 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride3_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = mem[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = mem[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] ; 
AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1] @@ -640,27 +639,27 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -701,162 
+700,157 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm13 -; SSE-NEXT: movaps 32(%rsi), %xmm12 -; SSE-NEXT: movaps 48(%rsi), %xmm9 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps 16(%rdx), %xmm6 -; SSE-NEXT: movaps 32(%rdx), %xmm7 -; SSE-NEXT: movaps 48(%rdx), %xmm8 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm11[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm13[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm4 +; SSE-NEXT: movaps 32(%rdi), %xmm6 +; SSE-NEXT: movaps 48(%rdi), %xmm15 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rsi), %xmm1 +; SSE-NEXT: movaps 32(%rsi), %xmm13 +; SSE-NEXT: movaps 48(%rsi), %xmm5 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdx), %xmm9 +; SSE-NEXT: movaps 32(%rdx), %xmm10 +; SSE-NEXT: movaps 48(%rdx), %xmm11 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm8[0,3] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] ; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps 
%xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[0,3] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm12[3,3] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm9[3,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm1[3,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps 64(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm13[3,3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm11[0,3] +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] -; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps 64(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm14, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] +; SSE-NEXT: movaps 80(%rdi), %xmm6 ; SSE-NEXT: movaps 80(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: movaps 80(%rsi), %xmm8 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm8[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2] -; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps 96(%rdx), %xmm13 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[0,3] -; SSE-NEXT: movaps 96(%rsi), %xmm5 -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2] +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 96(%rdx), %xmm12 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[0,3] +; SSE-NEXT: movaps 96(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2] ; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 112(%rdx), %xmm7 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm7[0,3] -; SSE-NEXT: movaps 112(%rsi), %xmm1 +; SSE-NEXT: movaps 112(%rdx), %xmm9 ; SSE-NEXT: movaps %xmm0, 
%xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm9[0,3] +; SSE-NEXT: movaps 112(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[2,3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm9[2,3] ; SSE-NEXT: movaps %xmm1, 352(%rcx) -; SSE-NEXT: movaps %xmm3, 336(%rcx) -; SSE-NEXT: movaps %xmm5, 304(%rcx) -; SSE-NEXT: movaps %xmm6, 288(%rcx) +; SSE-NEXT: movaps %xmm2, 336(%rcx) +; SSE-NEXT: movaps %xmm4, 304(%rcx) +; SSE-NEXT: movaps %xmm7, 288(%rcx) ; SSE-NEXT: movaps %xmm8, 256(%rcx) -; SSE-NEXT: movaps %xmm11, 240(%rcx) -; SSE-NEXT: movaps %xmm12, 208(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rcx) +; SSE-NEXT: movaps %xmm10, 240(%rcx) +; SSE-NEXT: movaps %xmm11, 208(%rcx) +; SSE-NEXT: movaps %xmm13, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload ; SSE-NEXT: movaps %xmm1, 160(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 144(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rcx) @@ -872,30 +866,29 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 368(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; SSE-NEXT: movaps %xmm4, 320(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] -; SSE-NEXT: movaps %xmm10, 272(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] -; SSE-NEXT: movaps %xmm9, 224(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm0, 176(%rcx) -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] +; SSE-NEXT: movaps %xmm5, 320(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: movaps %xmm6, 272(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0,1,3] +; SSE-NEXT: movaps %xmm14, 224(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] +; SSE-NEXT: movaps %xmm15, 176(%rcx) +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,3] ; SSE-NEXT: movaps %xmm0, 128(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm0 @@ -903,25 +896,25 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm1[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1],xmm6[0,2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
xmm12 = xmm6[3,3],xmm5[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[3,3],xmm4[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm6[1] @@ -968,14 +961,14 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] @@ -994,14 +987,14 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 288(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 256(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1009,41 +1002,41 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i32_stride3_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $40, %rsp -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm10 ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-SLOW-NEXT: 
vmovaps 32(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[0,0,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm11[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = mem[1,0,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2] @@ -1067,27 +1060,27 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd 
{{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5,6],ymm13[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm12[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] @@ -1096,9 +1089,9 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm15, 288(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm14, 352(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1117,91 +1110,91 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride3_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm3 +; 
AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm13 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm12 ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm13 ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm14 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm9, %ymm5 ; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm14[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,0,2,1] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm10[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6],ymm6[7] ; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4],ymm15[5],ymm6[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm7[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = 
ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm9, %ymm14 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm8[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm14 ; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm15 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6],ymm10[7] -; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm14[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6],ymm8[7] -; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7] 
+; AVX2-FAST-NEXT: vpermps %ymm15, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm3[2],ymm11[3,4],ymm3[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7] +; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 288(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 288(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 352(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm12, 160(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm13, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm8, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm13, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm12, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm10, 224(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm5, 256(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1214,41 +1207,41 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-LABEL: store_i32_stride3_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} 
ymm11 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm11[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = mem[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2] @@ -1272,27 +1265,27 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = 
ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] @@ -1301,9 +1294,9 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 288(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 352(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1372,81 +1365,79 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rdi), %xmm4 -; SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: movaps 48(%rdi), %xmm6 -; SSE-NEXT: movaps (%rsi), %xmm0 -; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps 32(%rsi), %xmm14 -; SSE-NEXT: movaps 48(%rsi), 
%xmm3 -; SSE-NEXT: movaps (%rdx), %xmm7 -; SSE-NEXT: movaps 16(%rdx), %xmm8 -; SSE-NEXT: movaps 32(%rdx), %xmm9 -; SSE-NEXT: movaps 48(%rdx), %xmm10 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm6 +; SSE-NEXT: movaps 32(%rdi), %xmm7 +; SSE-NEXT: movaps 48(%rdi), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps 16(%rsi), %xmm3 +; SSE-NEXT: movaps 32(%rsi), %xmm1 +; SSE-NEXT: movaps 48(%rsi), %xmm0 +; SSE-NEXT: movaps (%rdx), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdx), %xmm11 +; SSE-NEXT: movaps 32(%rdx), %xmm12 +; SSE-NEXT: movaps 48(%rdx), %xmm13 +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3] +; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm11[3,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm6, %xmm2 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[0,3] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm14[3,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm9[1,1] 
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3] -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm11[0,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm10[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm12[0,3] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm1[3,3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm13[0,3] +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm0[3,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps 64(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] ; SSE-NEXT: movaps 64(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ 
-1460,11 +1451,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdx), %xmm1 +; SSE-NEXT: movaps 80(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] ; SSE-NEXT: movaps 80(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1478,11 +1468,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 96(%rdx), %xmm1 +; SSE-NEXT: movaps 96(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] ; SSE-NEXT: movaps 96(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1496,11 +1485,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 112(%rdx), %xmm1 +; SSE-NEXT: movaps 112(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] ; SSE-NEXT: movaps 112(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1514,11 +1502,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdx), %xmm1 +; SSE-NEXT: movaps 128(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] ; SSE-NEXT: movaps 128(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1532,11 +1519,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 144(%rdx), %xmm1 +; SSE-NEXT: movaps 144(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] ; SSE-NEXT: movaps 144(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1549,30 +1535,12 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm14 -; SSE-NEXT: movaps 160(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm12 -; SSE-NEXT: movaps 176(%rdx), %xmm1 +; SSE-NEXT: movaps 160(%rdi), %xmm12 +; SSE-NEXT: movaps 160(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rsi), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3] +; SSE-NEXT: movaps 160(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm12, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] @@ -1583,22 +1551,36 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdx), %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm13 +; SSE-NEXT: movaps 176(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rsi), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] +; SSE-NEXT: movaps 176(%rsi), %xmm14 ; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm11[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] -; SSE-NEXT: shufps 
{{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm14[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rsi), %xmm14 +; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm14[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] ; SSE-NEXT: movaps 208(%rdi), %xmm6 ; SSE-NEXT: movaps 208(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm6, %xmm0 @@ -1614,9 +1596,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2] ; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 224(%rdx), %xmm15 +; SSE-NEXT: movaps 224(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm15[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: movaps 224(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] @@ -1624,7 +1607,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2] ; SSE-NEXT: movaps 240(%rdi), %xmm2 ; SSE-NEXT: movaps 240(%rdx), %xmm9 @@ -1639,10 +1622,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] @@ -1651,10 +1634,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps 
$233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] @@ -1679,15 +1662,16 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3] ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[1,2],mem[2,3] ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[1,2],mem[2,3] ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm15[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm9[2,3] ; SSE-NEXT: movaps %xmm0, 736(%rcx) ; SSE-NEXT: movaps %xmm3, 720(%rcx) @@ -1695,9 +1679,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm7, 672(%rcx) ; SSE-NEXT: movaps %xmm8, 640(%rcx) ; SSE-NEXT: movaps %xmm10, 624(%rcx) -; SSE-NEXT: movaps %xmm11, 592(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 576(%rcx) +; SSE-NEXT: movaps %xmm14, 592(%rcx) +; SSE-NEXT: movaps %xmm15, 576(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1752,12 +1735,12 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm5, 704(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] ; SSE-NEXT: movaps %xmm6, 656(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] +; SSE-NEXT: movaps %xmm11, 608(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3] -; SSE-NEXT: movaps %xmm13, 608(%rcx) +; SSE-NEXT: movaps %xmm13, 560(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] -; SSE-NEXT: movaps %xmm12, 560(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0,1,3] -; SSE-NEXT: movaps %xmm14, 512(%rcx) +; SSE-NEXT: movaps %xmm12, 512(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 464(%rcx) @@ -1776,7 +1759,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 224(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 176(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload @@ -1785,7 +1768,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 @@ -1914,48 +1897,48 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 160(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1],xmm4[0,2] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vbroadcastsd 192(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3],xmm0[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,3],xmm0[3,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm0[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[1,1],xmm5[0,2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 224(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX1-ONLY-NEXT: 
vbroadcastsd 224(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3],xmm0[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,2] +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm0[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] @@ -1984,8 +1967,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] @@ -1998,13 +1981,13 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 608(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 512(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 512(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 320(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 736(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 672(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 640(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 576(%rcx) @@ -2038,21 +2021,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,0,2,1] @@ -2061,7 +2043,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] @@ -2069,16 +2051,16 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] @@ -2092,7 +2074,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] @@ -2104,8 +2086,9 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2123,8 +2106,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm11 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2180,44 +2163,44 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 248(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6],ymm12[7] ; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm15[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd $165, (%rsp), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] @@ -2237,10 +2220,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm3, 608(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 512(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm10, 416(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 320(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 224(%rcx) +; 
AVX2-SLOW-NEXT: vmovaps %ymm13, 320(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 224(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2283,22 +2266,22 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm13 ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] @@ -2313,59 +2296,59 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] ; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm5[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7] 
; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm10[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[0,0,2,1] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[2,1,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm2 @@ -2481,21 +2464,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,0,2,1] @@ -2504,7 +2486,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] @@ -2512,16 +2494,16 @@ define void 
@store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] @@ -2535,7 +2517,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] @@ -2547,8 +2529,9 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2566,8 +2549,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = 
ymm12[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2623,44 +2606,44 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, (%rsp), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] @@ -2680,10 +2663,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 608(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 512(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 416(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 320(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 320(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 224(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2722,10 +2705,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i32_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 
(%rsi), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 @@ -2735,51 +2718,51 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u> -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 +; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 -; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 -; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 ; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm11 +; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm11 ; AVX512-NEXT: vpermt2d %zmm10, %zmm20, %zmm11 -; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm1 -; AVX512-NEXT: vpermt2d %zmm10, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm2 +; AVX512-NEXT: vpermt2d %zmm10, %zmm14, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm6 ; AVX512-NEXT: vpermt2d %zmm9, %zmm17, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 +; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm10 ; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm10 -; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm2 -; AVX512-NEXT: vpermt2d %zmm9, %zmm14, %zmm2 -; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm15 +; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2d %zmm9, %zmm14, %zmm1 +; AVX512-NEXT: vpermi2d %zmm4, %zmm0, %zmm15 ; AVX512-NEXT: vpermt2d %zmm8, %zmm17, %zmm15 -; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm4 +; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm4 ; AVX512-NEXT: vpermt2d %zmm8, %zmm20, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 384(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, 512(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 576(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm3, 576(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm19, 640(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm13, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll index 31e462a139561b..4029f9fe0b5eab 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -236,55 +236,55 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm5 -; SSE-NEXT: movaps 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm7 -; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm8 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps 16(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm6 +; SSE-NEXT: movaps 16(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm8 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, 96(%r8) -; SSE-NEXT: movaps %xmm6, 112(%r8) -; SSE-NEXT: movaps %xmm8, 64(%r8) -; SSE-NEXT: movaps %xmm10, 80(%r8) +; SSE-NEXT: movaps %xmm3, 112(%r8) +; SSE-NEXT: movaps %xmm6, 64(%r8) +; SSE-NEXT: movaps %xmm8, 80(%r8) ; SSE-NEXT: movaps %xmm0, 32(%r8) -; 
SSE-NEXT: movaps %xmm5, 48(%r8) -; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps %xmm3, 16(%r8) +; SSE-NEXT: movaps %xmm2, 48(%r8) +; SSE-NEXT: movaps %xmm9, (%r8) +; SSE-NEXT: movaps %xmm10, 16(%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride4_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm2[1],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[1],xmm3[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 @@ -293,7 +293,7 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm5[0],xmm7[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm0[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 @@ -305,23 +305,23 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm7[2],xmm5[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,0],xmm2[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,0],xmm2[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm8[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -391,13 +391,13 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec 
define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride4_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm11 -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps (%rsi), %xmm0 -; SSE-NEXT: movaps 16(%rsi), %xmm3 -; SSE-NEXT: movaps 32(%rsi), %xmm9 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps (%rsi), %xmm3 +; SSE-NEXT: movaps 16(%rsi), %xmm0 +; SSE-NEXT: movaps 32(%rsi), %xmm2 ; SSE-NEXT: movaps (%rdx), %xmm7 ; SSE-NEXT: movaps 16(%rdx), %xmm13 ; SSE-NEXT: movaps 32(%rdx), %xmm10 @@ -406,72 +406,72 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 32(%rcx), %xmm12 ; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; SSE-NEXT: movaps %xmm13, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm15[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm15[0] ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movaps %xmm11, %xmm8 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = 
xmm13[0],xmm15[0] -; SSE-NEXT: movaps 48(%rdx), %xmm15 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] +; SSE-NEXT: movaps 48(%rdx), %xmm13 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; SSE-NEXT: movaps 48(%rcx), %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0] -; SSE-NEXT: movaps %xmm15, %xmm10 +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: movaps %xmm13, %xmm10 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps 48(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0] -; SSE-NEXT: movaps %xmm2, 224(%r8) -; SSE-NEXT: movaps %xmm1, 240(%r8) -; SSE-NEXT: movaps %xmm3, 192(%r8) +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm10[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] +; SSE-NEXT: movaps %xmm1, 224(%r8) +; SSE-NEXT: movaps %xmm2, 240(%r8) +; SSE-NEXT: movaps %xmm5, 192(%r8) ; SSE-NEXT: movaps %xmm0, 208(%r8) -; SSE-NEXT: movaps %xmm4, 160(%r8) -; SSE-NEXT: movaps %xmm9, 176(%r8) -; SSE-NEXT: movaps %xmm13, 128(%r8) -; SSE-NEXT: movaps %xmm14, 144(%r8) +; SSE-NEXT: movaps %xmm9, 160(%r8) +; SSE-NEXT: movaps %xmm3, 176(%r8) +; SSE-NEXT: movaps %xmm14, 128(%r8) +; SSE-NEXT: movaps %xmm15, 144(%r8) ; SSE-NEXT: movaps %xmm11, 96(%r8) ; SSE-NEXT: movaps %xmm8, 112(%r8) ; SSE-NEXT: movaps %xmm7, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm4, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps %xmm6, (%r8) @@ -483,76 +483,76 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $24, %rsp ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; 
AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[1],xmm9[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[1],xmm10[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm10[0],xmm7[1],xmm10[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[1],xmm5[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm15[1],xmm5[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[1],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm8[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm2[1],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = 
xmm6[0],xmm8[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm13[1],xmm15[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm13[1],xmm15[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm1[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[3,0],xmm7[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm12[2],xmm11[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm8[2],xmm6[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 @@ -563,19 +563,19 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm1[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm9[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm15[3,0],xmm13[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[3,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -589,27 +589,27 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i32_stride4_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 
= xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] @@ -624,34 +624,34 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm11 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm9 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[4],ymm9[4],ymm11[5],ymm9[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[4],ymm9[4],ymm11[5],ymm9[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r8) ; 
AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -740,141 +740,141 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps 16(%rdi), %xmm11 -; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: movaps 48(%rdi), %xmm13 -; SSE-NEXT: movaps (%rsi), %xmm5 -; SSE-NEXT: movaps 16(%rsi), %xmm2 +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm6 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps 48(%rdi), %xmm15 +; SSE-NEXT: movaps (%rsi), %xmm3 +; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rdx), %xmm4 +; SSE-NEXT: movaps (%rdx), %xmm7 +; SSE-NEXT: movaps 16(%rdx), %xmm8 ; SSE-NEXT: movaps 32(%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm7 -; SSE-NEXT: movaps 16(%rcx), %xmm8 -; SSE-NEXT: movaps 32(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] +; SSE-NEXT: movaps (%rcx), %xmm10 +; SSE-NEXT: movaps 16(%rcx), %xmm11 +; SSE-NEXT: movaps 32(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; SSE-NEXT: movaps %xmm5, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm13[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps 
{{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movaps %xmm9, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movaps 48(%rcx), %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movaps 48(%rcx), %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movaps %xmm15, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = 
xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 ; SSE-NEXT: movaps 64(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 64(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rdi), %xmm8 ; SSE-NEXT: movaps 64(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 80(%rdi), %xmm11 -; SSE-NEXT: movaps 80(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdx), %xmm1 +; SSE-NEXT: movaps 80(%rcx), %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps 80(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE-NEXT: movaps %xmm11, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdx), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: 
unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: movaps 96(%rdx), %xmm0 ; SSE-NEXT: movaps 96(%rcx), %xmm6 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps 96(%rdi), %xmm5 -; SSE-NEXT: movaps 96(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: movaps 96(%rsi), %xmm5 ; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movaps %xmm5, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 112(%rdx), %xmm2 -; SSE-NEXT: movaps 112(%rcx), %xmm7 +; SSE-NEXT: movaps 112(%rcx), %xmm5 ; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 112(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm0, %xmm1 @@ -882,7 +882,7 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] @@ -891,27 +891,27 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm4, 480(%r8) ; SSE-NEXT: movaps %xmm1, 464(%r8) ; SSE-NEXT: movaps %xmm3, 448(%r8) -; SSE-NEXT: movaps %xmm5, 432(%r8) -; SSE-NEXT: movaps %xmm10, 416(%r8) -; SSE-NEXT: movaps %xmm9, 400(%r8) -; SSE-NEXT: movaps %xmm12, 384(%r8) -; SSE-NEXT: movaps %xmm11, 368(%r8) -; SSE-NEXT: movaps %xmm15, 352(%r8) -; SSE-NEXT: movaps %xmm8, 336(%r8) +; SSE-NEXT: movaps %xmm9, 432(%r8) +; SSE-NEXT: movaps %xmm11, 416(%r8) +; SSE-NEXT: movaps %xmm12, 400(%r8) +; SSE-NEXT: movaps %xmm14, 384(%r8) +; SSE-NEXT: movaps %xmm10, 368(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%r8) +; SSE-NEXT: movaps %xmm7, 336(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%r8) -; SSE-NEXT: movaps %xmm13, 304(%r8) +; SSE-NEXT: movaps %xmm8, 304(%r8) ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%r8) -; SSE-NEXT: movaps %xmm14, 272(%r8) +; SSE-NEXT: movaps %xmm13, 272(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%r8) +; SSE-NEXT: movaps %xmm15, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1004,10 +1004,10 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm14[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1018,11 +1018,11 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm10[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1034,81 +1034,81 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm12[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm11[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[1],xmm8[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[1],xmm8[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm2[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm1[2],xmm13[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm1[2],xmm13[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm1[2],xmm13[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm13[2],xmm1[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm14[2],xmm1[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -1117,47 +1117,47 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm9[2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,zero,xmm10[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[3,0],xmm13[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm7[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm6[2],xmm7[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[3,0],xmm10[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm1[3,0],xmm12[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm5[2] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[3,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[3,0],xmm1[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[3,0],xmm11[3,0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm4[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[3,0],xmm9[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1187,19 +1187,19 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 @@ -1207,40 +1207,40 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-ONLY-NEXT: 
vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] @@ -1255,66 +1255,66 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm14 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3],ymm13[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm6 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = 
ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[4],ymm4[4],ymm14[5],ymm4[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm6[0],ymm14[1],ymm6[1],ymm14[4],ymm6[4],ymm14[5],ymm6[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[4],ymm15[4],ymm13[5],ymm15[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[6],ymm6[6],ymm14[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm13[0],ymm6[0],ymm13[1],ymm6[1],ymm13[4],ymm6[4],ymm13[5],ymm6[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%r8) +; AVX2-ONLY-NEXT: 
vmovaps %ymm4, 224(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm10, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 384(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1467,83 +1467,83 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps (%rdi), %xmm10 ; SSE-NEXT: movaps 16(%rdi), %xmm11 ; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: movaps 48(%rdi), %xmm13 -; SSE-NEXT: movaps (%rsi), %xmm4 -; SSE-NEXT: movaps 16(%rsi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm15 +; SSE-NEXT: movaps (%rsi), %xmm3 +; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rdx), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: movaps 16(%rdx), %xmm6 ; SSE-NEXT: movaps 32(%rdx), %xmm1 ; SSE-NEXT: movaps (%rcx), %xmm7 ; SSE-NEXT: movaps 16(%rcx), %xmm8 -; SSE-NEXT: movaps 32(%rcx), %xmm5 -; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: movaps 32(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm9 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm9[0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: 
unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm3[0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: movaps 48(%rcx), %xmm3 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movaps 48(%rcx), %xmm2 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] ; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: movaps %xmm15, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = 
xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm3[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 ; SSE-NEXT: movaps 64(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 @@ -1707,58 +1707,58 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 192(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 192(%rdi), %xmm12 +; SSE-NEXT: movaps 192(%rdi), %xmm10 ; SSE-NEXT: movaps 192(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; SSE-NEXT: movaps %xmm10, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdx), %xmm0 -; SSE-NEXT: movaps 208(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm13 -; SSE-NEXT: movaps 208(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm13, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdx), %xmm1 +; SSE-NEXT: movaps 208(%rcx), %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps 208(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm12, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd 
{{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdx), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] +; SSE-NEXT: movaps 224(%rdx), %xmm0 ; SSE-NEXT: movaps 224(%rcx), %xmm6 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: movaps 224(%rdi), %xmm9 ; SSE-NEXT: movaps 224(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: movaps %xmm11, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movaps %xmm9, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movaps %xmm5, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 240(%rdx), %xmm2 -; SSE-NEXT: movaps 240(%rcx), %xmm7 +; SSE-NEXT: movaps 240(%rcx), %xmm5 ; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 ; SSE-NEXT: movaps 240(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm0, %xmm1 @@ -1766,7 +1766,7 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] @@ -1775,20 +1775,20 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm4, 992(%r8) ; SSE-NEXT: movaps %xmm1, 976(%r8) ; SSE-NEXT: movaps %xmm3, 960(%r8) -; SSE-NEXT: movaps %xmm5, 944(%r8) -; SSE-NEXT: movaps %xmm10, 928(%r8) -; SSE-NEXT: movaps %xmm9, 912(%r8) -; SSE-NEXT: movaps %xmm11, 896(%r8) -; SSE-NEXT: movaps %xmm13, 880(%r8) +; SSE-NEXT: movaps %xmm9, 944(%r8) +; SSE-NEXT: movaps %xmm8, 928(%r8) +; SSE-NEXT: movaps %xmm11, 912(%r8) +; SSE-NEXT: movaps %xmm14, 896(%r8) +; SSE-NEXT: movaps %xmm12, 880(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%r8) -; SSE-NEXT: movaps %xmm8, 848(%r8) +; SSE-NEXT: movaps %xmm7, 848(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%r8) -; SSE-NEXT: movaps %xmm12, 816(%r8) +; SSE-NEXT: movaps %xmm10, 816(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%r8) -; SSE-NEXT: movaps %xmm14, 784(%r8) +; SSE-NEXT: movaps %xmm13, 784(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%r8) ; SSE-NEXT: movaps %xmm15, 752(%r8) @@ -1891,7 +1891,7 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 +; AVX1-ONLY-NEXT: subq $1400, %rsp # imm = 0x578 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 @@ -2086,12 +2086,13 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2102,11 +2103,11 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm10[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2117,25 +2118,24 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm4[0] +; AVX1-ONLY-NEXT: 
vmovaps 208(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm6[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm11[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[1],xmm11[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2143,211 +2143,213 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[1],xmm7[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm1[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm5[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 
# 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, 
%ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = 
xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = 
xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] 
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm13[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm1[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[3,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm4[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,zero,xmm10[2],xmm13[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[3,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3],ymm10[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm6[2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[3,0],xmm10[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[3,0],xmm10[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm3[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm1[2],xmm5[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 928(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 864(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 800(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 736(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 672(%r8) +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm11[3,0],xmm14[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm5[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[3,0],xmm9[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 992(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 928(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 864(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 800(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 736(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 
608(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2400,58 +2402,58 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 +; AVX1-ONLY-NEXT: addq $1400, %rsp # imm = 0x578 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride4_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[0,0,2,1] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3],ymm13[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2713,109 +2715,109 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm23 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm12 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm10 ; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm25 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512F-NEXT: 
vmovdqa64 (%rdx), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm12 ; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm19 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm19 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm21 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm13, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm8, %zmm4 ; AVX512F-NEXT: movb $-86, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512F-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> -; AVX512F-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm22, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm14, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm23, %zmm24 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm25, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512F-NEXT: vpermt2d %zmm19, %zmm24, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm13, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm16, %zmm24 
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm11, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm18, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm15, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1} -; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm7, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm8, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm14, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm23, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm25, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm27 {%k1} +; AVX512F-NEXT: vpermt2d %zmm20, %zmm24, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm13, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm8, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm22, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm23, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm25, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2d %zmm21, %zmm24, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512F-NEXT: vpermi2d %zmm9, %zmm6, %zmm13 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} +; AVX512F-NEXT: vpermi2d %zmm9, %zmm6, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm14 {%k1} +; AVX512F-NEXT: vpermi2d %zmm9, %zmm6, %zmm23 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm25 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm16, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm18, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512F-NEXT: vpermt2d %zmm19, %zmm20, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1} -; AVX512F-NEXT: vpermi2d %zmm9, %zmm6, %zmm14 -; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512F-NEXT: vpermi2d %zmm9, %zmm6, %zmm16 -; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1} -; AVX512F-NEXT: vpermi2d 
%zmm9, %zmm6, %zmm18 -; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512F-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm24, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 832(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 960(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 768(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 832(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm20, 704(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm18, 512(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 576(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 256(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2826,109 +2828,109 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm12 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm19 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm21 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm13, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm8, %zmm4 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm22, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm23, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm25, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm24, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm18, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm15, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm14, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm7, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm14, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm23, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm25, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm15, 
%zmm27 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm24, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm13, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm8, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm23, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm24, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm6, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm6, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm6, %zmm23 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm16, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm18, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm20, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm6, %zmm14 -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm6, %zmm16 -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm6, %zmm18 -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 960(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 768(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 832(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 704(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 512(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 576(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%r8) +; AVX512BW-NEXT: 
vmovdqa64 %zmm26, 256(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index f0c2f3f23a2d03..f0816847956fa9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -322,75 +322,74 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i32_stride5_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm0 ; SSE-NEXT: movdqa (%rsi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm3 -; SSE-NEXT: movdqa (%rdx), %xmm7 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movaps (%rcx), %xmm5 ; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps (%r8), %xmm0 +; SSE-NEXT: movaps (%r8), %xmm8 ; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm11[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0,2] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm0[3,3] +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm11[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,2] +; SSE-NEXT: movaps %xmm5, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm8[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm13[0],xmm9[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[0,2] -; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm13[0],xmm10[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[0,2] +; SSE-NEXT: movaps %xmm0, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm11[2,3] ; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,0] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; SSE-NEXT: movaps %xmm9, %xmm14 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm11[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,1] 
-; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] ; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,0],xmm8[2,0] +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[2,0] ; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm14[0,1] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm14[0,1] ; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,0] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm8[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm14, (%r9) -; SSE-NEXT: movaps %xmm4, 32(%r9) +; SSE-NEXT: movaps %xmm5, 32(%r9) ; SSE-NEXT: movaps %xmm15, 48(%r9) ; SSE-NEXT: movaps %xmm13, 80(%r9) ; SSE-NEXT: movaps %xmm2, 112(%r9) ; SSE-NEXT: movaps %xmm12, 128(%r9) ; SSE-NEXT: movaps %xmm1, 16(%r9) -; SSE-NEXT: movaps %xmm9, 64(%r9) -; SSE-NEXT: movaps %xmm5, 96(%r9) -; SSE-NEXT: movaps %xmm8, 144(%r9) +; SSE-NEXT: movaps %xmm10, 64(%r9) +; SSE-NEXT: movaps %xmm0, 96(%r9) +; SSE-NEXT: movaps %xmm7, 144(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf8: @@ -684,169 +683,167 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa 32(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm10 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa 32(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm5 -; SSE-NEXT: movaps 16(%rcx), %xmm14 -; SSE-NEXT: movaps 32(%rcx), %xmm12 -; SSE-NEXT: movaps (%r8), %xmm3 -; SSE-NEXT: movaps 
16(%r8), %xmm15 -; SSE-NEXT: movaps 32(%r8), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa (%rsi), %xmm7 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm9 +; SSE-NEXT: movdqa 16(%rdx), %xmm8 +; SSE-NEXT: movdqa 32(%rdx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm10 +; SSE-NEXT: movaps 16(%rcx), %xmm15 +; SSE-NEXT: movaps 32(%rcx), %xmm11 +; SSE-NEXT: movaps (%r8), %xmm13 +; SSE-NEXT: movaps 16(%r8), %xmm4 +; SSE-NEXT: movaps 32(%r8), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] +; SSE-NEXT: movdqa 48(%rdx), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm9 +; SSE-NEXT: movaps 48(%rcx), %xmm6 ; SSE-NEXT: movaps 48(%r8), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; 
SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm13[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm15, %xmm13 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,1] ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[2,0] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,1] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm9[0] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: movaps %xmm6, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,0] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm9, 288(%r9) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm9[0,1] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm9[2,0] +; SSE-NEXT: movaps (%rsp), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm9[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm9[0],xmm12[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm9[0],xmm7[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm9[0],xmm3[1,2,3] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm6, 288(%r9) ; SSE-NEXT: movaps %xmm4, 272(%r9) -; SSE-NEXT: movdqa %xmm5, 240(%r9) -; SSE-NEXT: movaps %xmm15, 208(%r9) -; SSE-NEXT: movaps %xmm12, 192(%r9) -; SSE-NEXT: movdqa %xmm6, 160(%r9) -; SSE-NEXT: movaps %xmm14, 128(%r9) -; SSE-NEXT: movaps %xmm10, 112(%r9) -; SSE-NEXT: movdqa %xmm8, 80(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 304(%r9) -; SSE-NEXT: movaps %xmm2, 256(%r9) +; SSE-NEXT: movdqa %xmm2, 240(%r9) +; SSE-NEXT: movaps %xmm5, 208(%r9) +; SSE-NEXT: movaps %xmm11, 192(%r9) +; SSE-NEXT: movdqa %xmm8, 160(%r9) +; SSE-NEXT: movaps %xmm15, 128(%r9) +; SSE-NEXT: movaps %xmm13, 112(%r9) +; SSE-NEXT: movdqa %xmm10, 80(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; 
SSE-NEXT: movaps %xmm1, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 304(%r9) +; SSE-NEXT: movaps %xmm0, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) ; SSE-NEXT: movaps %xmm3, 176(%r9) @@ -855,33 +852,33 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm7, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) -; SSE-NEXT: movaps %xmm13, 16(%r9) +; SSE-NEXT: movaps %xmm12, 16(%r9) ; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm10[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm10[1],xmm6[1],zero ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm10[0],xmm6[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3],ymm1[4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3],ymm5[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm8[1],zero ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 @@ -894,8 +891,8 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 @@ -903,15 +900,15 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm10, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm10 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm11[2],xmm10[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1],xmm10[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1],ymm2[1,1],ymm7[5,5],ymm2[5,5] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1],ymm3[1,1],ymm7[5,5],ymm3[5,5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm11 @@ -920,8 +917,8 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[3,3],ymm1[3,3],ymm3[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[3,3],ymm2[3,3],ymm4[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4],ymm11[5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm12[2,3,2,3] @@ -952,29 +949,29 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm13 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[1,1],ymm13[1,1],ymm14[5,5],ymm13[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm4[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm13[1,1],ymm14[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
ymm2 = ymm15[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm6[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm12[3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm13[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r9) @@ -992,29 +989,29 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i32_stride5_vf16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm6 ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm8 ; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm7 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm14 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] ; 
AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm15[1],ymm8[2,3,4,5],ymm15[6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm15[1],ymm4[2,3,4,5],ymm15[6],ymm4[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] @@ -1035,7 +1032,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] @@ -1046,19 +1043,19 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm5[1],ymm12[2,3,4],ymm5[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm6[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] @@ -1069,12 +1066,12 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; 
AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] @@ -1084,13 +1081,13 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] @@ -1102,7 +1099,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 224(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm13, 288(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm12, 256(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm11, 160(%r9) @@ -1116,81 +1113,81 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride5_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm3 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm9 -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 
(%rdi), %xmm8 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <0,1,0,1,u,u,2,2> -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm11 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm13 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm15 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm11[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm13[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm5[1],ymm2[2,3,4,5],ymm5[6],ymm2[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm12, %ymm8 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,3,2,3,2,3,2] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm11 ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm2[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm1[1,2],ymm13[3,4],ymm1[5,6],ymm13[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4],ymm13[5],ymm8[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3,4],ymm13[5,6],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm13[3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1],ymm13[2],ymm5[3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1],ymm13[2],ymm2[3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm10 ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm9, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm9, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3,4],ymm0[5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3,4],ymm0[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm14 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps 
{{.*#+}} ymm15 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] @@ -1199,35 +1196,35 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm2[1,2],ymm5[3,4],ymm2[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2,3,4],ymm15[5],ymm6[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm5[1,2,3,4],ymm15[5],ymm5[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm0, 
128(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm9, 160(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm13, 288(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm8, 256(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-NEXT: vmovaps %ymm6, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1238,29 +1235,29 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-LABEL: store_i32_stride5_vf16: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm15[1],ymm8[2,3,4,5],ymm15[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm15[1],ymm4[2,3,4,5],ymm15[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] @@ -1281,7 +1278,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] ; 
AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] @@ -1292,19 +1289,19 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm5[1],ymm12[2,3,4],ymm5[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm6[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] @@ -1315,12 +1312,12 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4,5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} 
ymm14 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] @@ -1330,13 +1327,13 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] @@ -1348,7 +1345,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 224(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 288(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 256(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 160(%r9) @@ -1509,81 +1506,81 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $712, %rsp # imm = 0x2C8 +; SSE-NEXT: subq $728, %rsp # imm = 0x2D8 ; SSE-NEXT: movdqa (%rsi), %xmm9 ; SSE-NEXT: movdqa 16(%rsi), %xmm7 ; SSE-NEXT: movdqa 32(%rsi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm11 ; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa 32(%rdx), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm6 ; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm5 -; SSE-NEXT: movaps 32(%rcx), %xmm6 -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps 16(%r8), %xmm13 +; SSE-NEXT: movaps 16(%rcx), %xmm4 +; SSE-NEXT: movaps 32(%rcx), %xmm5 +; SSE-NEXT: movaps (%r8), %xmm15 +; SSE-NEXT: movaps 16(%r8), %xmm14 ; SSE-NEXT: movaps 32(%r8), %xmm12 ; 
SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm8 +; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps 48(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] +; SSE-NEXT: movaps 48(%rcx), %xmm8 +; SSE-NEXT: movaps 48(%r8), %xmm2 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] +; SSE-NEXT: movaps 64(%r8), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdx), 
%xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] +; SSE-NEXT: movaps 80(%r8), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm0 @@ -1595,257 +1592,255 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 96(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] +; SSE-NEXT: movaps 96(%r8), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rsi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm15 -; SSE-NEXT: movaps 112(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] +; SSE-NEXT: movaps 112(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%r8), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm2, 
%xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm15[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm15[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm14[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm12[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps 48(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0] -; SSE-NEXT: 
movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movaps 64(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 80(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movaps 80(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: movaps %xmm1, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte 
Reload -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm14, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movaps 112(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = 
xmm14[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[1,1],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm14[0],xmm0[1,2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm0[0],xmm9[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte 
Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm0[0],xmm9[1,2,3] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: movaps %xmm1, 608(%r9) -; SSE-NEXT: movaps %xmm11, 592(%r9) +; SSE-NEXT: movaps %xmm0, 608(%r9) +; SSE-NEXT: movaps %xmm12, 592(%r9) ; SSE-NEXT: movaps %xmm4, 560(%r9) -; SSE-NEXT: movaps %xmm6, 528(%r9) +; SSE-NEXT: movaps %xmm5, 528(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 512(%r9) -; SSE-NEXT: movaps %xmm8, 480(%r9) +; SSE-NEXT: movaps %xmm7, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%r9) -; SSE-NEXT: movaps %xmm12, 432(%r9) +; SSE-NEXT: movaps %xmm10, 432(%r9) ; SSE-NEXT: movaps %xmm13, 400(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%r9) @@ -1882,180 +1877,181 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm3, 576(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%r9) -; SSE-NEXT: movaps %xmm5, 496(%r9) +; SSE-NEXT: movaps %xmm6, 496(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r9) -; SSE-NEXT: movaps %xmm7, 416(%r9) +; SSE-NEXT: movaps %xmm9, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r9) -; SSE-NEXT: movaps %xmm9, 336(%r9) +; SSE-NEXT: movaps %xmm11, 336(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) -; SSE-NEXT: movaps %xmm10, 256(%r9) +; SSE-NEXT: movaps %xmm15, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps %xmm2, 176(%r9) +; SSE-NEXT: movaps %xmm14, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) -; SSE-NEXT: movaps %xmm15, 96(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: addq $712, %rsp # imm = 0x2C8 +; SSE-NEXT: addq $728, %rsp # imm = 0x2D8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf32: ; 
AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm9[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0],xmm6[0],zero,zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm9[1],xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0],xmm8[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm15[0],xmm14[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm14[0],xmm6[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm4[1],xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm3[0],zero,zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm4[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm2[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm2[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm3[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm0[1],xmm1[1],zero -; 
AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = xmm0[0],xmm1[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2,3],ymm10[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm12[1],xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = xmm12[0],xmm13[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2,3],ymm10[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm9[2],xmm6[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm5[0],xmm10[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm1[1],xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = xmm1[0],xmm3[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm11[1,2,3],ymm12[4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm13[1],xmm12[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm13[0],xmm12[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 
96(%r8), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1,2,3],ymm0[4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm9[2],xmm8[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5],ymm6[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm6[2],xmm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm15 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm7[2],xmm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,1],ymm15[1,1],ymm6[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[3,3],xmm6[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm15 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1],ymm15[1,1],ymm7[5,5],ymm15[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3],xmm7[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2,3,4],ymm9[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm9[2],ymm7[3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm4[2],xmm3[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1],xmm2[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[3,3],ymm8[3,3],ymm9[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm8[2],ymm6[3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm4[2],xmm2[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[1,1],ymm4[5,5],ymm3[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm3 
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm2[1,1],ymm4[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm3[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm2[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,3],ymm4[3,3],ymm14[7,7],ymm4[7,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,3],ymm4[3,3],ymm11[7,7],ymm4[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4],ymm4[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm0[2],xmm1[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm3[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2064,74 +2060,74 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), 
%ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm10[1,1],ymm9[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm9[1,1],ymm8[5,5],ymm9[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm0[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3],xmm0[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,3],ymm8[3,3],ymm5[7,7],ymm8[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm3[1,2,3,4],ymm11[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,3],ymm7[3,3],ymm10[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm2[1,2,3,4],ymm14[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm11[2],ymm4[3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm14[2],ymm3[3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm12[2],xmm13[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm13[2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm4[2],xmm3[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1],xmm3[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm3[2],xmm2[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1],xmm2[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm12[1,1],ymm13[5,5],ymm12[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,1],ymm12[1,1],ymm13[5,5],ymm12[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2],ymm3[3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm4[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm3[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm4[3,3],ymm1[3,3],ymm4[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3,4],ymm3[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm3[2],ymm11[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm3[3,3],ymm1[3,3],ymm3[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0],ymm6[1,2,3,4],ymm2[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3],ymm11[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3],ymm14[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm14[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4],ymm11[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -2142,26 +2138,26 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1,2,3],ymm11[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2],mem[3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1,0,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm12[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm12[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 608(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 576(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 608(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2203,20 +2199,20 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%r8), %ymm13 ; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm9 ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] @@ -2225,27 +2221,27 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0],ymm5[1,2,3],ymm11[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm15[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[0,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3],ymm7[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] @@ -2473,155 +2469,155 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $600, %rsp # imm = 0x258 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm11 -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: subq $584, %rsp # imm = 0x248 +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm12 +; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <0,1,0,1,u,u,2,2> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm14 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm15 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2],xmm9[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm12[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4,5],ymm9[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2],xmm9[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4,5],ymm9[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2],xmm11[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm11 -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7] +; AVX2-FAST-NEXT: 
vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm10 ; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm14 -; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm13[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps 96(%r8), %ymm11 -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm13 +; AVX2-FAST-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2],xmm10[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovaps 96(%r8), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm12, %ymm7 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = [0,1,3,2,3,2,3,2] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm8 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm9 -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7] +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [0,1,3,2,3,2,3,2] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 +; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm14 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1,2],ymm7[3,4],ymm10[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm6, %ymm7 ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm10 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1,2],ymm6[3,4],ymm13[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpermps 
%ymm2, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm3, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3,4],ymm8[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2],ymm2[3,4],ymm6[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0],ymm1[1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm5 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm15 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm4 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm2 @@ -2633,7 +2629,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2641,86 +2637,86 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm12 -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2],ymm15[3,4],ymm6[5,6],ymm15[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 120(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3],ymm14[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0,1,2],mem[3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = 
ymm7[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1,2],ymm14[3,4],ymm1[5,6],ymm14[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2],ymm14[3,4],ymm9[5,6],ymm14[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 120(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm15[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 544(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm2, 384(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm6, 608(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm10, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm9, 608(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%r9) @@ -2746,7 +2742,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $600, %rsp # imm = 0x258 +; AVX2-FAST-NEXT: addq $584, %rsp # imm = 0x248 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -2759,20 +2755,20 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r8), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: 
vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] @@ -2781,27 +2777,27 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0],ymm5[1,2,3],ymm11[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm15[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3],ymm7[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] @@ -3032,103 +3028,103 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm8 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm14 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm13, %zmm3 ; AVX512F-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512F-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm14, %zmm15 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm16, %zmm17 ; AVX512F-NEXT: movw $25368, %ax # imm = 0x6318 ; 
AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm17 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm18, %zmm19 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm10, %zmm19, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm22, %zmm23 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm24, %zmm25 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm10, %zmm23, %zmm25 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm26, %zmm27 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; 
AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512F-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm24, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm27, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm11, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm0 {%k1} +; AVX512F-NEXT: vpermt2d %zmm10, %zmm12, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm16 +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm16 {%k2} +; AVX512F-NEXT: vpermt2d %zmm5, %zmm15, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2d %zmm5, %zmm19, %zmm20 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm24 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm24 {%k2} +; AVX512F-NEXT: vpermt2d %zmm5, %zmm23, %zmm24 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm26, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 ; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k3} ; AVX512F-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 256(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 576(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3138,103 +3134,103 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm13, %zmm3 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm14, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm17 ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm25, 
%zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm18, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm22, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm23, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm27 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm27, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm0 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm10, 
%zmm12, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm16 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm24 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm23, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm26, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k3} ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 576(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3256,68 +3252,66 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1736, %rsp # imm = 0x6C8 -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: subq $1752, %rsp # imm = 0x6D8 +; SSE-NEXT: movdqa (%rsi), %xmm12 +; SSE-NEXT: movdqa 16(%rsi), %xmm10 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm8 +; SSE-NEXT: movdqa (%rdx), %xmm14 ; SSE-NEXT: movdqa 16(%rdx), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm5 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps 32(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps 32(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdx), %xmm15 +; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movaps 16(%rcx), %xmm6 +; SSE-NEXT: movaps 32(%rcx), %xmm9 +; SSE-NEXT: movaps (%r8), %xmm5 +; SSE-NEXT: movaps 16(%r8), %xmm8 +; SSE-NEXT: movaps 32(%r8), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] 
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm8[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm12 +; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm6 -; SSE-NEXT: movaps 48(%r8), %xmm7 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] +; SSE-NEXT: movaps 48(%rcx), %xmm7 +; SSE-NEXT: movaps 48(%r8), %xmm11 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdx), %xmm15 +; SSE-NEXT: movdqa 64(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm13 +; SSE-NEXT: movaps 64(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%r8), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3328,10 +3322,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm14 +; SSE-NEXT: movaps 80(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%r8), 
%xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3470,170 +3464,127 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 240(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%r8), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] +; SSE-NEXT: movaps 240(%r8), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movaps 16(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm11[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm11[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm7[1,1] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm13[1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps 80(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movaps 48(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm7, %xmm4 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm11[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm11[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 64(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3652,34 +3603,34 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 128(%rdi), %xmm5 +; SSE-NEXT: movaps 96(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 144(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -3698,260 +3649,308 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 160(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movaps 128(%rdi), %xmm5 +; 
SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movaps 144(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 160(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm4 +; 
SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 192(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movaps 192(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: 
movaps %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movaps 208(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: movaps 224(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm15, %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movaps 240(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,1] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm2[2,0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1248(%r9) ; SSE-NEXT: movaps %xmm3, 1232(%r9) -; SSE-NEXT: movaps %xmm6, 1200(%r9) -; SSE-NEXT: movaps %xmm7, 1168(%r9) +; SSE-NEXT: movaps %xmm4, 1200(%r9) +; SSE-NEXT: movaps %xmm6, 1168(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1152(%r9) ; SSE-NEXT: movaps %xmm8, 1120(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1088(%r9) ; SSE-NEXT: movaps %xmm9, 1072(%r9) -; SSE-NEXT: movaps %xmm13, 1040(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1008(%r9) +; SSE-NEXT: movaps %xmm11, 1040(%r9) +; SSE-NEXT: movaps %xmm12, 1008(%r9) ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 992(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4033,25 +4032,27 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm5, 1216(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1184(%r9) -; SSE-NEXT: movaps %xmm4, 1136(%r9) +; SSE-NEXT: movaps %xmm7, 1136(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1104(%r9) -; SSE-NEXT: movaps %xmm12, 1056(%r9) +; SSE-NEXT: movaps %xmm10, 1056(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1024(%r9) -; SSE-NEXT: movaps %xmm10, 976(%r9) +; SSE-NEXT: movaps %xmm13, 976(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 944(%r9) -; SSE-NEXT: movaps %xmm11, 896(%r9) +; SSE-NEXT: movaps %xmm1, 896(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%r9) ; SSE-NEXT: movaps %xmm14, 816(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%r9) -; SSE-NEXT: movaps %xmm2, 736(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 736(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 704(%r9) -; SSE-NEXT: movaps %xmm15, 656(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 656(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 624(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4070,11 +4071,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 336(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) @@ -4084,115 +4085,115 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: addq $1736, %rsp # imm = 0x6C8 +; SSE-NEXT: addq $1752, %rsp # imm = 0x6D8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1784, %rsp # imm = 0x6F8 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm6[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; 
AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[1],xmm10[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0],xmm10[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm14[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm15[1],xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm15[0],xmm14[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[1],xmm15[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm6[0],xmm15[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = 
xmm2[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm10[1],xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm13[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = xmm13[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm7[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm8[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm4[1,2,3],ymm7[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm3 = xmm3[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 132(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 132(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm7 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm8[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = xmm8[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm4[1,2,3],ymm9[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm11[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1,2,3],ymm0[4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm4[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm7[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1,2,3],ymm0[4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4200,15 +4201,15 @@ define void 
@store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm9[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm0[1],zero ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4219,23 +4220,22 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 228(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm0[1],xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = xmm0[0],xmm1[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm0[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm1[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm10[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] @@ -4243,90 +4243,92 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm14[3,3],ymm2[7,7],ymm14[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm15[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps 
{{.*#+}} xmm1 = zero,zero,xmm6[2],xmm15[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm14[1,1],ymm15[5,5],ymm14[5,5] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm1 ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm10[2],xmm13[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm13[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4334,32 +4336,32 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm7[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm8[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4378,9 +4380,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 @@ -4392,7 +4394,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4400,9 +4402,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 132(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm8[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm11[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4438,14 +4440,14 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 164(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm4[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm7[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4477,7 +4479,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4499,9 +4501,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[1,1],ymm11[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm1 @@ -4510,12 +4511,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3],ymm8[3,3],ymm9[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] @@ -4538,9 +4539,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm5[1,1],ymm4[5,5],ymm5[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm0 @@ -4550,47 +4551,49 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4],ymm7[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0],ymm3[1,2,3,4],ymm10[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm7[2],ymm5[3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm10[2],ymm6[3,4,5,6],ymm10[7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm11[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm14[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm15[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2],ymm15[3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -4626,30 +4629,30 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte 
Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 1024(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 1024(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 864(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 704(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1248(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4664,7 +4667,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9) @@ -4676,7 +4679,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) @@ -4723,24 +4726,24 @@ define void @store_i32_stride5_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: subq $1736, %rsp # imm = 0x6C8 ; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm15 ; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm14 -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%r8), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm13 ; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovaps 64(%r8), %ymm14 +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm9 ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] @@ -4749,34 +4752,34 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0],ymm5[1,2,3],ymm11[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm15[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = 
xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[0,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3],ymm7[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,1,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm13[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -4796,7 +4799,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,1,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm0 @@ -4930,154 +4933,152 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm15 -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm13 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm13 -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm14 +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] -; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm13 -; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm14 +; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm13 -; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %ymm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %ymm12 +; 
AVX2-SLOW-NEXT: vmovaps 160(%rcx), %ymm11 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm9 @@ -5088,8 +5089,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5098,10 +5099,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 184(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm8 +; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4 @@ -5113,7 +5114,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5121,80 +5122,82 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 216(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 ; 
AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4],ymm11[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3,4],ymm7[5,6],ymm11[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 248(%r8), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm1[1,2],ymm13[3,4],ymm1[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4],ymm13[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3,4],ymm8[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 248(%r8), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm13[1,2,3],ymm8[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm13[1,2,3],ymm8[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = 
mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm13[1,2,3],ymm8[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3],ymm7[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload @@ -5207,17 +5210,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = 
ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -5234,17 +5236,18 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm4, 1024(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm9, 864(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 544(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 384(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 544(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1248(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 1216(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1088(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1056(%r9) @@ -5310,65 +5313,65 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride5_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1800, %rsp # imm = 0x708 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm8 -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm5 +; AVX2-FAST-NEXT: subq $1832, %rsp # imm = 0x728 +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <0,1,0,1,u,u,2,2> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm15 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm6 +; 
AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm14 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm8[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm12[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm8 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm6[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = 
xmm2[0,1],xmm7[2],xmm2[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm4[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] @@ -5382,7 +5385,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm2 @@ -5401,11 +5404,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] @@ -5420,7 +5423,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovaps 
192(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm2 @@ -5439,7 +5442,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm0 @@ -5453,155 +5456,154 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [0,1,3,2,3,2,3,2] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [0,1,3,2,3,2,3,2] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm3, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm15 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm10 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm1[1,2],ymm3[3,4],ymm1[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3,4],ymm0[5,6],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm3 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm3, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-FAST-NEXT: vmovups %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3,4],ymm5[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3,4],ymm0[5,6],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vinsertf128 $1, 64(%r8), %ymm3, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm6 ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3,4],ymm5[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3,4],ymm0[5,6],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 88(%r8), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 88(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm14, %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5614,8 +5616,8 @@ define void 
@store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 128(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] @@ -5626,13 +5628,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] @@ -5640,35 +5641,33 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 160(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovaps 160(%rcx), %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 160(%rcx), %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm10 @@ -5679,7 +5678,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] @@ -5688,15 +5687,15 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 184(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 192(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] @@ -5706,10 +5705,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 208(%r8), %ymm1 @@ -5718,7 +5717,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5728,11 +5727,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] @@ -5741,102 +5740,104 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4],ymm9[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vshufps {{.*#+}} ymm4 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2,3,4],ymm9[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,2,3,3,4,6,7,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2],ymm9[3,4],ymm4[5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 248(%r8), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm15[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = 
mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1],ymm9[2],ymm15[3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2,3],ymm4[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1,2,3],ymm6[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, (%rsp), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] +; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] @@ -5846,12 +5847,13 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 1184(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm5, 1024(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm4, 1024(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm10, 864(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm4, 544(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm6, 544(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm9, 384(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm12, 224(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5866,7 +5868,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 1056(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 960(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 928(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 896(%r9) @@ -5918,7 +5920,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $1800, %rsp # imm = 0x708 +; AVX2-FAST-NEXT: addq $1832, %rsp # imm = 0x728 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -5927,24 +5929,24 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: subq $1736, %rsp # imm = 0x6C8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm4 ; 
AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] @@ -5953,34 +5955,34 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0],ymm5[1,2,3],ymm11[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm15[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} 
ymm7 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3],ymm7[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm13[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -6000,7 +6002,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm0 @@ -6134,154 +6136,152 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; 
AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: 
vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps 
{{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm9 @@ -6292,8 +6292,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6302,10 +6302,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 184(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4 @@ -6317,7 +6317,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6325,80 +6325,82 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3,4],ymm7[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte 
Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm1[1,2],ymm13[3,4],ymm1[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4],ymm13[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3,4],ymm8[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm13[1,2,3],ymm8[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm13[1,2,3],ymm8[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm13[1,2,3],ymm8[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte 
Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3],ymm7[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload @@ -6411,17 +6413,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = 
ymm7[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -6438,17 +6439,18 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 1024(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 864(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 544(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 384(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 544(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1248(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1216(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1088(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1056(%r9) @@ -6514,429 +6516,427 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i32_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm25 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-NEXT: vmovdqa64 
64(%rdx), %zmm28 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm15 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512F-NEXT: # zmm28 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm11, %zmm29 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm21, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm8, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm11, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm15, %zmm17, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm8, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm17, %zmm28 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm21, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm13 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm4, %zmm1, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm21, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = 
[3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm12 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm19 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm5 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm27, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm20, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm1, %zmm0 ; AVX512F-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} ; AVX512F-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm29 {%k2} ; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm17 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm13, %zmm27 +; AVX512F-NEXT: 
vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm29 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm21, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm27, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm22 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm13, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm21, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm27, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm4 {%k3} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) +; AVX512F-NEXT: vpermt2d %zmm1, %zmm27, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm27, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm2, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm13, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm18, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm8 {%k1} +; AVX512F-NEXT: vpermt2d %zmm1, %zmm13, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm11 {%k2} +; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vpermt2d %zmm1, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%r9) +; 
AVX512F-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm30, 576(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm31, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 1088(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1152(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512F-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm28 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512BW-NEXT: 
vmovdqa64 128(%rcx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm29 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm11, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm28 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm21, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm21, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm12 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm27, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm20, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm1, %zmm0 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm29 {%k2} ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm1, 
%zmm13, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm29 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm21, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm27, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm22 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm13, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm27, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm27, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm27, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm18, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm8 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} +; 
AVX512BW-NEXT: vpermt2d %zmm1, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 320(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 576(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1088(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1152(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 6e09eba1de1bc9..91148e3f978e8e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -180,61 +180,61 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm4 -; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm5 -; SSE-NEXT: movaps (%r8), %xmm7 -; SSE-NEXT: movaps (%r9), %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3] -; SSE-NEXT: movaps %xmm7, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[1,1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,2] -; SSE-NEXT: movaps %xmm3, 16(%rax) -; SSE-NEXT: movaps 
%xmm2, 32(%rax) +; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm4 +; SSE-NEXT: movaps (%r9), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm5[1,1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[2,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[0,2] +; SSE-NEXT: movaps %xmm5, 16(%rax) +; SSE-NEXT: movaps %xmm6, 32(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps %xmm6, 64(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: movaps %xmm2, 80(%rax) +; SSE-NEXT: movaps %xmm7, 64(%rax) +; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm9 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm2[0,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm11[1,2],ymm7[5,6],ymm11[5,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] @@ -242,9 +242,9 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) @@ -335,76 +335,73 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride6_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rsi), %xmm10 -; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm10 +; SSE-NEXT: movaps 16(%rsi), %xmm11 +; SSE-NEXT: movaps (%rdx), %xmm9 ; SSE-NEXT: movaps 16(%rdx), %xmm2 -; SSE-NEXT: movaps (%rcx), %xmm6 -; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps (%rcx), %xmm7 +; SSE-NEXT: movaps 16(%rcx), %xmm12 ; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm7 -; SSE-NEXT: movaps 16(%r9), %xmm3 -; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movaps 16(%r8), %xmm13 +; SSE-NEXT: movaps (%r9), %xmm8 +; SSE-NEXT: movaps 16(%r9), %xmm4 +; SSE-NEXT: movaps %xmm12, %xmm14 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm4[1,1] ; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movaps %xmm13, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm4[3,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[2,0],xmm1[2,3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: movaps %xmm15, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm7[3,3] -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm14[0,2] -; SSE-NEXT: movaps %xmm4, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm14[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm14[0,2] +; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm8[3,3] +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm13[0,2] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm11[0,2] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[2,0] +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps %xmm14, 48(%rax) +; SSE-NEXT: movaps %xmm10, 32(%rax) +; SSE-NEXT: movaps %xmm13, 48(%rax) ; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm3, 112(%rax) -; SSE-NEXT: movaps %xmm13, 160(%rax) +; SSE-NEXT: movaps %xmm4, 112(%rax) +; SSE-NEXT: movaps %xmm6, 160(%rax) ; SSE-NEXT: movaps %xmm2, 176(%rax) -; SSE-NEXT: movaps %xmm4, (%rax) -; SSE-NEXT: movaps %xmm6, 16(%rax) -; SSE-NEXT: movaps %xmm8, 64(%rax) -; SSE-NEXT: movaps %xmm11, 80(%rax) -; SSE-NEXT: movaps %xmm10, 128(%rax) -; SSE-NEXT: movaps %xmm12, 144(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm9, 64(%rax) +; SSE-NEXT: movaps %xmm14, 80(%rax) +; SSE-NEXT: 
movaps %xmm15, 128(%rax) +; SSE-NEXT: movaps %xmm3, 144(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf8: @@ -494,13 +491,13 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,2,2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7] @@ -512,36 +509,36 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm12 ; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm13 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm12, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm9, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4],ymm13[5],ymm14[6],ymm13[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] @@ -559,7 +556,7 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm13, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper @@ -575,53 +572,53 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm11[0],zero,xmm11[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm13 ; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm14 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm9[4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm12 +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,0,7,0,6,0,7,0] ; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm10 @@ -641,7 +638,7 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -656,13 +653,13 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7] @@ -674,36 +671,36 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm9 +; 
AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4],ymm13[5],ymm14[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] @@ -721,7 +718,7 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -774,133 +771,133 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride6_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $72, %rsp -; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 
16(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movaps 16(%rdx), %xmm10 -; SSE-NEXT: movaps (%rcx), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm5 +; SSE-NEXT: movaps (%rsi), %xmm8 +; SSE-NEXT: movaps 16(%rsi), %xmm2 +; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps 16(%rdx), %xmm11 +; SSE-NEXT: movaps (%rcx), %xmm9 ; SSE-NEXT: movaps 16(%rcx), %xmm0 -; SSE-NEXT: movaps (%r8), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm10 ; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps (%r9), %xmm4 -; SSE-NEXT: movaps 16(%r9), %xmm13 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm5[2,3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm11[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[2,3] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] +; SSE-NEXT: movaps (%r9), %xmm12 +; SSE-NEXT: movaps 16(%r9), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm10[0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm6[2,3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm13[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm13, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: movaps %xmm10, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm3[2,3] +; 
SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm12[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm10[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm14[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[2,3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm12[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm1[3,3] ; SSE-NEXT: movaps 32(%rdx), %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movaps 32(%rcx), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm14[0,2] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movaps 32(%rsi), %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movaps 32(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movaps 32(%r8), %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] +; SSE-NEXT: 
movaps 32(%r8), %xmm1 ; SSE-NEXT: movaps 32(%r9), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm15[2,3] ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[0,2] ; SSE-NEXT: movaps 48(%rdx), %xmm3 -; SSE-NEXT: movaps 48(%rcx), %xmm10 +; SSE-NEXT: movaps 48(%rcx), %xmm9 ; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rsi), %xmm9 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rsi), %xmm10 +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps 48(%r8), %xmm1 ; SSE-NEXT: movaps 48(%r9), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = 
xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 368(%rax) -; SSE-NEXT: movaps %xmm9, 352(%rax) -; SSE-NEXT: movaps %xmm2, 336(%rax) +; SSE-NEXT: movaps %xmm10, 352(%rax) +; SSE-NEXT: movaps %xmm4, 336(%rax) ; SSE-NEXT: movaps %xmm5, 320(%rax) ; SSE-NEXT: movaps %xmm6, 304(%rax) -; SSE-NEXT: movaps %xmm4, 288(%rax) +; SSE-NEXT: movaps %xmm2, 288(%rax) ; SSE-NEXT: movaps %xmm13, 272(%rax) ; SSE-NEXT: movaps %xmm8, 256(%rax) ; SSE-NEXT: movaps %xmm12, 240(%rax) @@ -921,9 +918,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) @@ -936,60 +933,60 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $104, %rsp -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX1-ONLY-NEXT: subq $136, %rsp +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,2],xmm8[1,2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,2],xmm7[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 
4(%r8), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[4],ymm13[4],ymm5[5],ymm13[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[4],ymm13[4],ymm4[5],ymm13[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,2],xmm3[1,2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,2],xmm3[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 36(%r8), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm11[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 ; AVX1-ONLY-NEXT: 
vmovaps (%rcx), %ymm12 @@ -1002,248 +999,250 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,2],ymm9[1,2],ymm8[5,6],ymm9[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2],ymm10[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 
+; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm8[3,0],ymm9[7,4],ymm8[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = 
xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} 
ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $104, %rsp +; AVX1-ONLY-NEXT: addq $136, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride6_vf16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $200, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm13[0],zero,xmm13[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,2,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm5 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm8, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm4 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm13, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[6],ymm4[6],ymm7[7],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] -; 
AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm14[2],ymm11[3],ymm14[3],ymm11[6],ymm14[6],ymm11[7],ymm14[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6],ymm12[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] +; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm11[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 @@ -1253,20 +1252,20 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[4],ymm4[4],ymm7[5],ymm4[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 256(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) @@ -1287,78 +1286,78 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i32_stride6_vf16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $232, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; 
AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm5 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = 
xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm4 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm10 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm1 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = 
xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm4[2],ymm14[2],ymm4[3],ymm14[3],ymm4[6],ymm14[6],ymm4[7],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm3 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1367,34 +1366,34 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm3[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm15[3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm12[2],xmm8[2],xmm12[3],xmm8[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm15 -; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm9[1],ymm7[2,3,4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm9[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7] @@ -1407,9 +1406,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm15, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -1417,7 +1416,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] @@ -1426,7 +1425,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5] ; 
AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5] @@ -1464,143 +1463,143 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm13[0],zero,xmm13[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[6],ymm4[6],ymm7[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm14[2],ymm11[3],ymm14[3],ymm11[6],ymm14[6],ymm11[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm9 @@ -1610,20 +1609,20 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[4],ymm4[4],ymm7[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 
96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) @@ -1644,15 +1643,15 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-LABEL: store_i32_stride6_vf16: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm5 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512F-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512F-SLOW-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 @@ -1660,62 +1659,62 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm6, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm7, %zmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm7, %zmm8 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512F-SLOW-NEXT: # zmm9 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512F-SLOW-NEXT: movb $-110, %cl ; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm9, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm7, %zmm9 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512F-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm10, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm7, %zmm10 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm7, %zmm11 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm11, %zmm7 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 +; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm11, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) @@ -1728,101 +1727,101 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-LABEL: store_i32_stride6_vf16: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm5 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512F-FAST-NEXT: movb $-110, %cl ; AVX512F-FAST-NEXT: kmovw %ecx, %k2 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512F-FAST-NEXT: movb $36, %cl ; AVX512F-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm8, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = 
[6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512F-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm9, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm9 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm10, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm11, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-FAST-NEXT: # zmm3 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm11 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i32_stride6_vf16: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm5 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512BW-SLOW-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 @@ -1830,62 +1829,62 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm6, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm7, %zmm6 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm7, %zmm8 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-SLOW-NEXT: movb $-110, %cl ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm9, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm7, %zmm9 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm10, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm7, %zmm10 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm7, %zmm11 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm11, %zmm7 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = 
[13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm11, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm0, %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) @@ -1898,86 +1897,86 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FAST-LABEL: store_i32_stride6_vf16: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm5 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FAST-NEXT: movb $-110, %cl ; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = 
[4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FAST-NEXT: movb $36, %cl ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm8, %zmm6 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm9, %zmm6 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm9 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm10, %zmm6 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, 
%zmm2, %zmm6 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm11, %zmm6 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm6, %zmm11 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <16 x i32>, ptr %in.vecptr0, align 64 @@ -2002,33 +2001,33 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: movaps (%rdi), %xmm9 -; SSE-NEXT: movaps 16(%rdi), %xmm10 +; SSE-NEXT: movaps 16(%rdi), %xmm11 ; SSE-NEXT: movaps (%rsi), %xmm4 -; SSE-NEXT: movaps 16(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm11 -; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps 16(%rsi), %xmm2 +; SSE-NEXT: movaps (%rdx), %xmm10 +; SSE-NEXT: movaps 16(%rdx), %xmm13 ; SSE-NEXT: movaps (%rcx), %xmm5 -; SSE-NEXT: movaps 16(%rcx), %xmm1 +; SSE-NEXT: movaps 16(%rcx), %xmm0 ; SSE-NEXT: movaps (%r8), 
%xmm6 -; SSE-NEXT: movaps 16(%r8), %xmm2 +; SSE-NEXT: movaps 16(%r8), %xmm1 ; SSE-NEXT: movaps (%r9), %xmm7 ; SSE-NEXT: movaps 16(%r9), %xmm3 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm9, %xmm8 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; SSE-NEXT: movaps %xmm7, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm8[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm8[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; SSE-NEXT: movaps %xmm6, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] @@ -2037,39 +2036,39 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm6[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3] -; SSE-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm11[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] ; SSE-NEXT: movaps 32(%rdx), %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movaps 32(%rsi), %xmm1 @@ -2159,9 +2158,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] @@ -2199,73 +2198,73 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm9 +; SSE-NEXT: movaps 96(%rdx), %xmm6 ; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm11 -; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movaps 96(%r8), %xmm2 -; SSE-NEXT: movaps 96(%r9), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3] -; 
SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: movaps 96(%rsi), %xmm8 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; SSE-NEXT: movaps 96(%r8), %xmm3 +; SSE-NEXT: movaps 96(%r9), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm11[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[0,2] ; SSE-NEXT: movaps 112(%rdx), %xmm3 ; SSE-NEXT: movaps 112(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 112(%rsi), %xmm10 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; SSE-NEXT: movaps 112(%rdi), %xmm8 +; SSE-NEXT: movaps 112(%rsi), %xmm14 +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; SSE-NEXT: movaps 112(%r8), %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps 112(%r9), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 
= xmm8[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 752(%rax) -; SSE-NEXT: movaps %xmm10, 736(%rax) -; SSE-NEXT: movaps %xmm2, 720(%rax) -; SSE-NEXT: movaps %xmm5, 704(%rax) -; SSE-NEXT: movaps %xmm6, 688(%rax) -; SSE-NEXT: movaps %xmm4, 672(%rax) -; SSE-NEXT: movaps %xmm9, 656(%rax) -; SSE-NEXT: movaps %xmm8, 640(%rax) -; SSE-NEXT: movaps %xmm11, 624(%rax) -; SSE-NEXT: movaps %xmm14, 608(%rax) +; SSE-NEXT: movaps %xmm14, 736(%rax) +; SSE-NEXT: movaps %xmm8, 720(%rax) +; SSE-NEXT: movaps %xmm7, 704(%rax) +; SSE-NEXT: movaps %xmm4, 688(%rax) +; SSE-NEXT: movaps %xmm2, 672(%rax) +; SSE-NEXT: movaps %xmm6, 656(%rax) +; SSE-NEXT: movaps %xmm10, 640(%rax) +; SSE-NEXT: movaps %xmm9, 624(%rax) +; SSE-NEXT: movaps %xmm13, 608(%rax) ; SSE-NEXT: movaps %xmm15, 592(%rax) -; SSE-NEXT: movaps %xmm13, 576(%rax) +; SSE-NEXT: movaps %xmm11, 576(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 560(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2280,9 +2279,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 480(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 448(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) @@ -2344,15 +2343,15 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i32_stride6_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm10 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 @@ -2377,13 +2376,14 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[4],ymm8[4],ymm12[5],ymm8[5] +; AVX1-ONLY-NEXT: 
vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2393,7 +2393,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -2403,16 +2403,16 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2442,15 +2442,16 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 68(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm14 -; 
AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[4],ymm14[4],ymm6[5],ymm14[5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2460,16 +2461,15 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 80(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,2],xmm9[1,2] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,2],xmm13[1,2] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm11[2],xmm4[3],xmm11[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] @@ -2478,14 +2478,14 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[4],ymm3[4],ymm12[5],ymm3[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; 
AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm15[2,3],ymm1[4,5,6,7] @@ -2495,78 +2495,79 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm12[1,2],ymm0[5,6],ymm12[5,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2],ymm15[1,2],ymm10[5,6],ymm15[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,2],ymm13[1,2],ymm8[5,6],ymm13[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[6],ymm14[6],ymm6[7],ymm14[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[1,2],mem[1,2],ymm14[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,2],ymm9[1,2],ymm10[5,6],ymm9[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm6[1,2],mem[1,2],ymm6[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,0,0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vmovaps %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[6],ymm3[6],ymm12[7],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm8[1,2],ymm2[1,2],ymm8[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2],ymm2[1,2],ymm10[5,6],ymm2[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -2580,8 +2581,8 @@ define void 
@store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm15[3,0],mem[3,0],ymm15[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] @@ -2591,115 +2592,115 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 
= ymm6[0],ymm8[1],ymm6[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm5[3,0],ymm14[3,0],ymm5[7,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm10[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm10[3,0],ymm3[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm9[0,1],ymm0[2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm9[1],ymm0[2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm13[2],mem[2],xmm13[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0],ymm8[1],ymm2[2,3,4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 640(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) @@ -2707,7 +2708,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) @@ -2794,8 +2795,8 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm8 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm8[0],zero,xmm8[1],zero -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -2808,19 +2809,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm8 ; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm9 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] @@ -2836,21 +2837,22 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm10 ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm11 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] @@ -2859,11 +2861,11 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, 
%ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] @@ -2872,72 +2874,71 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd 64(%rcx), %xmm12 ; AVX2-SLOW-NEXT: vpbroadcastd 64(%rdx), %xmm13 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm15, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm14, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd %xmm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm12 ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd %xmm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} 
xmm15 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 116(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2957,52 +2958,53 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # 
xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] @@ -3012,30 +3014,30 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 80(%r9), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 80(%r9), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[2,3],ymm4[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] @@ -3045,50 +3047,49 @@ define void 
@store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm11 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm11 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 112(%r9), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm11 +; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 112(%r9), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[2,3],ymm6[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 736(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 672(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 736(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 672(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm5, 640(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 544(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 480(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 480(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 448(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 96(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rax) @@ -3120,337 +3121,339 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $872, %rsp # imm = 0x368 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: subq $856, %rsp # imm = 0x358 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), 
%xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 
= xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm12 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm12[0],zero,xmm12[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm9[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm6[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm14 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm14[0],zero,xmm14[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm9 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm10 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[6],ymm10[6],ymm1[7],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] 
+; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm13[3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm8 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm13 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero +; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm3 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm14 -; AVX2-FAST-NEXT: vpmovzxdq 
{{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm11 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm10 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm8 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm13, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm1 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm4 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4],ymm1[5],ymm9[6],ymm1[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm12 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd %xmm15, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] +; AVX2-FAST-NEXT: vpbroadcastd %xmm15, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm14 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq 
{{.*#+}} ymm5 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = 
ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm14[1],ymm10[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[4],ymm13[4],ymm14[5],ymm13[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm11[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: 
vmovdqa 64(%r8), %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[4],mem[4],ymm12[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[2,3],ymm4[2,3] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[2,3],ymm6[2,3] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[2,3],ymm3[2,3] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm2, 736(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 672(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 672(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 640(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 544(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 544(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 480(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm8, 448(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 352(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm13, 288(%rax) -; 
AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 256(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 256(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) @@ -3480,7 +3483,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $872, %rsp # imm = 0x368 +; AVX2-FAST-NEXT: addq $856, %rsp # imm = 0x358 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3547,8 +3550,8 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -3561,19 +3564,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm9 ; 
AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] @@ -3589,21 +3592,22 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] @@ -3612,11 +3616,11 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
32(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] @@ -3625,72 +3629,71 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm14, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm15 = 
ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,1,2,3] 
; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 116(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3710,52 +3713,53 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq 
{{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = 
ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] @@ -3765,30 +3769,30 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%r9), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%r9), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,3],ymm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] @@ -3798,50 +3802,49 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 112(%r9), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 112(%r9), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[2,3],ymm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 736(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
%ymm11, 672(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 736(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 672(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 544(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 480(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 448(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 704(%rax) @@ -3874,135 +3877,135 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512F-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm7, %zmm3 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512F-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 -; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 -; AVX512F-SLOW-NEXT: 
vpermi2d %zmm18, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm15 +; AVX512F-SLOW-NEXT: vpermi2d %zmm19, %zmm11, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm19, %zmm11, %zmm12 +; AVX512F-SLOW-NEXT: vpermi2d %zmm19, %zmm11, %zmm17 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm18 = zmm11[2],zmm19[2],zmm11[3],zmm19[3],zmm11[6],zmm19[6],zmm11[7],zmm19[7],zmm11[10],zmm19[10],zmm11[11],zmm19[11],zmm11[14],zmm19[14],zmm11[15],zmm19[15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 +; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm19 ; AVX512F-SLOW-NEXT: movb $36, %dl ; AVX512F-SLOW-NEXT: kmovw %edx, %k1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm19[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm19, %zmm11 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 -; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm11 +; AVX512F-SLOW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm19, %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm14, %zmm22, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm19, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm20, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm14, %zmm21, %zmm3 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-SLOW-NEXT: # zmm22 = 
mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm22, %zmm23 ; AVX512F-SLOW-NEXT: movb $-110, %cl ; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm23, %zmm9 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm14, %zmm24, %zmm9 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512F-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm25, %zmm26 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 +; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm26, %zmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 -; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12 -; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14 -; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm14, %zmm27, %zmm15 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm7 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm12 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm23, %zmm12 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm24, %zmm12 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm27, %zmm17 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm19, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = 
<0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm22, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 -; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm14, %zmm23, %zmm20 +; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm19 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm21[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm19 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm19 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm21, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm4 = zmm4[2],zmm10[2],zmm4[3],zmm10[3],zmm4[6],zmm10[6],zmm4[7],zmm10[7],zmm4[10],zmm10[10],zmm4[11],zmm10[11],zmm4[14],zmm10[14],zmm4[15],zmm10[15] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm4[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-SLOW-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm18[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm10, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 576(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) ; 
AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -4010,137 +4013,137 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; 
AVX512F-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm22, %zmm23 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm24, %zmm25 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512F-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm17 -; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm21 -; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm10, %zmm8 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm18, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm18, %zmm26 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512F-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm12, %zmm7, %zmm4 +; AVX512F-FAST-NEXT: vpermi2d %zmm12, %zmm7, %zmm22 +; AVX512F-FAST-NEXT: vpermi2d %zmm12, %zmm7, %zmm24 +; AVX512F-FAST-NEXT: vpermi2d %zmm12, %zmm7, %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm18, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 -; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 -; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 -; 
AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 -; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm17, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm20, %zmm0 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm20, %zmm12 +; AVX512F-FAST-NEXT: vpermi2d %zmm15, %zmm1, %zmm14 +; AVX512F-FAST-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512F-FAST-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512F-FAST-NEXT: vpermi2d %zmm15, %zmm1, %zmm17 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm20, %zmm1 ; AVX512F-FAST-NEXT: movb $-110, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm15 ; AVX512F-FAST-NEXT: movb $36, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm19, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm8 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm20, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm18, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm21, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm19, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm19, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm20, %zmm4 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, 
%zmm7, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm20, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm18, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm21, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm17, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm13, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm13, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm19, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm15, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm18, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm17, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm17, %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm13, %zmm4 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm15, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm18, %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm17, %zmm1 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 
%zmm1, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 640(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-FAST-NEXT: vzeroupper @@ -4149,135 +4152,135 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm7, %zmm3 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 -; AVX512BW-SLOW-NEXT: 
vpermi2d %zmm18, %zmm17, %zmm14 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm15 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm19, %zmm11, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm19, %zmm11, %zmm12 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm19, %zmm11, %zmm17 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm18 = zmm11[2],zmm19[2],zmm11[3],zmm19[3],zmm11[6],zmm19[6],zmm11[7],zmm19[7],zmm11[10],zmm19[10],zmm11[11],zmm19[11],zmm11[14],zmm19[14],zmm11[15],zmm19[15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 +; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm19 ; AVX512BW-SLOW-NEXT: movb $36, %dl ; AVX512BW-SLOW-NEXT: kmovd %edx, %k1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm19[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm19, %zmm11 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm11 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm19, %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm14, %zmm22, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm19, %zmm20 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm20, %zmm3 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm14, %zmm21, %zmm3 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = 
[9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm22, %zmm23 ; AVX512BW-SLOW-NEXT: movb $-110, %cl ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm23, %zmm9 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm14, %zmm24, %zmm9 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm25, %zmm26 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm26, %zmm15 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm14, %zmm27, %zmm15 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm7 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm12 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm23, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm24, %zmm12 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm17 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm27, %zmm17 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm19, %zmm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; 
AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm22, %zmm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm14, %zmm23, %zmm20 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm19 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm21[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm19 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm19 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm21, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm4 = zmm4[2],zmm10[2],zmm4[3],zmm10[3],zmm4[6],zmm10[6],zmm4[7],zmm10[7],zmm4[10],zmm10[10],zmm4[11],zmm10[11],zmm4[14],zmm10[14],zmm4[15],zmm10[15] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm4[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm4, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm21, %zmm1 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm18[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm4, %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm10, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 576(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 
%zmm18, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -4285,137 +4288,137 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512BW-FAST-NEXT: # zmm14 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm14, %zmm21 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm22, %zmm23 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm6, %zmm5 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm24, %zmm25 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512BW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm17 -; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm21 -; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm10, %zmm8 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm11, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm18, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm18, %zmm26 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512BW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm12, %zmm7, %zmm4 +; AVX512BW-FAST-NEXT: vpermi2d %zmm12, %zmm7, %zmm22 +; AVX512BW-FAST-NEXT: vpermi2d %zmm12, %zmm7, %zmm24 +; AVX512BW-FAST-NEXT: vpermi2d %zmm12, %zmm7, %zmm11 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm18, %zmm7 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512BW-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 -; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 -; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 -; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 -; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm17, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm20, %zmm0 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm20, %zmm12 +; AVX512BW-FAST-NEXT: vpermi2d %zmm15, %zmm1, %zmm14 +; AVX512BW-FAST-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512BW-FAST-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512BW-FAST-NEXT: vpermi2d %zmm15, %zmm1, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm20, %zmm1 ; AVX512BW-FAST-NEXT: movb $-110, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm15 ; AVX512BW-FAST-NEXT: movb $36, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm19, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm8 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm20, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm18, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm12 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm21, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm13, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm19, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm19, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm20, %zmm4 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 -; AVX512BW-FAST-NEXT: 
vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm20, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm18, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm21, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm17, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm13, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm13, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm7, %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm19, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm15, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm18, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm17, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm11, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, 
%zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm17, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm13, %zmm4 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm7, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm15, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm18, %zmm11 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm17, %zmm1 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 640(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper @@ -4442,78 +4445,78 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $1224, %rsp # imm = 0x4C8 ; SSE-NEXT: movaps (%rdi), %xmm9 -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm11 -; SSE-NEXT: movaps 16(%rdx), %xmm12 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps 16(%rcx), %xmm1 +; SSE-NEXT: movaps 16(%rdi), %xmm11 +; SSE-NEXT: movaps (%rsi), %xmm4 +; SSE-NEXT: movaps 16(%rsi), %xmm2 +; SSE-NEXT: movaps (%rdx), %xmm10 +; SSE-NEXT: movaps 16(%rdx), %xmm13 +; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps 16(%rcx), %xmm0 ; SSE-NEXT: movaps (%r8), %xmm6 -; SSE-NEXT: movaps 16(%r8), %xmm3 +; SSE-NEXT: movaps 16(%r8), %xmm1 ; SSE-NEXT: movaps (%r9), %xmm7 -; SSE-NEXT: movaps 16(%r9), %xmm5 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] +; SSE-NEXT: movaps 16(%r9), %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; SSE-NEXT: movaps %xmm7, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm8[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm9[2,3] -; 
SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm8[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm9[2,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm6[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[0,2] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm5[3,3] -; SSE-NEXT: movaps 32(%rdx), %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm11[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] +; SSE-NEXT: movaps 32(%rdx), %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 32(%r8), %xmm2 ; SSE-NEXT: movaps 32(%r9), %xmm3 @@ -4527,19 +4530,19 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm6 ; SSE-NEXT: movaps 48(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm6, %xmm5 @@ -4863,9 +4866,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] @@ -4903,73 +4906,73 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] ; SSE-NEXT: movaps 
%xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm9 +; SSE-NEXT: movaps 224(%rdx), %xmm6 ; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movaps 224(%r8), %xmm2 -; SSE-NEXT: movaps 224(%r9), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rsi), %xmm8 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; SSE-NEXT: movaps 224(%r8), %xmm3 +; SSE-NEXT: movaps 224(%r9), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm11[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[0,2] ; SSE-NEXT: movaps 240(%rdx), %xmm3 ; SSE-NEXT: movaps 240(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: movaps 240(%rdi), %xmm2 -; SSE-NEXT: movaps 240(%rsi), %xmm10 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; SSE-NEXT: movaps 240(%rdi), %xmm8 +; SSE-NEXT: movaps 240(%rsi), %xmm14 +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; SSE-NEXT: movaps 240(%r8), %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} 
xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps 240(%r9), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 1520(%rax) -; SSE-NEXT: movaps %xmm10, 1504(%rax) -; SSE-NEXT: movaps %xmm2, 1488(%rax) -; SSE-NEXT: movaps %xmm5, 1472(%rax) -; SSE-NEXT: movaps %xmm6, 1456(%rax) -; SSE-NEXT: movaps %xmm4, 1440(%rax) -; SSE-NEXT: movaps %xmm9, 1424(%rax) -; SSE-NEXT: movaps %xmm8, 1408(%rax) -; SSE-NEXT: movaps %xmm11, 1392(%rax) -; SSE-NEXT: movaps %xmm14, 1376(%rax) +; SSE-NEXT: movaps %xmm14, 1504(%rax) +; SSE-NEXT: movaps %xmm8, 1488(%rax) +; SSE-NEXT: movaps %xmm7, 1472(%rax) +; SSE-NEXT: movaps %xmm4, 1456(%rax) +; SSE-NEXT: movaps %xmm2, 1440(%rax) +; SSE-NEXT: movaps %xmm6, 1424(%rax) +; SSE-NEXT: movaps %xmm10, 1408(%rax) +; SSE-NEXT: movaps %xmm9, 1392(%rax) +; SSE-NEXT: movaps %xmm13, 1376(%rax) ; SSE-NEXT: movaps %xmm15, 1360(%rax) -; SSE-NEXT: movaps %xmm13, 1344(%rax) +; SSE-NEXT: movaps %xmm11, 1344(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1328(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4984,9 +4987,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 1248(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1232(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1216(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1216(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1200(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1184(%rax) @@ -5143,16 +5146,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2504, %rsp # imm = 0x9C8 +; AVX1-ONLY-NEXT: subq $2408, %rsp # imm = 0x968 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 @@ -5179,13 +5181,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm15 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5211,11 +5215,12 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5325,16 +5330,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 132(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5351,9 +5357,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5362,17 +5368,16 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 164(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5389,9 +5394,9 @@ 
define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5400,17 +5405,16 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 196(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5429,9 +5433,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5440,168 +5444,165 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 
224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 240(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,2],ymm15[1,2],ymm10[5,6],ymm15[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm14[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,2],ymm12[1,2],ymm15[5,6],ymm12[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm3[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,2],ymm13[1,2],ymm12[5,6],ymm13[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm13[1,2],mem[1,2],ymm13[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, 
%ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,2],ymm15[1,2],ymm11[5,6],ymm15[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 160(%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm8[1,2],ymm6[5,6],ymm8[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm15[0,0,0,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 160(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] +; 
AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,2],ymm2[1,2],ymm4[5,6],ymm2[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 212(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[6],ymm0[6],ymm9[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,2],ymm1[1,2],ymm5[5,6],ymm1[5,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 192(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 212(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = 
ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm6[1,2],ymm1[5,6],ymm6[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 244(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 244(%r9), %ymm1 @@ -5613,8 +5614,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm1 @@ -5630,14 +5631,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[3,0],ymm0[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] @@ -5677,8 +5679,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] @@ -5693,178 +5694,179 
@@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm9, 
%ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm15[3,0],mem[3,0],ymm15[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm13[2],mem[2],xmm13[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = 
xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm10 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm12 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $19, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[2,3],ymm12[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm15[2],mem[2],xmm15[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm12 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 224(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm14[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm6, %ymm14 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = 
ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload @@ -5876,19 +5878,18 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm5, 1504(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 1344(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 1408(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 1344(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 1312(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 1216(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 1120(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 1024(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 928(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 832(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 1024(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 928(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 832(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 640(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) @@ -5958,7 +5959,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX1-ONLY-NEXT: addq $2408, %rsp # imm = 0x968 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -6782,266 +6783,263 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2376, %rsp # imm = 0x948 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm6 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] 
-; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm8 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm7 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm5 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm15[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm9 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm6 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovdqa 192(%r8), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovdqa 224(%r8), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = 
ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm14 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm15 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2],ymm0[3],ymm14[4],ymm0[5],ymm14[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm5[2],ymm14[2],ymm5[3],ymm14[3],ymm5[6],ymm14[6],ymm5[7],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm14 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm13, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), 
%ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm5 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero @@ -7052,26 +7050,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq %xmm7, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm1 ; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero @@ -7082,27 +7080,27 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 128(%rcx), %xmm0 ; AVX2-FAST-NEXT: vpbroadcastd 128(%rdx), %xmm4 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm10[0],mem[0],xmm10[1],mem[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm4 +; AVX2-FAST-NEXT: vpbroadcastq %xmm9, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 128(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero @@ -7110,30 +7108,31 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 148(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 160(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 160(%r9), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 
160(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero @@ -7141,33 +7140,31 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 180(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpbroadcastd %xmm11, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 192(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 192(%r9), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 
192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero @@ -7190,16 +7187,16 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %ymm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero @@ -7216,137 +7213,134 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; 
AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[4],ymm15[4],ymm3[5],ymm15[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm15[2],ymm3[3],ymm15[3],ymm3[6],ymm15[6],ymm3[7],ymm15[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 
16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[1],ymm11[1],ymm3[4],ymm11[4],ymm3[5],ymm11[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7356,134 +7350,136 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve 
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 128(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%r9), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r9), %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[4],ymm9[4],ymm3[5],ymm9[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[2,3],ymm9[2,3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded 
Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3],ymm15[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm15[0],ymm8[0],ymm15[1],ymm8[1],ymm15[4],ymm8[4],ymm15[5],ymm8[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm12[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm15[2],ymm8[2],ymm15[3],ymm8[3],ymm15[6],ymm8[6],ymm15[7],ymm8[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[2,3],ymm8[2,3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm7, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm0[1],ymm8[2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm12, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqa 224(%r9), %ymm13 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[2,3],ymm8[2,3] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3],ymm14[2,3] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm5, 1504(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 1440(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 1504(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 1440(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 1408(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 1312(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 1248(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 1216(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 1120(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 1056(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 1024(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 928(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 1248(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 1216(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 1120(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 1056(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 1024(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 928(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 864(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7560,7 +7556,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $2376, %rsp # imm = 0x948 +; AVX2-FAST-NEXT: addq $2312, %rsp # imm = 0x908 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -8384,1138 +8380,1148 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i32_stride6_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 
%zmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm10 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm6, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm27, %zmm13 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm20, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; 
AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm27, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm20, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm27, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm20, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 -; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm12, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm7, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm12, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vpermi2d %zmm10, %zmm5, %zmm27 +; AVX512F-SLOW-NEXT: vpermi2d %zmm10, %zmm5, %zmm22 +; AVX512F-SLOW-NEXT: vpermi2d %zmm10, %zmm5, %zmm20 +; AVX512F-SLOW-NEXT: vpermi2d %zmm10, %zmm5, %zmm7 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm12, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] -; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 +; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm2, %ymm11 ; AVX512F-SLOW-NEXT: movb $36, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm11[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512F-SLOW-NEXT: vpermt2d %zmm26, %zmm6, %zmm10 +; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm19, %zmm6, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm1[0,1,0,1,2,3,6,7] ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdx), %ymm0 ; AVX512F-SLOW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm29 +; AVX512F-SLOW-NEXT: vpermi2d %zmm29, %zmm28, %zmm6 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm12, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm12, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm12, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512F-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm12 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm12 ; 
AVX512F-SLOW-NEXT: movb $-110, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm21[2],zmm17[2],zmm21[3],zmm17[3],zmm21[6],zmm17[6],zmm21[7],zmm17[7],zmm21[10],zmm17[10],zmm21[11],zmm17[11],zmm21[14],zmm17[14],zmm21[15],zmm17[15] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm23, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm26, %zmm15, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512F-SLOW-NEXT: vpermt2d %zmm26, %zmm19, %zmm21 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm10 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm24[2],zmm26[2],zmm24[3],zmm26[3],zmm24[6],zmm26[6],zmm24[7],zmm26[7],zmm24[10],zmm26[10],zmm24[11],zmm26[11],zmm24[14],zmm26[14],zmm24[15],zmm26[15] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermt2d %zmm26, %zmm23, %zmm25 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 -; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 -; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
<0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm15, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm19, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm24 = 
zmm24[2],zmm0[2],zmm24[3],zmm0[3],zmm24[6],zmm0[6],zmm24[7],zmm0[7],zmm24[10],zmm0[10],zmm24[11],zmm0[11],zmm24[14],zmm0[14],zmm24[15],zmm0[15] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm23, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-SLOW-NEXT: vpermi2d %zmm29, %zmm28, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm12 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm27, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm27, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm31, %zmm27, %zmm15 +; AVX512F-SLOW-NEXT: vpermi2d %zmm29, %zmm28, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm21 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm31, %zmm22, %zmm19 +; AVX512F-SLOW-NEXT: vpermi2d %zmm29, %zmm28, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm20, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm20, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm20, %zmm30 +; AVX512F-SLOW-NEXT: vpermt2d %zmm31, %zmm20, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm22 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm20, %zmm22 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm22, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm20, %zmm24 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm20, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm31, %zmm20, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm20, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k1} = 
zmm0[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm20, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm20, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm1 = zmm28[2],zmm29[2],zmm28[3],zmm29[3],zmm28[6],zmm29[6],zmm28[7],zmm29[7],zmm28[10],zmm29[10],zmm28[11],zmm29[11],zmm28[14],zmm29[14],zmm28[15],zmm29[15] +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm2 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k1} = zmm1[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d %zmm31, %zmm20, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm27 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm10 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm12 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm21 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm30 +; AVX512F-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm4 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm24 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm16 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm8 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm9 +; AVX512F-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm5 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 1408(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 1152(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 1024(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 
%zmm20, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 1280(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, 1216(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 1472(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 1408(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 1344(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 1024(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 960(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 1280(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 1216(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 832(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i32_stride6_vf64: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm29 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm19 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %zmm29 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 ; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm10 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512F-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512F-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm5, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm6, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm9, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm5, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm6, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm7, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm9, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm11, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm17, %zmm24, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm12, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d 
%zmm21, %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vpermi2d %zmm21, %zmm2, %zmm6 +; AVX512F-FAST-NEXT: vpermi2d %zmm21, %zmm2, %zmm7 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 +; AVX512F-FAST-NEXT: vpermi2d %zmm21, %zmm2, %zmm9 +; AVX512F-FAST-NEXT: vpermi2d %zmm21, %zmm2, %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm24, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdx), %zmm1 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; 
AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm21, %zmm17 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm7, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm31, %zmm8 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm27, %zmm12, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512F-FAST-NEXT: vpermt2d %zmm27, %zmm21, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512F-FAST-NEXT: vpermt2d %zmm27, %zmm7, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm27, %zmm31, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm27 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm30, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm30, %zmm12, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm30, %zmm21, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm30, %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm30, %zmm31, %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdx), %zmm30 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm21 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm31 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 ; AVX512F-FAST-NEXT: movb $-110, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-FAST-NEXT: movb $36, %al ; 
AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm3 +; AVX512F-FAST-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm22 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm27 
{%k1} ; AVX512F-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, 
%zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm7, %zmm17 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm8, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm7, %zmm24 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm8, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm7, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm8, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm27 +; AVX512F-FAST-NEXT: 
vmovdqa64 192(%r9), %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm7, %zmm21 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm30 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 1472(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 1344(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 1152(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 1024(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 960(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 896(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 1472(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 1280(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 1216(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 1152(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 1024(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 960(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, (%rax) ; AVX512F-FAST-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i32_stride6_vf64: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm28 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm22 -; AVX512BW-SLOW-NEXT: vmovdqa64 
64(%rcx), %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm27 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 +; AVX512BW-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm10 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm6, %zmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm27, %zmm13 +; AVX512BW-SLOW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm20, %zmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm27, %zmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm20, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm27, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm20, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 -; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm0 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm12, %zmm8 +; 
AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm7, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm12, %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm0 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm10, %zmm5, %zmm27 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm10, %zmm5, %zmm22 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm10, %zmm5, %zmm20 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm10, %zmm5, %zmm7 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm12, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] -; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 +; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm2, %ymm11 ; AVX512BW-SLOW-NEXT: movb $36, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 -; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 -; AVX512BW-SLOW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm11[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm26, %zmm6, %zmm10 +; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm0 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm19, %zmm6, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm1 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm1[0,1,0,1,2,3,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm0 ; AVX512BW-SLOW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm29 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm29, %zmm28, %zmm6 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm12, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm12, %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm12, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm12 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm12 ; AVX512BW-SLOW-NEXT: movb $-110, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm21[2],zmm17[2],zmm21[3],zmm17[3],zmm21[6],zmm17[6],zmm21[7],zmm17[7],zmm21[10],zmm17[10],zmm21[11],zmm17[11],zmm21[14],zmm17[14],zmm21[15],zmm17[15] ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm23, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm26, %zmm15, %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm26, %zmm19, %zmm21 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm24[2],zmm26[2],zmm24[3],zmm26[3],zmm24[6],zmm26[6],zmm24[7],zmm26[7],zmm24[10],zmm26[10],zmm24[11],zmm26[11],zmm24[14],zmm26[14],zmm24[15],zmm26[15] +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpermt2d %zmm26, %zmm23, %zmm25 ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm10 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} -; AVX512BW-SLOW-NEXT: 
vmovdqa64 %zmm30, %zmm25 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = 
ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm4 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm15, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm26 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm19, %zmm26 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm24 = zmm24[2],zmm0[2],zmm24[3],zmm0[3],zmm24[6],zmm0[6],zmm24[7],zmm0[7],zmm24[10],zmm0[10],zmm24[11],zmm0[11],zmm24[14],zmm0[14],zmm24[15],zmm0[15] +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm23, %zmm30 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-SLOW-NEXT: vpermi2d %zmm29, %zmm28, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm27, %zmm17 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm27, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm31, %zmm27, %zmm15 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm29, %zmm28, %zmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm21 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm26 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm31, %zmm22, %zmm19 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm29, %zmm28, %zmm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm20, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm20, %zmm25 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm20, %zmm30 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm31, %zmm20, %zmm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512BW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm22 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm20 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, 
%zmm20, %zmm22 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm22, (%rsp) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm20, %zmm24 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm20, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm31, %zmm20, %zmm7 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm20, %zmm16 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm20, %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm20, %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm1 = zmm28[2],zmm29[2],zmm28[3],zmm29[3],zmm28[6],zmm29[6],zmm28[7],zmm29[7],zmm28[10],zmm29[10],zmm28[11],zmm29[11],zmm28[14],zmm29[14],zmm28[15],zmm29[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm2 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k1} = zmm1[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm31, %zmm20, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm27 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm10 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm17 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm21 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm26 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm25 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm30 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm4 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm24 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm0, %zmm22, %zmm16 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm22, %zmm8 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm22, %zmm9 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm20, %zmm22, %zmm5 ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 1408(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm31, 1152(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 1024(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 768(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, 1280(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, 1216(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 1472(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 1408(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 1344(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, 1024(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, 960(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, 576(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 1280(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 1216(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, 512(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512BW-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 832(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, 512(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512BW-SLOW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i32_stride6_vf64: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm29 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm19 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm29 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm23 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512BW-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm5, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d 
%zmm0, %zmm11, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512BW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm11, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512BW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm5, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm6, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm9, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm5, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm6, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm7, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 -; AVX512BW-FAST-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm9, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm11, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm17, %zmm24, %zmm19 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm12, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm21, %zmm2, %zmm5 +; AVX512BW-FAST-NEXT: vpermi2d %zmm21, %zmm2, %zmm6 +; AVX512BW-FAST-NEXT: vpermi2d %zmm21, %zmm2, %zmm7 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 +; AVX512BW-FAST-NEXT: vpermi2d %zmm21, %zmm2, %zmm9 +; AVX512BW-FAST-NEXT: vpermi2d %zmm21, %zmm2, %zmm11 +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm24, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm6 -; 
AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm1 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm21, %zmm17 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm7, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm31, %zmm8 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm27, %zmm12, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512BW-FAST-NEXT: vpermt2d %zmm27, %zmm21, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512BW-FAST-NEXT: vpermt2d %zmm27, %zmm7, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm27, %zmm31, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm27 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm30, %zmm3, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm30, %zmm12, %zmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-FAST-NEXT: vpermt2d %zmm30, %zmm21, %zmm28 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm30, %zmm7, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm30, %zmm31, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm27 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm30 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 -; 
AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm21 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm30, %zmm31 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 ; AVX512BW-FAST-NEXT: movb $-110, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-FAST-NEXT: movb $36, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d 
%zmm0, %zmm1, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm22 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm27 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm28 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm27 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 -; AVX512BW-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm7, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm8, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, 
%zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm7, %zmm24 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm8, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm13 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm7, %zmm28 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm8, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm27 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm7, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm30 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 1472(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 1344(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 1152(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, 1024(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, 960(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 896(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 768(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 1472(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 1280(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 1216(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 1152(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 1024(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 960(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 704(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 512(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 448(%rax) +; 
AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, (%rax) ; AVX512BW-FAST-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index 33c9d9e182c34c..efc143aea6c5ef 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -262,48 +262,48 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: movaps (%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm6 ; SSE-NEXT: movaps (%rcx), %xmm1 -; SSE-NEXT: movaps (%r8), %xmm4 +; SSE-NEXT: movaps (%r8), %xmm5 ; SSE-NEXT: movaps (%r9), %xmm2 -; SSE-NEXT: movaps (%r10), %xmm8 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm2[3,3] -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm1[1,1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm9[2,0] -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,1] +; SSE-NEXT: movaps (%r10), %xmm7 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm10[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} 
xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm2[3,3] +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm1[1,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm8[2,0] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm8[0,1] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm10[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm4, 16(%rax) -; SSE-NEXT: movaps %xmm9, 32(%rax) +; SSE-NEXT: movaps %xmm5, 16(%rax) +; SSE-NEXT: movaps %xmm8, 32(%rax) ; SSE-NEXT: movaps %xmm2, 48(%rax) ; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm7, 64(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm9, 64(%rax) +; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: retq ; @@ -311,48 +311,48 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm9[1,0],ymm8[1,0],ymm9[5,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm8[2,1],ymm10[6,4],ymm8[6,5] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,1],ymm6[2,0],ymm5[5,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,1],xmm4[1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm11[1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3],ymm7[3,3],ymm5[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[1,1],xmm3[1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm10[3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,3],ymm7[3,3],ymm0[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[6],ymm11[6],ymm2[7],ymm11[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm11[2,0],ymm9[6,5],ymm11[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] ; AVX1-ONLY-NEXT: vbroadcastss (%r10), %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,0],xmm3[0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] -; AVX1-ONLY-NEXT: vbroadcastss 12(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,0],xmm2[0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] +; AVX1-ONLY-NEXT: vbroadcastss 12(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -360,52 +360,52 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm5 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm4 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm2 -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm3 +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm5 ; AVX2-SLOW-NEXT: vmovaps (%r10), %xmm1 -; AVX2-SLOW-NEXT: 
vinsertf128 $1, (%rsi), %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vinsertf128 $1, (%rsi), %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm7 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3] ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[6],ymm5[6],ymm9[7],ymm5[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1],xmm0[1],zero ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6] ; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0],ymm4[1,2],ymm10[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4] ; AVX2-SLOW-NEXT: # xmm9 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm6 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1] ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vbroadcastss (%r10), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm5[3,3] ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vmovaps %xmm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -471,52 +471,52 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r10), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%rsi), %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%rsi), %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3] ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[6],ymm5[6],ymm9[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1],xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6] ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0],ymm4[1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4] ; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1] ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm6, %ymm6 ; 
AVX2-FAST-PERLANE-NEXT: vbroadcastss (%r10), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm5[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -565,144 +565,144 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: pushq %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rsi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm8 ; SSE-NEXT: movdqa 16(%rdx), %xmm6 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 +; SSE-NEXT: movdqa 16(%rcx), %xmm13 ; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movaps 16(%r9), %xmm0 -; SSE-NEXT: movdqa (%rax), %xmm10 -; SSE-NEXT: movaps 16(%rax), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] -; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movaps 16(%r9), %xmm2 +; SSE-NEXT: movdqa (%rax), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps %xmm2, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm12[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: 
movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm5[0],xmm13[1,2,3] -; SSE-NEXT: movaps (%rcx), %xmm0 -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm6[0] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm4[0],xmm9[1,2,3] +; SSE-NEXT: movaps (%rcx), %xmm10 +; SSE-NEXT: movaps (%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,0] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm6[0] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm14[3,3] -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm7[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm9[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[2,0] +; 
SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,1],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[2,0] ; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: movaps %xmm2, %xmm14 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[1,3] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[1,3] +; SSE-NEXT: movaps %xmm0, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm2[0,2] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movdqa %xmm12, %xmm8 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm15[0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm11[0] -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0,1],mem[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm11[0] +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm5[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm2[0],xmm12[1,2,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm10[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, 112(%rax) -; SSE-NEXT: movdqa %xmm5, 176(%rax) +; SSE-NEXT: movaps %xmm7, 112(%rax) +; SSE-NEXT: movdqa %xmm4, 176(%rax) ; SSE-NEXT: movdqa %xmm8, (%rax) -; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm15, 16(%rax) ; SSE-NEXT: movaps %xmm14, 64(%rax) ; SSE-NEXT: movaps %xmm6, 128(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) -; SSE-NEXT: movaps %xmm13, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm4, 96(%rax) +; SSE-NEXT: movaps %xmm3, 192(%rax) +; SSE-NEXT: movaps %xmm9, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rax) +; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm1, 80(%rax) +; SSE-NEXT: movaps %xmm12, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf8: ; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: subq $56, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1],ymm3[1,1],ymm2[5,5],ymm3[5,5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] +; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6],ymm5[7] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] @@ -710,84 +710,89 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm5[2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm5 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm4 = xmm6[1,1],xmm5[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[1,1],xmm9[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm14[1],xmm15[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[2],ymm13[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1],xmm10[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm15 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm15[1],xmm14[1],zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0],ymm13[1,2],ymm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2],ymm9[3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm9[0,2],ymm8[5,5],ymm9[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 
= ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm4[0,2],ymm8[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1],ymm13[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[3,3],ymm0[3,3],ymm3[7,7],ymm0[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[3,3],ymm1[3,3],ymm2[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%rax), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[3,3],xmm11[3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1],ymm1[0,2],ymm3[7,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,3],xmm6[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1,2,3,4],ymm7[5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%rax), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[3,3],xmm11[3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm2[0,2],ymm3[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm6[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: addq $56, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -799,15 +804,15 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm6 -; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm7 +; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm8 ; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm1 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm12 ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero @@ -817,63 +822,63 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm8[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm14[3,3],xmm15[3,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm14[3,3],xmm15[3,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5,6],ymm11[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm10[2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1],ymm10[2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm7 ; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm10 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm12 ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] -; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm10[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm8[3,3],ymm6[7,7],ymm8[7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2,3,4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] @@ -887,7 +892,7 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm12, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1011,15 +1016,15 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = 
xmm5[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero @@ -1029,63 +1034,63 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm8[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm14[3,3],xmm15[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm14[3,3],xmm15[3,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5,6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm10[2,3,4],ymm8[5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1],ymm10[2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm10[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm8[3,3],ymm6[7,7],ymm8[7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = 
ymm8[0],ymm6[1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] @@ -1099,7 +1104,7 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1115,40 +1120,40 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512F-NEXT: vmovdqa (%r8), %ymm2 ; AVX512F-NEXT: vmovdqa (%r10), %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 -; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 +; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [31,7,15,23,31,7,15,23] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,23,31,7,6,23,31,7] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm0 ; AVX512F-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; 
AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] ; AVX512F-NEXT: vmovdqa %ymm0, 192(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1161,40 +1166,40 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vmovdqa (%r10), %ymm3 -; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 +; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [31,7,15,23,31,7,15,23] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,23,31,7,6,23,31,7] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1221,66 +1226,68 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $520, %rsp # imm = 0x208 +; SSE-NEXT: subq $472, %rsp # imm = 0x1D8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: 
movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm7 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm10 +; SSE-NEXT: movdqa 16(%rdx), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm8 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm15 -; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps 16(%rcx), %xmm14 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm13 -; SSE-NEXT: movdqa 16(%r9), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps (%r8), %xmm13 +; SSE-NEXT: movaps 16(%r8), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa (%r9), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm12 +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm9 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 16(%rax), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 32(%rcx), %xmm2 @@ -1297,263 +1304,257 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm3 +; SSE-NEXT: movdqa 48(%rdx), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movaps 48(%rcx), %xmm6 ; SSE-NEXT: movaps 48(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] 
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rax), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 48(%rdi), %xmm7 +; SSE-NEXT: movaps 48(%rax), %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%r9), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm12[3,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: 
shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[1,3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm5, %xmm10 -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[1,3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: shufps $197, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} 
xmm2 = xmm2[0,3],xmm7[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm7[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm7[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm7[0],xmm12[1,2,3] -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm12[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm11[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm7[0],xmm9[1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] +; SSE-NEXT: movaps %xmm6, %xmm11 ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm7[0],xmm8[1,2,3] -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: 
movaps %xmm1, 416(%rax) -; SSE-NEXT: movaps %xmm3, 400(%rax) -; SSE-NEXT: movaps %xmm4, 384(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm2[0],xmm12[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rax) +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] +; SSE-NEXT: shufps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm3[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm2[0],xmm15[1,2,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps %xmm7, 416(%rax) +; SSE-NEXT: movaps %xmm4, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 384(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 336(%rax) ; SSE-NEXT: movdqa %xmm5, 288(%rax) -; SSE-NEXT: movaps %xmm6, 240(%rax) -; SSE-NEXT: movdqa %xmm15, 224(%rax) -; SSE-NEXT: movaps %xmm10, 176(%rax) -; SSE-NEXT: movaps %xmm11, 128(%rax) +; SSE-NEXT: movaps %xmm9, 240(%rax) +; SSE-NEXT: movdqa %xmm14, 224(%rax) +; SSE-NEXT: movaps %xmm8, 176(%rax) +; SSE-NEXT: movaps %xmm10, 128(%rax) ; SSE-NEXT: movaps %xmm13, 112(%rax) -; SSE-NEXT: movaps %xmm14, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 432(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 368(%rax) -; SSE-NEXT: movaps %xmm8, 320(%rax) -; SSE-NEXT: movaps %xmm9, 304(%rax) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 272(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 256(%rax) -; SSE-NEXT: movaps %xmm12, 208(%rax) -; SSE-NEXT: movaps %xmm2, 192(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 432(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 368(%rax) +; SSE-NEXT: movaps %xmm1, 320(%rax) +; SSE-NEXT: movaps %xmm15, 304(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 272(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 256(%rax) +; SSE-NEXT: movaps %xmm0, 208(%rax) +; SSE-NEXT: movaps %xmm6, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps %xmm12, 96(%rax) +; SSE-NEXT: movaps %xmm11, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $520, %rsp # imm = 0x208 +; SSE-NEXT: addq $472, %rsp # imm = 0x1D8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm1 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] ; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm8 ; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] +; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm11 -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm11, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm4[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm14 +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm15 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] @@ -1572,34 +1573,34 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm12[1,1],xmm4[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm15[1],xmm14[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm14[1],xmm15[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,1],ymm9[1,1],ymm6[5,5],ymm9[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1],ymm8[1,1],ymm7[5,5],ymm8[5,5] -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm13 -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm10 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,1],ymm1[6,4],ymm0[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,1],ymm2[6,4],ymm0[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm14[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1],xmm0[0,2] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1],xmm0[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm2[1],zero -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 @@ -1617,112 +1618,111 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm5[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm13[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm10[1,1],ymm9[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm8[1,1],ymm7[5,5],ymm8[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1],ymm7[1,1],ymm8[5,5],ymm7[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm15[2,1],ymm2[6,4],ymm15[6,5] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm3[1,1],ymm6[5,5],ymm3[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,1],ymm15[6,4],ymm1[6,5] ; 
AVX1-ONLY-NEXT: vmovaps (%rax), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[0,2],ymm1[5,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm15[0],ymm2[0],ymm15[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm4[0,2],ymm0[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3],ymm10[3,3],ymm11[7,7],ymm10[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,3],ymm11[3,3],ymm13[7,7],ymm11[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3],ymm6[3,3],ymm11[7,7],ymm6[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = 
ymm11[3,3],ymm10[3,3],ymm11[7,7],ymm10[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm6[3,3],ymm3[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,3],ymm7[3,3],ymm8[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,3],ymm0[3,3],ymm1[7,7],ymm0[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,3],ymm0[1,2],ymm5[6,7],ymm0[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3],ymm1[3,3],ymm15[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3],ymm1[1,2],ymm5[6,7],ymm1[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm14[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm12[2],mem[2],xmm12[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[3,3],xmm12[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,1],ymm3[0,2],ymm13[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3],xmm6[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm5[0,2],ymm7[7,5],ymm5[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm12[3,3],xmm14[3,3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,1],ymm4[0,2],ymm5[7,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm14[3,3],xmm15[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0],ymm4[1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1],ymm5[0,2],ymm3[7,5],ymm5[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,3],xmm12[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] @@ -1733,7 +1733,8 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1750,7 +1751,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1758,52 +1759,52 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm6 -; AVX2-SLOW-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm2 +; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm8 -; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm9 +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm9 ; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm3 +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), 
%xmm10 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm7 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm7[1],xmm8[1],zero +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2],xmm6[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm10[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm12[2],xmm3[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm13[1],xmm11[1],zero +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1814,40 +1815,40 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = 
ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm6 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm9 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} 
ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 60(%r8), %ymm1 @@ -1857,9 +1858,9 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -1871,104 +1872,104 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm1 +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vbroadcastss %xmm7, %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm15, %ymm3 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vbroadcastsd %xmm15, %ymm4 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm3[3,3],xmm8[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = 
xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm12, %ymm5 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastsd %xmm4, %ymm5 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm9[1,1],ymm6[5,5],ymm9[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; 
AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] -; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] +; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm12 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = 
ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,3],ymm8[3,3],ymm0[7,7],ymm8[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3,4],ymm7[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm1[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = xmm0[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm12[0],ymm1[1],ymm12[1],ymm1[4],ymm12[4],ymm1[5],ymm12[5] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] @@ -1976,14 +1977,14 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = xmm0[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2015,38 +2016,38 @@ define 
void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm5 -; AVX2-FAST-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm6 +; AVX2-FAST-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %xmm7 -; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %xmm8 +; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FAST-NEXT: vmovaps %xmm4, %xmm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm10[1],xmm11[1],zero +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm12 ; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,2,2] @@ -2054,7 +2055,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm8[1],zero +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm11[1],zero ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2086,11 +2087,11 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm11 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm4 ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm15 = [5,6,5,6,5,6,5,6] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm15, %ymm15 @@ -2098,7 +2099,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[6],ymm3[6],ymm11[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2110,12 +2111,12 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm6[3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm6, %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2126,26 +2127,26 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vbroadcastss %xmm11, %xmm2 -; AVX2-FAST-NEXT: vbroadcastss %xmm10, %xmm10 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vbroadcastss %xmm10, %xmm2 +; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm9 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm14, %ymm9 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vbroadcastsd %xmm14, %ymm7 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm5[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm11[1,1],ymm3[5,5],ymm11[5,5] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 @@ -2153,90 +2154,90 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm6[3,3] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm5[3,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps 
{{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss %xmm0, %xmm3 +; AVX2-FAST-NEXT: vbroadcastss %xmm0, %xmm2 ; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm4 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm6, %ymm4 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastsd %xmm3, %ymm4 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1],ymm1[1,1],ymm7[5,5],ymm1[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm0[1,1],ymm8[5,5],ymm0[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = 
ymm2[0,1,2,3,4],ymm4[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 +; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm9[3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm9 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[6],ymm0[6],ymm5[7],ymm0[7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[3,3],ymm10[3,3],ymm2[7,7],ymm10[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm2[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm15[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = 
ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,3],ymm2[3,3],ymm1[7,7],ymm2[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2],ymm7[3,4],ymm5[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm1[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm15[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm11, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm10, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2265,52 +2266,52 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), 
%xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm7[1],xmm8[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = 
xmm10[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm12[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm13[1],xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2321,40 +2322,40 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm8 -; 
AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r8), %ymm1 @@ -2364,9 +2365,9 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -2378,104 +2379,104 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm7, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm15, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm3[3,3],xmm8[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm12, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm9[1,1],ymm6[5,5],ymm9[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,3],ymm8[3,3],ymm0[7,7],ymm8[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = 
ymm8[0],ymm7[1,2],ymm8[3,4],ymm7[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm1[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm0[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm12[0],ymm1[1],ymm12[1],ymm1[4],ymm12[4],ymm1[5],ymm12[5] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] @@ -2483,14 +2484,14 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm0[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 
# 32-byte Reload @@ -2517,111 +2518,111 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm8 ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm7 ; AVX512F-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm8 ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm10 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm9 ; AVX512F-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm11 ; AVX512F-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512F-NEXT: kmovw %ecx, %k3 ; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, 
%zmm11 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512F-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vpermi2d %zmm4, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm12 ; AVX512F-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm11 ; AVX512F-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm4, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm13 ; AVX512F-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm6, %zmm3, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm5, %zmm13 ; AVX512F-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm4, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm14 ; AVX512F-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm0, 
%zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm0 ; AVX512F-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, 384(%rax) @@ -2632,111 +2633,111 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm8 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm7 ; AVX512BW-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm8 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm10 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm9 ; AVX512BW-NEXT: movw $12384, %cx # imm = 0x3060 
; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm11 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm12 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm11 ; AVX512BW-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm13 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm13 ; AVX512BW-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm14 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38 ; 
AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm0 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rax) @@ -2765,72 +2766,71 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1256, %rsp # imm = 0x4E8 +; SSE-NEXT: subq $1224, %rsp # imm = 0x4C8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm13 ; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movaps (%rdx), %xmm14 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm13 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm12 +; SSE-NEXT: movaps 16(%rcx), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%r8), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r9), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm2 +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movdqa (%rax), %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] -; SSE-NEXT: movaps %xmm14, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm4 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 
32(%rcx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r9), %xmm1 @@ -2840,22 +2840,23 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 48(%rsi), %xmm9 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%r9), %xmm1 @@ -2867,24 +2868,23 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 48(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm1 -; SSE-NEXT: movaps 64(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 
64(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%r9), %xmm1 @@ -2894,17 +2894,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm3 +; SSE-NEXT: movdqa 80(%rsi), %xmm5 ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2922,194 +2922,192 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm6 -; SSE-NEXT: movaps 96(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movdqa 96(%rsi), %xmm3 +; SSE-NEXT: movaps 96(%rdx), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm3 -; SSE-NEXT: movaps 96(%r8), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rcx), %xmm5 +; SSE-NEXT: movaps 96(%r8), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps 
%xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rax), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm2 +; SSE-NEXT: movdqa 112(%rdx), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movaps 112(%rcx), %xmm5 ; SSE-NEXT: movaps 112(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 112(%rax), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
movaps %xmm14, %xmm0 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 112(%r9), %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 112(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm12[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[0,1],xmm5[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: 
movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3122,225 +3120,223 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] +; SSE-NEXT: unpckhps 
{{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] +; SSE-NEXT: movaps %xmm8, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movaps 112(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm10[0] +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm3, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; 
SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,1] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm10[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm5[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm10[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; 
SSE-NEXT: # xmm12 = xmm12[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm12[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} 
xmm1 = xmm5[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm5[0],xmm10[1,2,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm5[0],xmm7[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[2,0] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm5[0],xmm14[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm5, 864(%rax) -; SSE-NEXT: movaps %xmm7, 848(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: movaps %xmm6, 784(%rax) -; SSE-NEXT: movaps %xmm8, 736(%rax) +; SSE-NEXT: movaps %xmm3, 864(%rax) +; SSE-NEXT: movaps %xmm8, 848(%rax) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 832(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 800(%rax) +; SSE-NEXT: movaps %xmm4, 784(%rax) +; SSE-NEXT: movaps %xmm6, 736(%rax) ; SSE-NEXT: movaps %xmm9, 688(%rax) -; SSE-NEXT: movaps %xmm10, 672(%rax) +; SSE-NEXT: movaps %xmm12, 672(%rax) ; SSE-NEXT: movaps %xmm11, 624(%rax) ; SSE-NEXT: movaps %xmm13, 576(%rax) ; SSE-NEXT: movaps %xmm15, 560(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 512(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 464(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 448(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 288(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
SSE-NEXT: movaps %xmm0, 240(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 880(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 816(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 768(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 512(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 464(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 448(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 336(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 288(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 240(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 224(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 880(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 816(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 768(%rax) ; SSE-NEXT: movaps %xmm14, 752(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 720(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 704(%rax) -; SSE-NEXT: movaps %xmm1, 656(%rax) -; SSE-NEXT: movaps %xmm2, 640(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 720(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 704(%rax) +; SSE-NEXT: movaps %xmm2, 
656(%rax) +; SSE-NEXT: movaps %xmm0, 640(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 608(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 592(%rax) -; SSE-NEXT: movaps %xmm3, 544(%rax) -; SSE-NEXT: movaps %xmm12, 528(%rax) +; SSE-NEXT: movaps %xmm7, 544(%rax) +; SSE-NEXT: movaps %xmm10, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3363,7 +3359,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: movaps %xmm4, 192(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3376,32 +3373,32 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $1256, %rsp # imm = 0x4E8 +; SSE-NEXT: addq $1224, %rsp # imm = 0x4C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 +; AVX1-ONLY-NEXT: subq $1688, %rsp # imm = 0x698 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 @@ -3418,11 +3415,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -3437,7 +3434,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3455,8 +3452,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3466,18 +3463,18 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: 
vmovaps 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -3496,11 +3493,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3510,17 +3507,16 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3530,26 +3526,26 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm7 -; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3557,32 +3553,32 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm9[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm9[1,1],ymm2[5,5],ymm9[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[2,1],ymm1[6,4],ymm7[6,5] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 @@ -3597,91 +3593,90 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm6[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm11 ; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm11[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm11[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm3[1],xmm4[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm5[1],xmm3[1],zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm1[0,2],ymm8[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[0,2],ymm12[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm1[0,2],ymm13[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm14 +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[0,2],ymm10[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm1[0,2],ymm8[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[3,3],ymm14[3,3],ymm5[7,7],ymm14[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm15[3,3],ymm8[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm12[3,3],ymm2[7,7],ymm12[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 124(%r8), %ymm1 @@ -3691,12 +3686,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[4],ymm15[4],ymm8[5],ymm15[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm5[0],ymm14[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[3,1],ymm0[0,2],ymm5[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm12[0],ymm2[1],ymm12[1],ymm2[4],ymm12[4],ymm2[5],ymm12[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 108(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -3705,8 +3697,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm2[1,1],ymm3[5,5],ymm2[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm8[1,1],ymm15[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm5[1,1],ymm14[5,5],ymm5[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%r8), %ymm1 @@ -3729,18 +3721,18 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; 
AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm12[3,3],ymm4[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3],ymm6[3,3],ymm7[7,7],ymm6[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm5[3,3],ymm11[7,7],ymm5[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3],ymm3[3,3],ymm15[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm11[3,3],mem[3,3],ymm11[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] @@ -3748,126 +3740,127 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm13[0,1,2,3,4],ymm1[5,6],ymm13[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm13[2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[3,3],ymm15[3,3],ymm4[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3],ymm12[3,3],ymm11[7,7],ymm12[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm6[3,3],ymm10[7,7],ymm6[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,3],ymm12[3,3],ymm0[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm1[2,3],ymm11[1,2],ymm1[6,7],ymm11[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm13[1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[3,3],ymm11[3,3],ymm8[7,7],ymm11[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,3],ymm14[3,3],ymm6[7,7],ymm14[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm14[3,3],ymm4[7,7],ymm14[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = 
ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,3],ymm3[1,2],ymm2[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,1],ymm3[0,2],ymm11[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm13[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[4],ymm14[4],ymm4[5],ymm14[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = 
ymm6[3,1],ymm5[0,2],ymm6[7,5],ymm5[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm9[3,3] +; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm5[0,2],ymm7[7,5],ymm5[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm12[0],ymm0[1],ymm12[1],ymm0[4],ymm12[4],ymm0[5],ymm12[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,1],ymm7[0,2],ymm0[7,5],ymm7[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm9[0,2],ymm8[7,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = 
xmm13[3,3],xmm14[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 640(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3914,40 +3907,41 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rax) -; AVX1-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 +; AVX1-ONLY-NEXT: addq $1688, %rsp # imm = 0x698 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1320, %rsp # imm = 0x528 +; AVX2-SLOW-NEXT: subq $1304, %rsp # imm = 0x518 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm14 -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm7 -; AVX2-SLOW-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm10 -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm9 +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm15 +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm9 ; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm1 +; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm8 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm4 -; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm12[1],zero +; AVX2-SLOW-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm7 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm7[1],xmm8[1],zero ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-SLOW-NEXT: vmovaps 
32(%rsi), %xmm13 -; AVX2-SLOW-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] @@ -3955,15 +3949,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm13[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm4[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3986,7 +3980,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm13 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm13[1],xmm2[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] @@ -3994,8 +3988,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rax), %xmm1 @@ -4046,11 +4040,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 
32(%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4086,152 +4080,152 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vbroadcastss 112(%r9), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 112(%rax), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm15 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps %xmm14, %xmm11 -; AVX2-SLOW-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm11[1,1],ymm3[1,1],ymm11[5,5],ymm3[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm14 +; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vbroadcastss 112(%r9), %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 112(%rax), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps %xmm15, %xmm1 +; AVX2-SLOW-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,3],xmm5[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm5[2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm5 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps 
{{.*#+}} xmm5 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm11[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3],xmm7[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm0 ; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm5 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm10[3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3],xmm1[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm5 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm9[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm2[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = 
xmm9[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm10[0],ymm4[0],ymm10[1],ymm4[1],ymm10[4],ymm4[4],ymm10[5],ymm4[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 108(%r8), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[6],ymm3[6],ymm11[7],ymm3[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = mem[1,2,2,3,5,6,6,7] @@ -4243,8 +4237,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm7[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[6],ymm10[6],ymm4[7],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -4255,38 +4249,38 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm5[1,1],ymm10[5,5],ymm5[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm12[1,1],ymm6[5,5],ymm12[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[1,1],ymm1[1,1],ymm14[5,5],ymm1[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm4 @@ -4299,102 +4293,100 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 80(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastsd 80(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm5[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm3[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[4],ymm5[4],ymm9[5],ymm5[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm5, %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = xmm5[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[6],ymm6[6],ymm12[7],ymm6[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm7[3,3],ymm10[3,3],ymm7[7,7],ymm10[7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm4[1,2,3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm3[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] +; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6],ymm10[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm10 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[4],ymm13[4],ymm11[5],ymm13[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = xmm1[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0],ymm10[1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = 
ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm12 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1,2],ymm12[3,4],ymm7[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6],ymm12[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm5[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3,4],ymm12[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0],ymm4[1,2,3,4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm12 +; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6],ymm13[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; 
AVX2-SLOW-NEXT: # xmm13 = xmm2[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm14 = ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[6],ymm0[6],ymm11[7],ymm0[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm12, 640(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 544(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 640(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 544(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 416(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 608(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 576(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 384(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4435,7 +4427,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-SLOW-NEXT: addq $1320, %rsp # imm = 0x528 +; AVX2-SLOW-NEXT: addq $1304, %rsp # imm = 0x518 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4445,43 +4437,43 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm4 -; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm5 +; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm6 ; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm6[1],zero +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm11 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm10[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2],xmm6[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = 
xmm8[1,1,2,2] @@ -4489,8 +4481,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm6[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm2[1],xmm5[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r8), %xmm1 @@ -4511,20 +4503,20 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm3[1],xmm2[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r9), %xmm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX2-FAST-NEXT: vmovaps %xmm2, %xmm7 -; AVX2-FAST-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps %xmm2, %xmm14 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 96(%rax), %xmm1 @@ -4538,44 +4530,44 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm3[1],xmm2[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; 
AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm10 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[6],ymm10[6],ymm6[7],ymm10[7] -; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm1[2],ymm9[3],ymm1[3],ymm9[6],ymm1[6],ymm9[7],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm14 -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm12 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm7 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4590,8 +4582,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 @@ -4613,51 +4605,51 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm5[1,1],ymm1[1,1],ymm5[5,5],ymm1[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm15[5,6],ymm3[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,1],ymm0[1,1],ymm4[5,5],ymm0[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6],ymm15[7] ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0],ymm3[1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vbroadcastss 112(%r9), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 112(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm15[2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6],ymm15[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vbroadcastss 108(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermps 96(%r9), %ymm12, %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermps 96(%r9), %ymm14, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vmovaps 96(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -4668,45 +4660,45 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm0 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm13, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps %xmm11, %xmm15 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[3,3],xmm15[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm11[3,3],xmm10[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm13[2],xmm8[3],xmm13[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps %xmm5, %xmm9 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm10[1,1],ymm6[5,5],ymm10[5,5] +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm9[1,1],mem[1,1],ymm9[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -4715,41 +4707,40 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vbroadcastss %xmm0, %xmm1 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm2 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm2 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vbroadcastsd (%rsp), %ymm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm0[1,1],mem[1,1],ymm0[5,5],mem[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] @@ -4761,165 +4752,165 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm1 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm2 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm0[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = 
xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm10[1,1],ymm13[5,5],ymm10[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1],ymm8[1,1],ymm14[5,5],ymm8[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 80(%rax), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 80(%rax), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm3 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm4, 
%xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm5 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm1[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm3[2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3],xmm9[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[6],ymm1[6],ymm7[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4],ymm4[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3],xmm15[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm9 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm1[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6],ymm9[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm5[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[4],ymm1[4],ymm6[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2,3],ymm9[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm2[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm9[1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm6 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm0[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6],ymm9[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm3[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, (%rsp), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = 
ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm9 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6],ymm10[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = xmm0[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[6],ymm14[6],ymm8[7],ymm14[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[6],ymm13[6],ymm11[7],ymm13[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm9, 640(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm8, 544(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm8, 640(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 544(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm7, 416(%rax) -; 
AVX2-FAST-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 736(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm15, 736(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4968,34 +4959,35 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1320, %rsp # imm = 0x528 +; AVX2-FAST-PERLANE-NEXT: subq $1304, %rsp # imm = 0x518 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm12[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm7[1],xmm8[1],zero ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] @@ -5003,15 +4995,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm13[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm4[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5034,7 +5026,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm13[1],xmm2[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] @@ -5042,8 +5034,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %xmm1 @@ -5094,11 +5086,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm2 +; 
AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5134,152 +5126,152 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%r9), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm11[1,1],ymm3[1,1],ymm11[5,5],ymm3[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%r9), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rax), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm5[2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm11[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3],xmm7[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm10[3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3],xmm1[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm9[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm2[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm10[0],ymm4[0],ymm10[1],ymm4[1],ymm10[4],ymm4[4],ymm10[5],ymm4[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 108(%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = 
ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[6],ymm3[6],ymm11[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = mem[1,2,2,3,5,6,6,7] @@ -5291,8 +5283,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[6],ymm10[6],ymm4[7],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -5303,38 +5295,38 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm5[1,1],ymm10[5,5],ymm5[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm12[1,1],ymm6[5,5],ymm12[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte 
Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[1,1],ymm1[1,1],ymm14[5,5],ymm1[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm4 @@ -5347,102 +5339,100 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm5[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm3[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[4],ymm5[4],ymm9[5],ymm5[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm5[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[6],ymm6[6],ymm12[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 
= ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm7[3,3],ymm10[3,3],ymm7[7,7],ymm10[7,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm4[1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm3[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm10 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[4],ymm13[4],ymm11[5],ymm13[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm1[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0],ymm10[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm12 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] 
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3,4],ymm12[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0],ymm4[1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1,2],ymm12[3,4],ymm7[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm5[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm2[3,3],mem[3,3] +; 
AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm14 = ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[6],ymm0[6],ymm11[7],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 640(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 544(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 640(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 544(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 416(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 608(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 576(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 384(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5483,7 +5473,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1320, %rsp # imm = 0x528 +; AVX2-FAST-PERLANE-NEXT: addq $1304, %rsp # imm = 0x518 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -5491,201 +5481,200 @@ 
define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F: # %bb.0: ; AVX512F-NEXT: pushq %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm12 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm11 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm23 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm20 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm22 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm14 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vpermi2d %zmm26, %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm28, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm16 +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512F-NEXT: vpermi2d %zmm26, %zmm16, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm21, %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm23, %zmm7 ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm29, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm30 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm0 {%k2} ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm18 {%k2} -; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 -; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512F-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm29, %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm15, %zmm26 +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm0 {%k2} +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm27 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, 
%zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512F-NEXT: vpermi2d %zmm5, %zmm12, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512F-NEXT: vpermi2d %zmm10, %zmm15, %zmm26 ; AVX512F-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vpermt2d %zmm13, %zmm15, %zmm31 ; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm16 {%k2} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm16 {%k2} +; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm28 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512F-NEXT: vpermi2d %zmm12, %zmm5, %zmm26 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-NEXT: vpermi2d %zmm10, %zmm26, %zmm27 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vpermt2d %zmm14, %zmm28, %zmm30 ; AVX512F-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-NEXT: vpermt2d %zmm29, %zmm27, %zmm30 ; AVX512F-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 -; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512F-NEXT: vpermt2d %zmm13, %zmm30, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm8, %zmm18 +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512F-NEXT: vpermt2d %zmm11, %zmm26, %zmm17 ; AVX512F-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} -; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512F-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm17 {%k2} +; AVX512F-NEXT: vpermi2d %zmm12, %zmm5, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm27, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
+; AVX512F-NEXT: vpermt2d %zmm14, %zmm4, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512F-NEXT: vpermt2d %zmm29, %zmm15, %zmm21 ; AVX512F-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 -; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm17 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm25 +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm30 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm26 +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-NEXT: vpermt2d %zmm11, %zmm27, %zmm24 ; AVX512F-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} -; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} +; AVX512F-NEXT: vpermi2d %zmm12, %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vpermt2d %zmm14, %zmm15, %zmm20 +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm26 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512F-NEXT: vpermt2d %zmm29, %zmm4, %zmm20 ; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} -; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm0 -; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm20 {%k2} +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm27 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm27 {%k1} +; AVX512F-NEXT: vpermi2d %zmm12, %zmm5, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm15 +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vpermt2d %zmm11, %zmm4, %zmm22 
+; AVX512F-NEXT: vpermt2d %zmm8, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512F-NEXT: vpermt2d %zmm13, %zmm4, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm4, %zmm23 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm4, %zmm23 +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} ; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512F-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm14, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512F-NEXT: vpermi2d %zmm29, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512F-NEXT: vpermi2d %zmm29, %zmm4, %zmm3 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 448(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm16, 704(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) ; AVX512F-NEXT: popq %rax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5694,201 +5683,200 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm20 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm22 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm14 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm16, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm16 +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm23 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm23, %zmm7 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm29, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm30 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k2} ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm18 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512BW-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm29, %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm15, %zmm26 +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm0 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm2 {%k1} +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm15 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm12, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm15, %zmm26 ; AVX512BW-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm31 ; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm16 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm16 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm5, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm26, %zmm27 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm28, %zmm30 ; AVX512BW-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm27, %zmm30 ; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm30, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm8, %zmm18 +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm26, %zmm17 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm17 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm5, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm27, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 
= +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm15, %zmm21 ; AVX512BW-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm25 +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm30 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm27, %zmm24 ; AVX512BW-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm15, %zmm20 +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm26 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm4, %zmm20 ; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm20 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm27 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm5, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm15 {%k2} +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm4, %zmm23 +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} ; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm4, %zmm3 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -5915,48 +5903,49 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $2760, %rsp # imm = 0xAC8 +; SSE-NEXT: subq $2776, %rsp # imm = 0xAD8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm3 -; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm13 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%r8), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r9), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%r9), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa (%rax), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; 
SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 @@ -5966,22 +5955,20 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm8 +; SSE-NEXT: movaps 32(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r9), %xmm1 @@ -5991,17 +5978,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; SSE-NEXT: movdqa 48(%rsi), %xmm10 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: 
movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 48(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6017,17 +6004,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rsi), %xmm1 +; SSE-NEXT: movdqa 64(%rsi), %xmm12 ; SSE-NEXT: movaps 64(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm4 @@ -6044,17 +6030,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm2 +; SSE-NEXT: movdqa 80(%rsi), %xmm4 ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6070,23 +6058,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 80(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; 
SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm1 ; SSE-NEXT: movaps 96(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm10 +; SSE-NEXT: movaps 96(%rcx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm1 @@ -6096,22 +6085,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm2 +; SSE-NEXT: movdqa 112(%rsi), %xmm4 ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 112(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6122,16 +6112,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 112(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rsi), %xmm1 ; SSE-NEXT: movaps 128(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] @@ -6151,15 +6141,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rsi), %xmm2 +; SSE-NEXT: movdqa 144(%rsi), %xmm4 ; SSE-NEXT: movdqa 144(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 144(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6177,14 +6168,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rsi), %xmm1 ; SSE-NEXT: movaps 160(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] @@ -6204,21 +6195,21 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rsi), %xmm2 +; SSE-NEXT: movdqa 176(%rsi), %xmm4 ; SSE-NEXT: movdqa 176(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 176(%rcx), 
%xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%r9), %xmm1 @@ -6230,22 +6221,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rsi), %xmm1 ; SSE-NEXT: movaps 192(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 192(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%r9), %xmm1 @@ -6257,294 +6248,290 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rsi), %xmm3 +; SSE-NEXT: movdqa 208(%rsi), %xmm6 ; SSE-NEXT: movdqa 208(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm6 -; SSE-NEXT: movaps 208(%r8), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rcx), %xmm15 +; SSE-NEXT: movaps 
208(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm15[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%r9), %xmm6 -; SSE-NEXT: movdqa 208(%rax), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%r9), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rax), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps 224(%rcx), %xmm4 +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] +; SSE-NEXT: movaps 224(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 224(%r9), %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE-NEXT: movaps 224(%r9), %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movaps 224(%rax), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 240(%rcx), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movaps 240(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm6[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movaps 240(%r9), %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movaps 240(%rax), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 
16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = 
xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[1,3] +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[1,3] -; SSE-NEXT: movaps (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte 
Reload +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -6552,101 +6539,101 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte 
Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = 
xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6658,349 +6645,347 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; 
SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm14[3,3] -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm12[3,3] +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 224(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = 
xmm15[0],xmm4[0],xmm15[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movaps 224(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0] ; SSE-NEXT: movaps 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm11[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0,1],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps %xmm4, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps 240(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0,1],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded 
Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[2,0] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: 
movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm4[0],xmm11[1,2,3] +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte 
Folded Reload +; SSE-NEXT: # xmm2 = xmm2[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = 
xmm1[0],xmm0[1,2,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1760(%rax) -; SSE-NEXT: movaps %xmm7, 1744(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1728(%rax) +; SSE-NEXT: movaps %xmm5, 1760(%rax) +; SSE-NEXT: movaps %xmm8, 1744(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1728(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 1696(%rax) -; SSE-NEXT: movaps %xmm5, 1680(%rax) -; SSE-NEXT: movaps %xmm8, 1648(%rax) +; SSE-NEXT: movaps %xmm7, 1680(%rax) +; SSE-NEXT: movaps %xmm6, 1648(%rax) ; SSE-NEXT: movaps %xmm9, 1632(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1616(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1616(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 1584(%rax) ; SSE-NEXT: movaps %xmm10, 1568(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1536(%rax) -; SSE-NEXT: movaps %xmm13, 1520(%rax) -; SSE-NEXT: movaps %xmm12, 1472(%rax) -; SSE-NEXT: movaps %xmm14, 1456(%rax) -; SSE-NEXT: movaps %xmm15, 1408(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1360(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1344(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1296(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1248(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1232(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1184(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1136(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1120(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1536(%rax) +; SSE-NEXT: movaps %xmm11, 1520(%rax) +; SSE-NEXT: movaps %xmm15, 1472(%rax) +; SSE-NEXT: movaps %xmm12, 1456(%rax) +; SSE-NEXT: movaps %xmm13, 1408(%rax) +; SSE-NEXT: movaps %xmm14, 1360(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1344(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1296(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1248(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1232(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1184(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 
1136(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1120(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 1072(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7075,27 +7060,27 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, 1504(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 1488(%rax) -; SSE-NEXT: movaps %xmm4, 1440(%rax) -; SSE-NEXT: movaps %xmm6, 1424(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1392(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1376(%rax) -; SSE-NEXT: movaps %xmm11, 1328(%rax) +; SSE-NEXT: movaps %xmm2, 1440(%rax) +; SSE-NEXT: movaps %xmm0, 1424(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1392(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1376(%rax) +; SSE-NEXT: movaps %xmm4, 1328(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1312(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1280(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1264(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1280(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1264(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1216(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1200(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1168(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1152(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1168(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1152(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1104(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7112,7 +7097,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 944(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 928(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 880(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%rax) @@ -7120,7 +7105,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 816(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 752(%rax) @@ 
-7154,7 +7139,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps %xmm2, 304(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7175,35 +7161,34 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $2760, %rsp # imm = 0xAC8 +; SSE-NEXT: addq $2776, %rsp # imm = 0xAD8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3432, %rsp # imm = 0xD68 +; AVX1-ONLY-NEXT: subq $3416, %rsp # imm = 0xD58 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 @@ -7216,14 +7201,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -7236,10 +7221,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm8[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7257,8 +7242,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7268,26 +7253,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, 
%ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7295,14 +7280,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm10[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7312,17 +7297,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm10[1,1],ymm1[5,5],ymm10[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7332,18 +7316,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 @@ -7361,11 +7346,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm9[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7375,17 +7360,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7395,26 +7379,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7422,14 +7406,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm9[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7439,16 +7423,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm2[0],ymm14[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7461,23 +7446,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7485,14 +7470,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm4[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm10[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7502,11 +7487,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7522,17 +7507,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm12 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm4 @@ -7550,31 +7537,31 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm3[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm11[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm12[1,1],ymm1[5,5],ymm12[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,1],ymm1[6,4],ymm11[6,5] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,1],ymm1[6,4],ymm10[6,5] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -7582,25 +7569,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm6[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7608,36 +7594,36 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm9[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm4[1],xmm3[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[1,1],ymm7[5,5],ymm0[5,5] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm0[1,1],ymm4[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,1],ymm12[6,4],ymm1[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: 
vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[2,1],ymm14[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -7645,106 +7631,108 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm4[0,2],ymm8[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm15[1],ymm9[3],ymm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm15[1,1],ymm9[0,2],ymm15[5,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = 
ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[1,1],ymm9[0,2],ymm7[5,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm9[0,2],ymm8[5,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm14[1],ymm4[3],ymm14[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[1,1],ymm4[0,2],ymm14[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[1,1],ymm9[0,2],ymm3[5,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1],ymm4[0,2],ymm6[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = 
ymm3[1,1],ymm9[0,2],ymm3[5,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm4[0,2],ymm13[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[6],ymm12[6],ymm3[7],ymm12[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[1,1],ymm9[0,2],ymm11[5,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] @@ -7757,19 +7745,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm12[1],xmm4[1],zero +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm14[1],xmm9[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm15[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm12[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm14[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -7781,9 +7769,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm6[3,3],ymm8[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm6[3,3],ymm4[7,7],ymm6[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm7[3,3],ymm11[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 220(%r8), %ymm1 @@ -7793,7 +7781,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 216(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] @@ -7807,12 +7795,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm3[3,3],ymm8[7,7],ymm3[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3],ymm3[3,3],ymm7[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm2[3,3],ymm4[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm2[3,3],ymm8[7,7],ymm2[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 252(%r8), %ymm1 @@ -7822,9 +7810,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,1],ymm0[0,2],ymm8[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,1],ymm0[0,2],ymm7[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[4],ymm2[4],ymm8[5],ymm2[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 236(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -7833,8 +7821,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm4[1,1],ymm2[5,5],ymm4[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm7[1,1],ymm3[5,5],ymm7[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm8[1,1],ymm2[5,5],ymm8[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 240(%r8), %ymm1 @@ -7847,10 +7835,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded 
Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7864,9 +7852,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm5[3,3],mem[3,3],ymm5[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm5[3,3],mem[3,3],ymm5[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7887,9 +7875,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpermilps $170, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -7899,9 +7887,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm6[3,3],mem[3,3],ymm6[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7934,9 +7922,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: 
vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[3,3],mem[3,3],ymm9[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7966,11 +7954,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm10[3,3],ymm0[7,7],ymm10[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[3,3],mem[3,3],ymm7[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8000,12 +7989,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3],ymm15[3,3],ymm12[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm11[3,3],ymm2[7,7],ymm11[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm14[3,3],ymm2[7,7],ymm14[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8030,23 +8019,24 @@ define void @store_i32_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm12[3,3],ymm11[7,7],ymm12[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3],ymm9[3,3],ymm0[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3],ymm10[3,3],ymm0[7,7],ymm10[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,3],mem[3,3],ymm3[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3],ymm3[1,2],ymm5[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3],ymm3[1,2],ymm4[6,7],ymm3[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4],ymm3[5,6,7] @@ -8054,67 +8044,64 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 
16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,1],ymm3[0,2],ymm6[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,1],ymm3[0,2],ymm5[7,5],ymm3[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[4],mem[4],ymm6[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,1],ymm3[0,2],ymm6[7,5],ymm3[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; 
AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[4],mem[4],ymm9[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,1],ymm3[0,2],ymm9[7,5],ymm3[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,1],ymm3[0,2],ymm7[7,5],ymm3[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload @@ -8122,51 +8109,50 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[4],ymm11[4],ymm2[5],ymm11[5] -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm2 -; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1],ymm12[0,2],ymm2[7,5],ymm12[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[4],ymm14[4],ymm2[5],ymm14[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,1],ymm14[0,2],ymm15[7,5],ymm14[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[4],ymm9[4],ymm0[5],ymm9[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm10[3,1],ymm15[0,2],ymm10[7,5],ymm15[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm13[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm2[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[3,1],ymm15[0,2],ymm11[7,5],ymm15[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm13[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,1],ymm13[0,2],ymm0[7,5],ymm13[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm7[3,3],xmm8[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,1],ymm13[0,2],ymm0[7,5],ymm13[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm8[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm10, 1440(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 1440(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 1216(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 992(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 544(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 1408(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1312(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8261,13 +8247,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX1-ONLY-NEXT: addq $3432, %rsp # imm = 0xD68 +; AVX1-ONLY-NEXT: addq $3416, %rsp # imm = 0xD58 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2968, %rsp # imm = 0xB98 +; AVX2-SLOW-NEXT: subq $3016, %rsp # imm = 0xBC8 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8283,14 +8269,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm10 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm9 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero +; AVX2-SLOW-NEXT: 
vmovaps (%rdx), %xmm8 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm9[1],zero ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8308,10 +8294,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm3[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8333,11 +8319,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2 ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm2[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8607,58 +8592,60 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vbroadcastss %xmm2, %xmm3 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 224(%r8), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-SLOW-NEXT: vbroadcastss %xmm3, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vbroadcastss %xmm3, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 224(%rax), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm2[1],xmm0[1],zero +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 228(%r8), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm15[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vinsertf128 $1, 224(%rax), %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,3],xmm4[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm11 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] ; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] @@ -8667,8 +8654,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastss 240(%rax), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm15 +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm15 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] @@ -8681,212 +8668,211 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm9 -; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm7 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm14[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm7 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm1[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm7 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = 
xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm1[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm7 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm6 ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; 
AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm1[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vbroadcastsd 136(%rax), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm6 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm4, %xmm7 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3],xmm0[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vbroadcastsd 168(%rax), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm6 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm7 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm4, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3],xmm0[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = 
ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vbroadcastsd 200(%rax), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 220(%r8), %ymm7 @@ -8899,17 +8885,20 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm11[0],ymm5[1],ymm11[1],ymm5[4],ymm11[4],ymm5[5],ymm11[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] +; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = 
ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] @@ -8919,8 +8908,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[6],ymm1[6],ymm11[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -8931,19 +8920,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm6[1,1],ymm11[5,5],ymm6[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm2 @@ -8952,12 +8941,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = 
ymm15[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm10[1,1],ymm1[5,5],ymm10[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9046,9 +9035,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm8[1,1],ymm0[5,5],ymm8[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -9059,43 +9048,45 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: 
vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3],ymm13[3,3],ymm12[7,7],ymm13[7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] +; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm15[0],ymm5[1],ymm15[1],ymm5[4],ymm15[4],ymm5[5],ymm15[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -9108,8 +9099,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: 
vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -9122,8 +9113,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -9134,7 +9125,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] @@ -9157,97 +9148,98 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0],ymm1[1,2],ymm12[3,4],ymm1[5,6],ymm12[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 144(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, (%rsp), %xmm4, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm4[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm12[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2],ymm11[3,4],ymm12[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm0[1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm11 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] ; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = xmm7[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm7[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3,4],ymm12[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-SLOW-NEXT: vpermilps $39, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-SLOW-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vbroadcastss 208(%rdx), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = xmm1[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm1[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm13, 1440(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 1440(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 1216(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 1216(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 1088(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 992(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm5, 864(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 640(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 640(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm12, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm15, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 1472(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9330,13 +9322,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-SLOW-NEXT: addq $2968, %rsp # imm = 0xB98 +; AVX2-SLOW-NEXT: addq $3016, %rsp # imm = 0xBC8 ; AVX2-SLOW-NEXT: vzeroupper ; 
AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $3080, %rsp # imm = 0xC08 +; AVX2-FAST-NEXT: subq $3112, %rsp # imm = 0xC28 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9532,22 +9524,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm15 -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm12 +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm11 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2],ymm1[3,4,5],ymm12[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9589,11 +9581,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9612,11 +9604,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9635,11 +9627,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9658,12 +9650,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm12 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm10 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm2 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm13 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: 
vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm8 @@ -9694,31 +9686,31 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastss 228(%r8), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm4[1,1,1,1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vinsertf128 $1, 224(%rax), %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm2 ; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm5 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 224(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FAST-NEXT: vbroadcastss 224(%rax), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6],ymm2[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 224(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastss %xmm4, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastss 224(%rax), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] @@ -9731,51 +9723,51 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[2,2,2,2] ; 
AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1],ymm0[1,1],ymm3[5,5],ymm0[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6],ymm11[7] -; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vbroadcastss 240(%r9), %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 240(%rax), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6],ymm15[7] +; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vbroadcastss 240(%r9), %xmm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 240(%rax), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 220(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 220(%r9), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastsd 216(%rax), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 220(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcastss 220(%r9), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vbroadcastsd 216(%rax), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm11 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6],ymm14[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm14 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm11, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm14, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm11[2,3],ymm14[2,3] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm14[2,3],ymm15[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] @@ -9790,30 +9782,30 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps 
{{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9828,8 +9820,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm2 @@ -9841,26 +9833,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm1, 
%ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm5[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -9884,30 +9876,30 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; 
AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9915,8 +9907,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -9932,43 +9924,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9980,43 +9972,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 136(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm13[1,1],ymm1[5,5],ymm13[5,5] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10028,43 +10020,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 168(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm12[1,1],ymm1[5,5],ymm12[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10084,7 +10076,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -10096,7 +10088,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] @@ -10105,209 +10097,212 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm8[1,1],ymm0[5,5],ymm8[5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 208(%rax), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 208(%rax), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = 
ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 
+; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm1[6],ymm9[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm8[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3,4],ymm5[5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3,4],ymm10[5,6],ymm13[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = 
ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6],ymm13[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = xmm8[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = xmm13[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm5[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm5[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm15 = xmm8[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3,4],ymm13[5,6],ymm14[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0],ymm5[1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm14 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = xmm6[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm15[1,2],ymm6[3,4],ymm15[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm14 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm2[6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = xmm0[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm4, 1440(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 1216(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm13, 1088(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 992(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, 864(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm2, 768(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 640(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm3, 544(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm7, 416(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, 1440(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 1312(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 1216(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 1088(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 992(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, 864(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 768(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 640(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10398,13 +10393,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 1568(%rax) ; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX2-FAST-NEXT: addq $3080, %rsp # imm = 0xC08 +; AVX2-FAST-NEXT: addq $3112, %rsp # imm = 0xC28 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2968, %rsp # imm = 0xB98 +; AVX2-FAST-PERLANE-NEXT: subq $3016, %rsp # imm = 0xBC8 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10420,14 +10415,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm9[1],zero ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10445,10 +10440,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm3[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10470,11 +10465,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2 ; 
AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm2[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10744,58 +10738,60 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 224(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] -; 
AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 224(%rax), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm2[1],xmm0[1],zero +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 228(%r8), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm15[3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 224(%rax), %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,3],xmm4[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] @@ -10804,8 +10800,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rax), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] @@ -10818,212 +10814,211 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 
-; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm14[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm1[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm1[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm1[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 136(%rax), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm4, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} 
xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3],xmm0[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 168(%rax), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3],xmm0[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 200(%rax), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r8), %ymm7 @@ -11036,17 +11031,20 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = 
ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm11[0],ymm5[1],ymm11[1],ymm5[4],ymm11[4],ymm5[5],ymm11[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] @@ -11056,8 +11054,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[6],ymm1[6],ymm11[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11068,19 +11066,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte 
Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm6[1,1],ymm11[5,5],ymm6[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm2 @@ -11089,12 +11087,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm10[1,1],ymm1[5,5],ymm10[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -11183,9 +11181,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm8[1,1],ymm0[5,5],ymm8[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -11196,43 +11194,45 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3],ymm13[3,3],ymm12[7,7],ymm13[7,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = 
ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm15[0],ymm5[1],ymm15[1],ymm5[4],ymm15[4],ymm5[5],ymm15[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11245,8 +11245,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -11259,8 +11259,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = 
ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -11271,7 +11271,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] @@ -11294,97 +11294,98 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0],ymm1[1,2],ymm12[3,4],ymm1[5,6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 144(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 
32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, (%rsp), %xmm4, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm4[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm12[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2],ymm11[3,4],ymm12[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm0[1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm15 = 
ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm7[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm7[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3,4],ymm12[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded 
Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm1[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm1[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 1440(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 1440(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 1216(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 1216(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 1088(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 992(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 864(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 640(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 1472(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11467,959 +11468,965 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2968, %rsp # imm = 0xB98 +; AVX2-FAST-PERLANE-NEXT: addq $3016, %rsp # imm = 0xBC8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: store_i32_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3080, %rsp # imm = 0xC08 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm22 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm5 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: subq $3016, %rsp # imm = 0xBC8 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm25 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm17 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
<2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512F-NEXT: vpermt2d %zmm18, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2d 
%zmm5, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm4, %zmm22, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm4, %zmm22, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm4, %zmm22, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm16, %zmm3, %zmm20 +; AVX512F-NEXT: vpermi2d %zmm16, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm16, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm4, %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm28, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, 
%zmm25 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm29 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm21 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm21, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm21, %zmm30 -; AVX512F-NEXT: vpermi2d %zmm21, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: 
vpermt2d %zmm3, %zmm1, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm9, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm28, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm28, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm26 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm4 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm26, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm26, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm26, %zmm4, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm3, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm2, %zmm29, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm2, %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; 
AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm8, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm5 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm7, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm8, %zmm15 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: 
vmovdqa64 %zmm4, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm9, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm8, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm23 -; AVX512F-NEXT: vpermi2d %zmm13, %zmm12, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm13, %zmm12, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm23 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm29 +; AVX512F-NEXT: vpermi2d %zmm14, %zmm19, %zmm31 +; AVX512F-NEXT: vpermi2d %zmm14, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm14, %zmm3, %zmm19 ; AVX512F-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm27 {%k1} ; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm9 {%k1} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm28 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm28, 
%zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm6 ; AVX512F-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm30 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 ; AVX512F-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm12 ; AVX512F-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm28 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm30 {%k2} +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm28 {%k2} ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512F-NEXT: kmovw %ecx, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm18 ; AVX512F-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 -; 
AVX512F-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm22 {%k1} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512F-NEXT: kmovw %ecx, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm31 {%k2} ; AVX512F-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm10 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm25 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm23 {%k3} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm7 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm12 +; AVX512F-NEXT: 
vmovdqa32 %zmm12, %zmm31 {%k3} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm12 ; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> -; AVX512F-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm2, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermi2d %zmm2, %zmm27, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm24, %zmm12 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm14, %zmm13 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 +; 
AVX512F-NEXT: vpermi2d %zmm2, %zmm21, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm19 {%k1} ; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm8, %zmm15, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm30, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vpermt2d %zmm5, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vpermi2d %zmm5, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> -; AVX512F-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512F-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512F-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] -; AVX512F-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm5, %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512F-NEXT: vpermi2d %zmm5, %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512F-NEXT: vpermi2d %zmm7, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512F-NEXT: vpermi2d %zmm7, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512F-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512F-NEXT: vpermi2d %zmm7, %zmm10, %zmm6 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 1472(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm30, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 1088(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 1024(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 1344(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 768(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 704(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 448(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rax) -; 
AVX512F-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 320(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 256(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 1536(%rax) -; AVX512F-NEXT: addq $3080, %rsp # imm = 0xC08 +; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 1536(%rax) +; AVX512F-NEXT: addq $3016, %rsp # imm = 0xBC8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3080, %rsp # imm = 0xC08 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm22 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm5 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm5 +; AVX512BW-NEXT: subq $3016, %rsp # imm = 0xBC8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm17 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm22, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm22, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm22, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm20, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm3, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
+; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm29 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm21, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm21, %zmm30 -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, 
%zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm26 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm26, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm26, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm29, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, 
%zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm15 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm9, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm8, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm23 -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm12, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm12, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm19, %zmm31 +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm3, %zmm19 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm27 {%k1} ; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm9 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; 
AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm12 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm28 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm30 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm28 {%k2} ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm18 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm22 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k2} ; AVX512BW-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm25 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm23 {%k3} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm31 {%k3} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm12 ; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 -; 
AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm24, %zmm12 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm14, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm21, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm19 {%k1} ; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm15, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm14, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm10, %zmm6 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 896(%rax) +; AVX512BW-NEXT: 
vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 768(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 704(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 256(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1536(%rax) -; AVX512BW-NEXT: addq $3080, %rsp # imm = 0xC08 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1536(%rax) +; AVX512BW-NEXT: addq $3016, %rsp # imm = 0xBC8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index 35d12ffc3a8d30..9e893532820256 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -151,45 +151,45 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm5 -; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm6 -; SSE-NEXT: movaps (%r8), %xmm2 -; SSE-NEXT: movaps (%r9), %xmm7 -; SSE-NEXT: movaps (%r11), %xmm8 -; SSE-NEXT: movaps (%r10), %xmm9 -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: movaps 
(%rsi), %xmm1 +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm4 +; SSE-NEXT: movaps (%r9), %xmm5 +; SSE-NEXT: movaps (%r11), %xmm6 +; SSE-NEXT: movaps (%r10), %xmm7 +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] +; SSE-NEXT: movaps %xmm6, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm2, 112(%rax) -; SSE-NEXT: movaps %xmm6, 80(%rax) -; SSE-NEXT: movaps %xmm5, 64(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm4, 112(%rax) +; SSE-NEXT: movaps %xmm3, 80(%rax) +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps %xmm9, 32(%rax) ; SSE-NEXT: movaps %xmm12, 48(%rax) ; SSE-NEXT: movaps %xmm13, 16(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride8_vf4: @@ -256,21 +256,21 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm5 ; AVX2-ONLY-NEXT: vmovaps (%r11), %xmm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm7 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm6, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm8 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm8, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4] ; AVX2-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm10 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm9, %ymm10 ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [1,5,1,5] @@ -280,19 +280,19 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm11 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm11 ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [2,6,2,6] ; AVX2-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm11, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm11, %ymm7 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm4, %ymm6 ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [3,7,3,7] ; AVX2-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm5, %ymm2 @@ -302,7 +302,7 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -355,116 +355,114 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: subq $72, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps (%rsi), %xmm0 ; SSE-NEXT: movaps (%rdx), %xmm9 ; SSE-NEXT: movaps (%rcx), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm15 -; SSE-NEXT: movaps 16(%r8), %xmm10 +; SSE-NEXT: movaps (%r8), %xmm11 +; SSE-NEXT: movaps 16(%r8), %xmm6 ; SSE-NEXT: movaps (%r9), %xmm1 -; SSE-NEXT: movaps (%r10), %xmm14 +; SSE-NEXT: movaps (%r10), %xmm13 ; SSE-NEXT: movaps 16(%r10), %xmm12 -; SSE-NEXT: movaps (%rax), %xmm4 +; SSE-NEXT: movaps (%rax), %xmm2 ; SSE-NEXT: movaps 16(%rax), %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm2 -; 
SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] +; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm8 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] -; SSE-NEXT: movaps 16(%r9), %xmm7 -; SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 16(%r9), %xmm4 +; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm6[0] -; 
SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: movaps %xmm12, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps 16(%rdx), %xmm2 +; SSE-NEXT: movaps 16(%rcx), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm14[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm13[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm13[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm3[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm9[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm9[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[2,0] -; SSE-NEXT: movaps 
%xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm9[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 224(%rax) -; SSE-NEXT: movaps %xmm10, 240(%rax) +; SSE-NEXT: movaps %xmm7, 224(%rax) +; SSE-NEXT: movaps %xmm6, 240(%rax) ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm14, 96(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps %xmm5, 48(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) +; SSE-NEXT: movaps %xmm15, 176(%rax) +; SSE-NEXT: movaps %xmm3, 96(%rax) +; SSE-NEXT: movaps %xmm11, 112(%rax) +; SSE-NEXT: movaps %xmm14, 32(%rax) +; SSE-NEXT: movaps %xmm8, 48(%rax) +; SSE-NEXT: movaps %xmm10, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -539,23 +537,23 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm13 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm10[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm9[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] @@ -563,17 +561,17 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm0[1],xmm15[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm9[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] @@ -585,9 +583,9 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) @@ -608,14 +606,14 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm10 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] ; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm11 +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm11 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 @@ -623,28 +621,28 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[6],ymm8[6],ymm11[7],ymm8[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm8 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm4 @@ -821,39 +819,39 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm9 -; 
SSE-NEXT: movaps 16(%rdi), %xmm10 +; SSE-NEXT: movaps 16(%rdi), %xmm13 ; SSE-NEXT: movaps (%rsi), %xmm4 -; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps 16(%rsi), %xmm2 +; SSE-NEXT: movaps (%rdx), %xmm1 ; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps (%r8), %xmm11 +; SSE-NEXT: movaps (%r8), %xmm10 ; SSE-NEXT: movaps (%r9), %xmm7 ; SSE-NEXT: movaps (%r10), %xmm5 ; SSE-NEXT: movaps (%rax), %xmm6 ; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] -; SSE-NEXT: movaps %xmm11, %xmm14 +; SSE-NEXT: movaps %xmm10, %xmm14 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm8[2,0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; SSE-NEXT: movaps %xmm9, %xmm4 @@ -861,32 +859,32 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: movaps %xmm10, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rcx), %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] ; SSE-NEXT: movaps 16(%r10), %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm1[0,2] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] -; SSE-NEXT: 
shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm10, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps 16(%rax), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movaps 16(%r8), %xmm11 +; SSE-NEXT: movaps 16(%r8), %xmm10 ; SSE-NEXT: movaps 16(%r9), %xmm6 -; SSE-NEXT: movaps %xmm11, %xmm9 +; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; SSE-NEXT: movaps %xmm9, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] @@ -895,132 +893,131 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[0,2] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = 
xmm11[2,3],xmm3[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm3 +; SSE-NEXT: movaps 32(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps 32(%rdi), %xmm14 -; SSE-NEXT: movaps 32(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm1 -; SSE-NEXT: movaps 32(%rax), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps 32(%r8), %xmm11 -; SSE-NEXT: movaps 32(%r9), %xmm6 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; SSE-NEXT: movaps %xmm13, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps 32(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r10), %xmm2 +; SSE-NEXT: movaps 32(%rax), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: movaps 32(%r8), %xmm14 +; SSE-NEXT: movaps 32(%r9), %xmm4 +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] -; SSE-NEXT: movaps 48(%rdx), %xmm1 -; SSE-NEXT: movaps 48(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps 48(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm8[2,0] +; SSE-NEXT: movaps %xmm7, 
%xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm8[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm2[0,2] +; SSE-NEXT: movaps 48(%rdx), %xmm4 +; SSE-NEXT: movaps 48(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps 48(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps 48(%r10), %xmm0 -; SSE-NEXT: movaps 48(%rax), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps 48(%r8), %xmm5 +; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%r10), %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps 48(%r8), %xmm0 ; SSE-NEXT: movaps 48(%r9), %xmm9 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[2,0] -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps 
{{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm3[2,0] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm3[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm5, 496(%rax) -; SSE-NEXT: movaps %xmm3, 480(%rax) +; SSE-NEXT: movaps %xmm0, 496(%rax) +; SSE-NEXT: movaps %xmm8, 480(%rax) ; SSE-NEXT: movaps %xmm9, 464(%rax) ; SSE-NEXT: movaps %xmm12, 448(%rax) -; SSE-NEXT: movaps %xmm6, 432(%rax) -; SSE-NEXT: movaps %xmm4, 416(%rax) -; SSE-NEXT: movaps %xmm10, 400(%rax) -; SSE-NEXT: movaps %xmm15, 384(%rax) -; SSE-NEXT: movaps %xmm11, 368(%rax) -; SSE-NEXT: movaps %xmm14, 352(%rax) +; SSE-NEXT: movaps %xmm1, 432(%rax) +; SSE-NEXT: movaps %xmm13, 416(%rax) +; SSE-NEXT: movaps %xmm7, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rax) +; SSE-NEXT: movaps %xmm14, 368(%rax) +; SSE-NEXT: movaps %xmm10, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps %xmm13, 304(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 288(%rax) +; SSE-NEXT: movaps %xmm11, 304(%rax) +; SSE-NEXT: movaps %xmm15, 288(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1068,7 +1065,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 @@ -1076,38 +1073,38 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,0],ymm10[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm10[0,1],ymm1[2,0],ymm10[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[2,0],ymm2[2,3],ymm10[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,0],ymm1[2,3],ymm10[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm2[3,0],ymm0[3,0],ymm2[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1],ymm2[2,0],ymm10[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1],ymm1[2,0],ymm10[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm10 @@ -1115,53 +1112,53 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,0],ymm13[2,3],ymm5[6,4],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm13[2,3],ymm5[6,4],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm13 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,0],ymm0[1,0],ymm2[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm1[2],ymm9[3],ymm1[3],ymm9[6],ymm1[6],ymm9[7],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[4],ymm5[4],ymm12[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] @@ -1169,8 +1166,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm0 @@ -1184,99 +1180,101 @@ 
define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm11[2],mem[2],xmm11[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) @@ -1304,7 +1302,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i32_stride8_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 +; AVX2-ONLY-NEXT: subq $296, %rsp # imm = 0x128 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 @@ -1312,9 +1310,9 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 
+; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm13 @@ -1322,10 +1320,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1344,18 +1342,18 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm10 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm9 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; 
AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 @@ -1363,27 +1361,27 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm3 ; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm6 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[4],ymm3[4],ymm13[5],ymm3[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] @@ -1403,134 +1401,131 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm15 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm11 +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm13 ; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm15, %xmm0 ; AVX2-ONLY-NEXT: vbroadcastss %xmm14, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm1 +; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm1 ; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm6 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm9 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX2-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm8 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm2[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm15[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm11 +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm13[2],mem[2],xmm13[3],mem[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 256(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) @@ -1548,7 +1543,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1598,61 +1593,61 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; 
AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 
%zmm13, %zmm15 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> @@ -1663,8 +1658,8 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm16, 384(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm15, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512F-NEXT: vzeroupper @@ -1716,61 +1711,61 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: 
vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm10, 
%zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> @@ -1781,8 +1776,8 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1814,117 +1809,117 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm9 -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps (%rsi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm13 +; SSE-NEXT: movaps (%rsi), %xmm4 ; SSE-NEXT: movaps 16(%rsi), %xmm1 ; SSE-NEXT: movaps (%rdx), %xmm2 ; SSE-NEXT: movaps 16(%rdx), %xmm0 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps (%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm6 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm10 +; SSE-NEXT: movaps (%r9), %xmm7 ; SSE-NEXT: movaps (%r10), %xmm5 -; SSE-NEXT: movaps (%rax), %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movaps (%rax), %xmm6 +; SSE-NEXT: movaps %xmm3, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm8[2,0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 
%xmm6, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rcx), %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] ; SSE-NEXT: movaps 16(%r10), %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rax), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps 16(%r9), %xmm5 -; SSE-NEXT: movaps %xmm11, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps 16(%r8), %xmm10 +; SSE-NEXT: movaps 16(%r9), %xmm6 +; SSE-NEXT: movaps %xmm10, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] ; SSE-NEXT: 
movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdx), %xmm0 ; SSE-NEXT: movaps 32(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 32(%rdi), %xmm7 -; SSE-NEXT: movaps 32(%rsi), %xmm4 +; SSE-NEXT: movaps 32(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r10), %xmm2 -; SSE-NEXT: movaps 32(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 32(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; 
SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 @@ -1937,26 +1932,26 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm0 @@ -1964,16 +1959,16 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 48(%rdi), %xmm7 -; SSE-NEXT: movaps 48(%rsi), %xmm4 +; SSE-NEXT: movaps 48(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%r10), %xmm2 -; SSE-NEXT: movaps 48(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 48(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 @@ -1986,26 +1981,26 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps 
{{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 @@ -2013,16 +2008,16 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 64(%rdi), %xmm7 -; SSE-NEXT: movaps 64(%rsi), %xmm4 +; SSE-NEXT: movaps 64(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%r10), %xmm2 -; SSE-NEXT: movaps 64(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 64(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 @@ -2035,181 +2030,179 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; 
SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm0 ; SSE-NEXT: movaps 80(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps 80(%rdi), %xmm7 -; SSE-NEXT: movaps 80(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm8 +; SSE-NEXT: movaps 80(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%r10), %xmm2 -; SSE-NEXT: movaps 80(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 80(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 -; SSE-NEXT: movaps 80(%r9), %xmm6 +; SSE-NEXT: movaps 80(%r9), %xmm7 ; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; SSE-NEXT: movaps %xmm10, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm6[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm6[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = 
xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm2 -; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps 96(%rdi), %xmm15 -; SSE-NEXT: movaps 96(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] +; SSE-NEXT: movaps 96(%rdx), %xmm0 +; SSE-NEXT: movaps 96(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 96(%rdi), %xmm13 +; SSE-NEXT: movaps 96(%rsi), %xmm12 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r10), %xmm1 -; SSE-NEXT: movaps 96(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps 96(%r8), %xmm13 -; SSE-NEXT: movaps 96(%r9), %xmm6 -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; SSE-NEXT: movaps %xmm12, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; 
SSE-NEXT: movaps 96(%r10), %xmm2 +; SSE-NEXT: movaps 96(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] +; SSE-NEXT: movaps 96(%r8), %xmm8 +; SSE-NEXT: movaps 96(%r9), %xmm3 +; SSE-NEXT: movaps %xmm8, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] ; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm9[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm9[2,0] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm9[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] ; SSE-NEXT: movaps %xmm13, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm9[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[0,2] -; SSE-NEXT: movaps 112(%rdx), %xmm1 -; SSE-NEXT: movaps 112(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 112(%rsi), %xmm11 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] -; SSE-NEXT: movaps 112(%r10), %xmm0 -; SSE-NEXT: movaps 112(%rax), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps 112(%r8), %xmm4 -; SSE-NEXT: movaps 112(%r9), %xmm9 -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm5[2,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm5[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: 
shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm2[0,2] +; SSE-NEXT: movaps 112(%rdx), %xmm5 +; SSE-NEXT: movaps 112(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 112(%rsi), %xmm12 +; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%r10), %xmm3 +; SSE-NEXT: movaps 112(%rax), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movaps 112(%r8), %xmm0 +; SSE-NEXT: movaps 112(%r9), %xmm11 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm7, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm4[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 1008(%rax) +; SSE-NEXT: movaps %xmm0, 1008(%rax) ; SSE-NEXT: movaps %xmm2, 992(%rax) -; SSE-NEXT: movaps %xmm9, 976(%rax) -; SSE-NEXT: movaps %xmm11, 960(%rax) -; SSE-NEXT: movaps %xmm6, 944(%rax) -; SSE-NEXT: movaps %xmm3, 928(%rax) +; SSE-NEXT: movaps %xmm11, 976(%rax) +; SSE-NEXT: movaps %xmm12, 960(%rax) +; SSE-NEXT: movaps %xmm1, 944(%rax) +; SSE-NEXT: movaps %xmm9, 928(%rax) ; SSE-NEXT: movaps %xmm10, 912(%rax) -; SSE-NEXT: movaps %xmm14, 896(%rax) -; SSE-NEXT: movaps %xmm13, 880(%rax) -; SSE-NEXT: movaps %xmm15, 864(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 896(%rax) +; SSE-NEXT: movaps %xmm8, 880(%rax) +; SSE-NEXT: movaps %xmm13, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps %xmm12, 816(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 800(%rax) +; SSE-NEXT: movaps %xmm15, 816(%rax) +; SSE-NEXT: movaps %xmm14, 800(%rax) ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2320,206 +2313,206 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm10 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 -; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm10 +; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,0],ymm9[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[1,0],ymm8[1,0],ymm11[5,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[2,0],ymm6[2,3],ymm9[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[4],ymm12[4],ymm7[5],ymm12[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[2,3],ymm6[6,4],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
ymm13 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,0],ymm9[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm9 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm8[3,0],ymm11[7,4],ymm8[7,4] -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[6],ymm12[6],ymm7[7],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[2,0],ymm12[2,3],ymm11[6,4],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm9[1,0],ymm10[5,4],ymm9[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] -; 
AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,0],ymm9[3,0],ymm10[7,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm6[3,0],ymm7[7,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,0],ymm8[2,3],ymm7[6,4],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 
$1, %ymm1, %xmm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm5[1,0],ymm7[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm6[2,3],ymm8[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[2,3],ymm4[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,0],ymm8[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; 
AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm5[3,0],ymm7[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,0],ymm6[3,0],ymm7[7,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 
= ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -2532,43 +2525,43 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = 
xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] @@ -2576,153 +2569,153 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm2[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = 
xmm15[0],xmm4[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX1-ONLY-NEXT: 
vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 832(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 832(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 800(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 768(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 608(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 576(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 544(%rax) @@ -2787,156 +2780,156 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm12 ; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[4],ymm7[4],ymm4[5],ymm7[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm6[0],ymm12[0],ymm6[1],ymm12[1],ymm6[4],ymm12[4],ymm6[5],ymm12[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm5 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = 
ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 24(%r10), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7] ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm8 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm10 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm7[0,1],xmm11[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%rax), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 28(%rax), %ymm11 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 56(%r10), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm5[7] -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 56(%r10), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = 
ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[6],ymm8[6],ymm5[7],ymm8[7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%rax), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 60(%rax), %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm7 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm7 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[4],ymm0[4],ymm5[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5] ; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 88(%r10), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 88(%r10), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[6],ymm0[6],ymm5[7],ymm0[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 92(%rax), %ymm0 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vextractf128 
$1, %ymm4, %xmm2 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm8 @@ -2944,28 +2937,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 120(%r10), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 120(%r10), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 124(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] @@ -3086,117 +3079,117 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm3[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r10), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vshufps 
{{.*#+}} xmm15 = xmm6[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm7 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), 
%xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm15 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm2 +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm15 ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm5[0,1,2],xmm12[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm2[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 864(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 832(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 800(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 768(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 608(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 544(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 512(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 832(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 800(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 768(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 608(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 576(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 544(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 512(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3254,159 +3247,172 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: subq $2056, %rsp # imm = 0x808 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm27 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm30 -; 
AVX512F-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm21 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm29 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm19 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 
%zmm4, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm16 ; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512F-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm14, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm30 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = 
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm27 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm31 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm22 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm0 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm22, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm17 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm28 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm4 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm29 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm16 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 @@ -3416,14 +3422,14 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 -; 
AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm23, %zmm3 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 @@ -3431,101 +3437,101 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm23, %zmm1 ; AVX512F-NEXT: movb $-120, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} ; AVX512F-NEXT: movb $34, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 {%k2} ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm15 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm24, 
%zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm11 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm9 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm12 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: 
vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 {%k3} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 768(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, 640(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%rax) @@ -3537,7 +3543,7 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512F-NEXT: vzeroupper @@ -3548,159 +3554,172 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: subq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%r10), 
%zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm29 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 ; AVX512BW-NEXT: vmovdqu64 
%zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm30 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm22, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm28 ; 
AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm29 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm16 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 @@ -3710,14 +3729,14 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm23, %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 @@ -3725,101 +3744,101 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm23, %zmm1 ; AVX512BW-NEXT: movb $-120, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} ; AVX512BW-NEXT: movb $34, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 {%k2} ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: 
vmovdqa64 %zmm16, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) @@ -3831,7 +3850,7 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; 
AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper @@ -3863,117 +3882,117 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm9 -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm3 -; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps (%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps (%r10), %xmm6 -; SSE-NEXT: movaps (%rax), %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0] -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] +; SSE-NEXT: movaps 16(%rdi), %xmm13 +; SSE-NEXT: movaps (%rsi), %xmm4 +; SSE-NEXT: movaps 16(%rsi), %xmm1 +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps 16(%rdx), %xmm0 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm10 +; SSE-NEXT: movaps (%r9), %xmm7 +; SSE-NEXT: movaps (%r10), %xmm5 +; SSE-NEXT: movaps (%rax), %xmm6 +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm6[1,1] +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = 
xmm2[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rcx), %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] -; SSE-NEXT: movaps 16(%r10), %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[0,2] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: movaps 16(%r10), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps %xmm10, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: movaps 16(%r8), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps 16(%r8), %xmm10 ; SSE-NEXT: movaps 16(%r9), %xmm6 -; SSE-NEXT: movaps %xmm11, %xmm9 +; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] -; 
SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm1[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[0,2] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdx), %xmm0 ; SSE-NEXT: movaps 32(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 32(%rdi), %xmm7 -; SSE-NEXT: movaps 32(%rsi), %xmm4 +; SSE-NEXT: movaps 32(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r10), %xmm2 -; SSE-NEXT: movaps 32(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 32(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 @@ -3986,26 +4005,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm0 @@ -4013,16 +4032,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 48(%rdi), %xmm7 -; SSE-NEXT: movaps 48(%rsi), %xmm4 +; SSE-NEXT: movaps 48(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%r10), %xmm2 -; SSE-NEXT: movaps 48(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 48(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 @@ -4035,26 +4054,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: 
movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 @@ -4062,16 +4081,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 64(%rdi), %xmm7 -; SSE-NEXT: movaps 64(%rsi), %xmm4 +; SSE-NEXT: movaps 64(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%r10), %xmm2 -; SSE-NEXT: movaps 64(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 64(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 @@ -4084,26 +4103,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm0 @@ -4111,16 +4130,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 80(%rdi), %xmm7 -; SSE-NEXT: movaps 80(%rsi), %xmm4 +; SSE-NEXT: movaps 80(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%r10), %xmm2 -; SSE-NEXT: movaps 80(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 80(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 ; SSE-NEXT: movaps 80(%r9), %xmm6 @@ -4133,26 +4152,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdx), %xmm0 @@ -4160,16 +4179,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rsi), %xmm4 +; SSE-NEXT: movaps 96(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%r10), %xmm2 -; SSE-NEXT: movaps 96(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 96(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 96(%r8), %xmm11 ; SSE-NEXT: movaps 96(%r9), %xmm6 @@ -4182,26 +4201,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdx), %xmm0 @@ -4209,16 +4228,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 112(%rdi), %xmm7 -; SSE-NEXT: movaps 
112(%rsi), %xmm4 +; SSE-NEXT: movaps 112(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%r10), %xmm2 -; SSE-NEXT: movaps 112(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 112(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 112(%r8), %xmm11 ; SSE-NEXT: movaps 112(%r9), %xmm6 @@ -4231,26 +4250,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdx), %xmm0 @@ -4258,16 +4277,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 128(%rdi), %xmm7 -; SSE-NEXT: movaps 128(%rsi), %xmm4 +; SSE-NEXT: movaps 128(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} 
xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%r10), %xmm2 -; SSE-NEXT: movaps 128(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 128(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 128(%r8), %xmm11 ; SSE-NEXT: movaps 128(%r9), %xmm6 @@ -4280,26 +4299,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdx), %xmm0 @@ -4307,16 +4326,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 144(%rdi), %xmm7 -; SSE-NEXT: movaps 144(%rsi), %xmm4 +; SSE-NEXT: movaps 144(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%r10), %xmm2 -; SSE-NEXT: movaps 144(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 144(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 144(%r8), %xmm11 ; SSE-NEXT: movaps 144(%r9), %xmm6 @@ -4329,26 +4348,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdx), %xmm0 @@ -4356,16 +4375,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps 160(%rsi), %xmm4 +; SSE-NEXT: movaps 160(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%r10), %xmm2 -; SSE-NEXT: movaps 160(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 160(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 160(%r8), %xmm11 ; SSE-NEXT: movaps 160(%r9), %xmm6 @@ -4378,26 +4397,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} 
xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdx), %xmm0 @@ -4405,16 +4424,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 176(%rdi), %xmm7 -; SSE-NEXT: movaps 176(%rsi), %xmm4 +; SSE-NEXT: movaps 176(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%r10), %xmm2 -; SSE-NEXT: movaps 176(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 176(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 176(%r8), %xmm11 ; SSE-NEXT: movaps 176(%r9), %xmm6 @@ -4427,26 +4446,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdx), %xmm0 @@ -4454,16 +4473,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rsi), %xmm4 +; SSE-NEXT: movaps 192(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%r10), %xmm2 -; SSE-NEXT: movaps 192(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 192(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 192(%r8), %xmm11 ; SSE-NEXT: movaps 192(%r9), %xmm6 @@ -4476,26 +4495,26 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdx), %xmm0 @@ -4503,16 +4522,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps 208(%rsi), %xmm4 +; SSE-NEXT: movaps 208(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%r10), %xmm2 -; SSE-NEXT: movaps 208(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps 208(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 208(%r8), %xmm11 ; SSE-NEXT: movaps 208(%r9), %xmm6 @@ -4525,132 +4544,131 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm1 +; SSE-NEXT: movaps 224(%rdx), %xmm4 ; 
SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps 224(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%r10), %xmm2 -; SSE-NEXT: movaps 224(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps 224(%r8), %xmm15 -; SSE-NEXT: movaps 224(%r9), %xmm6 -; SSE-NEXT: movaps %xmm15, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; SSE-NEXT: movaps %xmm13, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[0,2] -; SSE-NEXT: movaps 240(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps 224(%rdi), %xmm14 +; SSE-NEXT: movaps 224(%rsi), %xmm13 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%r10), %xmm1 +; SSE-NEXT: movaps 224(%rax), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] +; SSE-NEXT: movaps 224(%r8), %xmm10 +; SSE-NEXT: movaps 224(%r9), %xmm5 +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm9[2,0] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm9[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; 
SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm1[0,2] +; SSE-NEXT: movaps 240(%rdx), %xmm3 ; SSE-NEXT: movaps 240(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps 240(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; SSE-NEXT: movaps %xmm4, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] -; SSE-NEXT: movaps 240(%r10), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movaps 240(%rdi), %xmm4 +; SSE-NEXT: movaps 240(%rsi), %xmm13 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%r10), %xmm1 ; SSE-NEXT: movaps 240(%rax), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps 240(%r8), %xmm5 -; SSE-NEXT: movaps 240(%r9), %xmm9 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm3[2,0] -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movaps 240(%r8), %xmm0 +; SSE-NEXT: movaps 240(%r9), %xmm11 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm8, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = 
xmm6[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm6[2,0] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm5, 2032(%rax) -; SSE-NEXT: movaps %xmm1, 2016(%rax) -; SSE-NEXT: movaps %xmm9, 2000(%rax) -; SSE-NEXT: movaps %xmm11, 1984(%rax) -; SSE-NEXT: movaps %xmm6, 1968(%rax) -; SSE-NEXT: movaps %xmm4, 1952(%rax) -; SSE-NEXT: movaps %xmm10, 1936(%rax) -; SSE-NEXT: movaps %xmm14, 1920(%rax) -; SSE-NEXT: movaps %xmm15, 1904(%rax) -; SSE-NEXT: movaps %xmm12, 1888(%rax) +; SSE-NEXT: movaps %xmm0, 2032(%rax) +; SSE-NEXT: movaps %xmm4, 2016(%rax) +; SSE-NEXT: movaps %xmm11, 2000(%rax) +; SSE-NEXT: movaps %xmm13, 1984(%rax) +; SSE-NEXT: movaps %xmm2, 1968(%rax) +; SSE-NEXT: movaps %xmm5, 1952(%rax) +; SSE-NEXT: movaps %xmm9, 1936(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1920(%rax) +; SSE-NEXT: movaps %xmm10, 1904(%rax) +; SSE-NEXT: movaps %xmm14, 1888(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1872(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1856(%rax) -; SSE-NEXT: movaps %xmm13, 1840(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1824(%rax) +; SSE-NEXT: movaps %xmm12, 1840(%rax) +; SSE-NEXT: movaps %xmm15, 1824(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1808(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4889,206 +4907,206 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm11 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm10 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 +; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = 
ymm10[0,1],ymm8[2,0],ymm10[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm9[1,0],ymm7[1,0],ymm9[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[4],ymm12[4],ymm7[5],ymm12[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[2,3],ymm6[6,4],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1],ymm8[2,0],ymm10[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] +; AVX1-ONLY-NEXT: 
vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm10 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm9[2,0],ymm12[2,3],ymm9[6,4],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm9 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[6],ymm12[6],ymm7[7],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[2,0],ymm12[2,3],ymm11[6,4],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = 
ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm8[0],ymm2[1],ymm8[1],ymm2[4],ymm8[4],ymm2[5],ymm8[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm8[2],ymm2[3],ymm8[3],ymm2[6],ymm8[6],ymm2[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm6[3,0],ymm7[7,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,0],ymm8[2,3],ymm7[6,4],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[2,3],ymm4[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = 
ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,0],ymm6[3,0],ymm7[7,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -5097,50 +5115,50 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%r10), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 128(%r10), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rax), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -5149,50 +5167,50 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%r10), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = 
ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 160(%r10), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = 
ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -5201,50 +5219,50 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 192(%r10), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 192(%r10), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rax), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = 
ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -5253,50 +5271,50 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%r10), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 224(%r10), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -5309,43 +5327,43 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = 
xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] @@ -5353,353 +5371,353 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 
= xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = 
xmm7[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 128(%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = 
xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 160(%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm2[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = 
xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 192(%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 192(%r10), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), 
%xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = 
xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 224(%r10), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 224(%r10), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] 
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1856(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 1856(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 1824(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 1792(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 1632(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 1600(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 1568(%rax) @@ -5828,156 +5846,156 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm11 -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm12 +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[4],ymm7[4],ymm4[5],ymm7[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] -; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm8[0],ymm15[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm6[0],ymm12[0],ymm6[1],ymm12[1],ymm6[4],ymm12[4],ymm6[5],ymm12[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm5 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm10 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm9 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm8 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm7[0,1],xmm11[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm12[2],ymm5[3],ymm12[3],ymm5[6],ymm12[6],ymm5[7],ymm12[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm11 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: 
vmovaps 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm5[7] -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[6],ymm8[6],ymm5[7],ymm8[7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm7 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm7 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[4],ymm0[4],ymm5[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 88(%rax), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 88(%rax), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[6],ymm0[6],ymm5[7],ymm0[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 ; 
AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 92(%r10), %ymm0 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm2 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 96(%r10), %ymm8 @@ -5985,28 +6003,28 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: 
vbroadcastss 116(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 120(%rax), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 120(%rax), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 124(%r10), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] @@ -6539,8 +6557,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm6 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -6552,104 +6570,104 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 ; AVX2-ONLY-NEXT: vmovaps 192(%r10), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm6[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm7 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm3 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm4 +; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 224(%r10), %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm15 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm15 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm5[0,1,2],xmm12[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm2[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1888(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1856(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1824(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 1792(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1632(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1600(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1568(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1536(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1856(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1824(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 1792(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1632(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1600(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1568(%rax) +; AVX2-ONLY-NEXT: 
vmovaps %ymm10, 1536(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1376(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6771,256 +6789,257 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: subq $6216, %rsp # imm = 0x1848 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm0 -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm30 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm12 +; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm30, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm1 ; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm26 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; 
AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm29 ; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm29 ; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm22 ; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm27 +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm25 ; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-NEXT: vpermi2d 
%zmm0, %zmm25, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 @@ -7036,7 +7055,7 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 
%zmm6, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 @@ -7060,7 +7079,7 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 @@ -7069,23 +7088,23 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm16 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> @@ -7104,9 +7123,9 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 @@ -7132,12 +7151,12 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm30 -; 
AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm29 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 @@ -7150,11 +7169,11 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm20 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm19 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm18 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm15 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 @@ -7162,7 +7181,7 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm11 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm10 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm8 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 @@ -7229,60 +7248,60 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm5 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm31 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm31 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm30 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm28 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm27 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm28 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm25, 
%zmm12 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7323,8 +7342,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm17 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k2} @@ -7357,8 +7376,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7372,20 +7391,20 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 {%k3} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm3, 1920(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 1792(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm10, 1664(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm11, 1600(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm14, 1536(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1408(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm18, 1344(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm19, 1280(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm20, 1216(%rax) @@ -7393,12 +7412,12 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm23, 1088(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm24, 1024(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm28, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm29, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm30, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 768(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm31, 704(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm30, 576(%rax) ; 
AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%rax) @@ -7425,256 +7444,257 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: subq $6216, %rsp # imm = 0x1848 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; 
AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm26 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vpermt2d 
%zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm29 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512BW-NEXT: 
vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm29 ; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm22 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm27 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm25 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 @@ -7690,7 +7710,7 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 @@ -7714,7 +7734,7 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 @@ -7723,23 +7743,23 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> @@ -7758,9 +7778,9 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 @@ -7786,12 +7806,12 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 @@ -7804,11 +7824,11 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm15 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 @@ -7816,7 +7836,7 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm11 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm8 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 @@ -7883,60 +7903,60 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k3} 
+; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm28 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 
%zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm28 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7977,8 +7997,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k2} @@ -8011,8 +8031,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -8026,20 +8046,20 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 1920(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1792(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 1664(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 1600(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 1536(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1408(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1344(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1280(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1216(%rax) @@ -8047,12 +8067,12 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512BW-NEXT: vmovdqa64 %zmm23, 1088(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1024(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 576(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll index 57e030ff2a1cee..27fe5d0d47ba23 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll @@ -228,58 +228,58 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps 96(%rdi), %xmm6 ; SSE-NEXT: movaps 80(%rdi), %xmm4 ; SSE-NEXT: movaps 64(%rdi), %xmm3 -; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm5 -; SSE-NEXT: movaps 96(%rsi), %xmm11 -; SSE-NEXT: movaps 80(%rsi), %xmm12 -; SSE-NEXT: movaps 64(%rsi), %xmm13 +; SSE-NEXT: movaps 96(%rsi), %xmm10 +; SSE-NEXT: movaps 80(%rsi), %xmm11 +; SSE-NEXT: movaps 64(%rsi), %xmm12 ; SSE-NEXT: movaps (%rsi), %xmm9 -; SSE-NEXT: movaps 16(%rsi), %xmm10 +; SSE-NEXT: movaps 16(%rsi), %xmm13 ; SSE-NEXT: movaps 32(%rsi), %xmm14 ; SSE-NEXT: movaps 48(%rsi), %xmm15 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] ; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm13[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] ; SSE-NEXT: movaps %xmm5, %xmm14 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] ; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] -; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm12[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] +; SSE-NEXT: movaps %xmm4, %xmm12 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0] -; SSE-NEXT: movaps 112(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm0, %xmm7 -; 
SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] +; SSE-NEXT: movaps %xmm6, %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm10[0] +; SSE-NEXT: movaps 112(%rsi), %xmm10 +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] ; SSE-NEXT: movaps %xmm0, 224(%rdx) -; SSE-NEXT: movaps %xmm7, 240(%rdx) +; SSE-NEXT: movaps %xmm8, 240(%rdx) ; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm12, 208(%rdx) +; SSE-NEXT: movaps %xmm11, 208(%rdx) ; SSE-NEXT: movaps %xmm4, 160(%rdx) -; SSE-NEXT: movaps %xmm13, 176(%rdx) +; SSE-NEXT: movaps %xmm12, 176(%rdx) ; SSE-NEXT: movaps %xmm3, 128(%rdx) ; SSE-NEXT: movaps %xmm15, 144(%rdx) ; SSE-NEXT: movaps %xmm5, 96(%rdx) ; SSE-NEXT: movaps %xmm14, 112(%rdx) ; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm10, 80(%rdx) +; SSE-NEXT: movaps %xmm13, 80(%rdx) ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: movaps %xmm7, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: retq @@ -405,7 +405,7 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-LABEL: store_i64_stride2_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 112(%rdi), %xmm14 +; SSE-NEXT: movaps 112(%rdi), %xmm15 ; SSE-NEXT: movaps 96(%rdi), %xmm13 ; SSE-NEXT: movaps 80(%rdi), %xmm11 ; SSE-NEXT: movaps 64(%rdi), %xmm10 @@ -420,9 +420,9 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm5 ; SSE-NEXT: movaps 48(%rsi), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm7 @@ -440,27 +440,27 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps 
{{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm15 ; SSE-NEXT: movaps 128(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm15, %xmm1 @@ -775,7 +775,7 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-LABEL: store_i64_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps 112(%rdi), %xmm14 +; SSE-NEXT: movaps 112(%rdi), %xmm15 ; SSE-NEXT: movaps 96(%rdi), %xmm13 ; SSE-NEXT: movaps 80(%rdi), %xmm11 ; SSE-NEXT: movaps 64(%rdi), %xmm10 @@ -790,47 +790,47 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm5 ; SSE-NEXT: movaps 48(%rsi), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm5[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: movaps 128(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -936,29 +936,29 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm15 +; SSE-NEXT: movaps 368(%rdi), %xmm14 ; SSE-NEXT: movaps 368(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 384(%rdi), %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 384(%rdi), %xmm15 ; SSE-NEXT: movaps 384(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 400(%rdi), %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 400(%rdi), %xmm13 ; SSE-NEXT: movaps 400(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 416(%rdi), %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 416(%rdi), %xmm11 ; SSE-NEXT: movaps 416(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 
432(%rdi), %xmm8 ; SSE-NEXT: movaps 432(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm8, %xmm10 @@ -994,15 +994,15 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm9, 896(%rdx) ; SSE-NEXT: movaps %xmm8, 880(%rdx) ; SSE-NEXT: movaps %xmm10, 864(%rdx) -; SSE-NEXT: movaps %xmm12, 848(%rdx) -; SSE-NEXT: movaps %xmm14, 832(%rdx) -; SSE-NEXT: movaps %xmm11, 816(%rdx) +; SSE-NEXT: movaps %xmm11, 848(%rdx) +; SSE-NEXT: movaps %xmm12, 832(%rdx) +; SSE-NEXT: movaps %xmm13, 816(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rdx) -; SSE-NEXT: movaps %xmm13, 784(%rdx) +; SSE-NEXT: movaps %xmm15, 784(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rdx) -; SSE-NEXT: movaps %xmm15, 752(%rdx) +; SSE-NEXT: movaps %xmm14, 752(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 736(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1306,72 +1306,72 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm13 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm11[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm11[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm12[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm12[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm13[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm8[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm13[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm8[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = 
ymm9[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm3[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll index 55c8cfb9b49566..3098f93974f98a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -179,46 +179,46 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm7 -; SSE-NEXT: movaps 16(%rsi), %xmm8 -; SSE-NEXT: movaps 32(%rsi), %xmm9 -; SSE-NEXT: movaps 48(%rsi), %xmm10 -; SSE-NEXT: movaps (%rdx), %xmm11 -; SSE-NEXT: movaps 16(%rdx), %xmm12 -; SSE-NEXT: movaps 32(%rdx), %xmm6 -; SSE-NEXT: movaps 48(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} 
xmm12 = xmm12[0,1],xmm2[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0] -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 32(%rcx) +; SSE-NEXT: movaps (%rsi), %xmm4 +; SSE-NEXT: movaps 16(%rsi), %xmm5 +; SSE-NEXT: movaps 32(%rsi), %xmm6 +; SSE-NEXT: movaps 48(%rsi), %xmm7 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps 16(%rdx), %xmm9 +; SSE-NEXT: movaps 32(%rdx), %xmm10 +; SSE-NEXT: movaps 48(%rdx), %xmm11 +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm8, 16(%rcx) +; SSE-NEXT: movaps %xmm5, 32(%rcx) ; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps %xmm9, 80(%rcx) -; SSE-NEXT: movaps %xmm1, 96(%rcx) -; SSE-NEXT: movaps %xmm6, 112(%rcx) -; SSE-NEXT: movaps %xmm10, 128(%rcx) +; SSE-NEXT: movaps %xmm9, 64(%rcx) +; SSE-NEXT: movaps %xmm6, 80(%rcx) +; SSE-NEXT: movaps %xmm3, 96(%rcx) +; SSE-NEXT: movaps %xmm10, 112(%rcx) +; SSE-NEXT: movaps %xmm7, 128(%rcx) ; SSE-NEXT: movaps %xmm0, 144(%rcx) -; SSE-NEXT: movaps %xmm5, 160(%rcx) -; SSE-NEXT: movaps %xmm4, 176(%rcx) +; SSE-NEXT: movaps %xmm11, 160(%rcx) +; SSE-NEXT: movaps %xmm12, 176(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride3_vf8: @@ -341,39 +341,40 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp ; SSE-NEXT: movapd 64(%rdi), %xmm5 -; SSE-NEXT: movapd (%rdi), %xmm1 -; SSE-NEXT: movapd 16(%rdi), %xmm2 +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm1 ; SSE-NEXT: movapd 32(%rdi), %xmm3 ; SSE-NEXT: movapd 48(%rdi), %xmm6 ; SSE-NEXT: movapd 64(%rsi), %xmm9 -; SSE-NEXT: movapd (%rsi), %xmm4 -; SSE-NEXT: movapd 16(%rsi), %xmm7 -; SSE-NEXT: movapd 32(%rsi), %xmm11 +; SSE-NEXT: movapd (%rsi), %xmm2 +; SSE-NEXT: movapd 16(%rsi), %xmm4 +; SSE-NEXT: movapd 32(%rsi), %xmm8 ; SSE-NEXT: movapd 48(%rsi), %xmm10 ; SSE-NEXT: movapd 64(%rdx), %xmm15 -; SSE-NEXT: movapd (%rdx), %xmm0 +; SSE-NEXT: movapd (%rdx), %xmm11 ; SSE-NEXT: movapd 16(%rdx), %xmm12 ; SSE-NEXT: movapd 32(%rdx), %xmm13 ; SSE-NEXT: movapd 48(%rdx), %xmm14 -; SSE-NEXT: movapd %xmm1, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] -; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} 
xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: movapd %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm4[0] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm3, %xmm12 -; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm6, %xmm13 ; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm10[0] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm14[0],xmm6[1] @@ -385,16 +386,16 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] ; SSE-NEXT: movapd 80(%rdi), %xmm15 -; SSE-NEXT: movapd 80(%rsi), %xmm7 +; SSE-NEXT: movapd 80(%rsi), %xmm6 ; SSE-NEXT: movapd %xmm15, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm7[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm6[0] ; SSE-NEXT: movapd 80(%rdx), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movapd 96(%rdi), %xmm4 ; SSE-NEXT: movapd 96(%rsi), %xmm3 -; SSE-NEXT: movapd %xmm4, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movapd %xmm4, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm3[0] ; SSE-NEXT: movapd 96(%rdx), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] @@ -410,8 +411,8 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm1, 336(%rcx) ; SSE-NEXT: movapd %xmm3, 320(%rcx) ; SSE-NEXT: movapd %xmm4, 304(%rcx) -; SSE-NEXT: movapd %xmm6, 288(%rcx) -; SSE-NEXT: movapd %xmm7, 272(%rcx) +; SSE-NEXT: movapd %xmm7, 288(%rcx) +; SSE-NEXT: movapd %xmm6, 272(%rcx) ; SSE-NEXT: movapd %xmm15, 256(%rcx) ; SSE-NEXT: movapd %xmm8, 240(%rcx) ; SSE-NEXT: movapd %xmm9, 224(%rcx) @@ -422,7 +423,8 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rcx) ; SSE-NEXT: movapd %xmm13, 144(%rcx) -; SSE-NEXT: movapd %xmm11, 128(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movapd %xmm12, 96(%rcx) @@ -430,8 +432,7 @@ define void 
@store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movapd %xmm11, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -443,9 +444,9 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride3_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm7 ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm6 ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm9 ; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm4 @@ -454,16 +455,16 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm11, %ymm11 @@ -475,7 +476,7 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm5[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm6[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm4[2,3] @@ -489,15 +490,15 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3] ; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0,0,3,2] +; AVX1-ONLY-NEXT: 
vshufpd {{.*#+}} ymm0 = ymm7[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2],ymm5[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] @@ -509,12 +510,12 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd %ymm1, 320(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm13, 160(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 224(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 224(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm12, 256(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper @@ -522,77 +523,77 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride3_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm11[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm12[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm5[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm13[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm12[0,1],ymm7[2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 288(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm3[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 288(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -649,56 +650,56 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i64_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movapd 64(%rdi), %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm10 ; SSE-NEXT: movapd (%rdi), %xmm5 ; SSE-NEXT: movapd 16(%rdi), %xmm6 -; SSE-NEXT: movapd 32(%rdi), %xmm7 -; SSE-NEXT: movapd 48(%rdi), %xmm9 -; SSE-NEXT: movapd 64(%rsi), %xmm13 -; SSE-NEXT: movapd (%rsi), %xmm10 -; SSE-NEXT: movapd 16(%rsi), %xmm11 -; SSE-NEXT: movapd 32(%rsi), %xmm12 -; SSE-NEXT: movapd 48(%rsi), %xmm14 +; SSE-NEXT: movapd 32(%rdi), %xmm8 +; SSE-NEXT: movapd 48(%rdi), %xmm11 +; SSE-NEXT: movapd 64(%rsi), %xmm14 +; SSE-NEXT: movapd (%rsi), %xmm7 +; SSE-NEXT: movapd 16(%rsi), %xmm9 +; SSE-NEXT: movapd 32(%rsi), %xmm13 +; SSE-NEXT: movapd 48(%rsi), %xmm15 ; SSE-NEXT: movapd 64(%rdx), %xmm0 ; SSE-NEXT: movapd (%rdx), %xmm1 ; SSE-NEXT: movapd 16(%rdx), 
%xmm2 ; SSE-NEXT: movapd 32(%rdx), %xmm3 ; SSE-NEXT: movapd 48(%rdx), %xmm4 -; SSE-NEXT: movapd %xmm5, %xmm15 -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm10[0] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm5, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm7[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm6, %xmm5 -; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm11[0] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm7, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm12[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm9, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSE-NEXT: movapd %xmm6, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm8, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm13[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, %xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm13[0] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm3[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm11, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm10, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; 
SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 80(%rdi), %xmm1 ; SSE-NEXT: movapd 80(%rsi), %xmm2 ; SSE-NEXT: movapd %xmm1, %xmm0 @@ -766,24 +767,24 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd 176(%rdx), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movapd 192(%rdi), %xmm9 -; SSE-NEXT: movapd 192(%rsi), %xmm8 -; SSE-NEXT: movapd %xmm9, %xmm14 -; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm8[0] +; SSE-NEXT: movapd 192(%rdi), %xmm10 +; SSE-NEXT: movapd 192(%rsi), %xmm9 +; SSE-NEXT: movapd %xmm10, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm9[0] ; SSE-NEXT: movapd 192(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movapd 208(%rdi), %xmm10 -; SSE-NEXT: movapd 208(%rsi), %xmm7 -; SSE-NEXT: movapd %xmm10, %xmm11 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] -; SSE-NEXT: movapd 208(%rdx), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movapd 208(%rdi), %xmm8 +; SSE-NEXT: movapd 208(%rsi), %xmm6 +; SSE-NEXT: movapd %xmm8, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movapd 208(%rdx), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movapd 224(%rdi), %xmm4 ; SSE-NEXT: movapd 224(%rsi), %xmm3 -; SSE-NEXT: movapd %xmm4, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movapd %xmm4, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm3[0] ; SSE-NEXT: movapd 224(%rdx), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] @@ -799,12 +800,12 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm1, 720(%rcx) ; SSE-NEXT: movapd %xmm3, 704(%rcx) ; SSE-NEXT: movapd %xmm4, 688(%rcx) -; SSE-NEXT: movapd %xmm6, 672(%rcx) -; SSE-NEXT: movapd %xmm7, 656(%rcx) -; SSE-NEXT: movapd %xmm10, 640(%rcx) +; SSE-NEXT: movapd %xmm7, 672(%rcx) +; SSE-NEXT: movapd %xmm6, 656(%rcx) +; SSE-NEXT: movapd %xmm8, 640(%rcx) ; SSE-NEXT: movapd %xmm11, 624(%rcx) -; SSE-NEXT: movapd %xmm8, 608(%rcx) -; SSE-NEXT: movapd %xmm9, 592(%rcx) +; SSE-NEXT: movapd %xmm9, 608(%rcx) +; SSE-NEXT: movapd %xmm10, 592(%rcx) ; SSE-NEXT: movapd %xmm14, 576(%rcx) ; SSE-NEXT: movapd %xmm12, 560(%rcx) ; SSE-NEXT: movapd %xmm13, 544(%rcx) @@ -881,13 +882,13 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i64_stride3_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm6 ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm11 -; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm13 -; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: 
vmovapd 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 @@ -939,29 +940,29 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm7[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm10 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm4 ; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm9 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 128(%rdx), %ymm3 @@ -983,17 +984,17 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm4[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm5[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2],ymm8[3] ; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm15 ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] ; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2],ymm14[3] @@ -1001,27 +1002,27 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2],ymm14[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2],ymm10[3] ; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2],ymm9[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2],ymm6[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm12, 736(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 736(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 704(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm8, 640(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 608(%rcx) @@ -1033,13 +1034,13 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd %ymm3, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 320(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 320(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm11, 
32(%rcx) @@ -1066,96 +1067,96 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride3_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $168, %rsp -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm5[2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm8[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,1,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,1,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] @@ -1165,21 +1166,21 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 192(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] @@ -1204,10 +1205,10 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm3, 544(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 512(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 480(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 448(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1234,10 +1235,10 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride3_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 @@ -1247,51 +1248,51 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,8,u,1,9,u,2,10> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = <5,u,14,6,u,15,7,u> ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 +; AVX512-NEXT: vpermt2q %zmm3, %zmm15, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 ; 
AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm7 +; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm7 ; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm11 ; AVX512-NEXT: vpermt2q %zmm10, %zmm20, %zmm11 -; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 -; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm2 +; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm6 ; AVX512-NEXT: vpermt2q %zmm9, %zmm17, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm10 ; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm10 -; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm2 -; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm15 +; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm15 ; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm15 -; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm0 +; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 384(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, 512(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 576(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm3, 576(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm19, 640(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm13, (%rcx) @@ -1312,56 +1313,56 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i64_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movapd 64(%rdi), %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm10 ; SSE-NEXT: movapd (%rdi), %xmm5 ; SSE-NEXT: movapd 16(%rdi), %xmm6 -; SSE-NEXT: movapd 32(%rdi), %xmm7 -; SSE-NEXT: movapd 48(%rdi), %xmm9 -; SSE-NEXT: movapd 64(%rsi), %xmm13 -; SSE-NEXT: movapd (%rsi), %xmm10 -; SSE-NEXT: movapd 16(%rsi), %xmm11 -; SSE-NEXT: movapd 32(%rsi), %xmm12 -; SSE-NEXT: movapd 48(%rsi), %xmm14 +; SSE-NEXT: movapd 32(%rdi), %xmm8 +; SSE-NEXT: movapd 48(%rdi), %xmm11 +; SSE-NEXT: movapd 64(%rsi), %xmm14 +; SSE-NEXT: movapd (%rsi), %xmm7 +; SSE-NEXT: movapd 16(%rsi), %xmm9 +; SSE-NEXT: movapd 32(%rsi), %xmm13 +; SSE-NEXT: movapd 48(%rsi), %xmm15 ; SSE-NEXT: movapd 64(%rdx), %xmm0 ; SSE-NEXT: movapd (%rdx), %xmm1 ; SSE-NEXT: movapd 16(%rdx), %xmm2 ; SSE-NEXT: movapd 32(%rdx), %xmm3 ; SSE-NEXT: movapd 48(%rdx), %xmm4 -; SSE-NEXT: movapd %xmm5, %xmm15 -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm10[0] -; SSE-NEXT: movapd %xmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm5, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm7[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm6, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm9[0] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm7, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm9, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm14[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm8, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm13[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, %xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm13[0] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm3[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm11, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm10, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 80(%rdi), %xmm1 ; SSE-NEXT: movapd 80(%rsi), %xmm2 ; SSE-NEXT: movapd %xmm1, %xmm0 @@ -1589,24 +1590,24 @@ define void @store_i64_stride3_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd 432(%rdx), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movapd 448(%rdi), %xmm9 -; SSE-NEXT: movapd 448(%rsi), %xmm8 -; SSE-NEXT: movapd %xmm9, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0] +; SSE-NEXT: movapd 448(%rdi), %xmm11 +; SSE-NEXT: movapd 448(%rsi), %xmm9 +; SSE-NEXT: movapd %xmm11, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm9[0] ; SSE-NEXT: movapd 448(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movapd 464(%rdi), %xmm10 -; SSE-NEXT: movapd 464(%rsi), %xmm7 -; SSE-NEXT: movapd %xmm10, %xmm11 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movapd 464(%rdi), %xmm8 +; SSE-NEXT: movapd 464(%rsi), %xmm6 +; SSE-NEXT: movapd %xmm8, %xmm10 +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm6[0] ; SSE-NEXT: movapd 464(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movapd 480(%rdi), %xmm4 ; SSE-NEXT: movapd 480(%rsi), %xmm3 -; SSE-NEXT: movapd %xmm4, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movapd %xmm4, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm3[0] ; SSE-NEXT: movapd 480(%rdx), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] @@ -1622,12 +1623,12 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm1, 1488(%rcx) ; SSE-NEXT: movapd %xmm3, 1472(%rcx) ; SSE-NEXT: movapd %xmm4, 1456(%rcx) -; SSE-NEXT: movapd %xmm6, 1440(%rcx) -; SSE-NEXT: movapd %xmm7, 1424(%rcx) -; SSE-NEXT: movapd %xmm10, 1408(%rcx) -; SSE-NEXT: movapd %xmm11, 1392(%rcx) -; SSE-NEXT: movapd %xmm8, 1376(%rcx) -; SSE-NEXT: movapd %xmm9, 1360(%rcx) +; SSE-NEXT: movapd %xmm7, 1440(%rcx) +; SSE-NEXT: movapd %xmm6, 1424(%rcx) +; SSE-NEXT: movapd %xmm8, 1408(%rcx) +; SSE-NEXT: movapd %xmm10, 1392(%rcx) +; SSE-NEXT: movapd %xmm9, 1376(%rcx) +; SSE-NEXT: movapd %xmm11, 1360(%rcx) ; SSE-NEXT: movapd %xmm13, 1344(%rcx) ; SSE-NEXT: movapd %xmm12, 1328(%rcx) ; SSE-NEXT: movapd %xmm14, 1312(%rcx) @@ -1965,31 +1966,31 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm12 ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX1-ONLY-NEXT: vmovapd 256(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 256(%rdx), %ymm10 ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3] ; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 288(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 288(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 320(%rdx), %ymm7 @@ -2010,35 +2011,35 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 384(%rdx), %ymm4 ; AVX1-ONLY-NEXT: vmovapd 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm13[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm8[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm9 +; 
AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm4[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm5[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2084,15 +2085,15 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] @@ -2101,20 +2102,20 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],mem[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2],ymm10[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 1504(%rcx) +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = 
ymm13[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm13[0],ymm4[1],ymm13[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 1504(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 1472(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1408(%rcx) @@ -2124,7 +2125,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd %ymm3, 1280(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1216(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 1184(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 1184(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1120(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm6, 1088(%rcx) @@ -2133,13 +2134,13 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd %ymm7, 992(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 928(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 896(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 896(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 832(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 800(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 800(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 736(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 704(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 704(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 640(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm14, 608(%rcx) @@ -2207,76 +2208,76 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: subq $936, %rsp # imm = 0x3A8 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm4 ; 
AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm12[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm12[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm5[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm9[1],ymm5[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm13[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm13[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm10[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[2,1,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] @@ -2408,91 +2409,91 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 384(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 416(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 448(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 448(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm14 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm1[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm15[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm13 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm13, 1504(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1472(%rcx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = 
ymm14[0,1],ymm12[2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm12, 1504(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1472(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1408(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 1376(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 1376(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 1344(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 1312(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1280(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1248(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1216(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1184(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1152(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1120(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1088(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1056(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 1280(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1248(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1216(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1184(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1152(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1120(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1088(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1056(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1024(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2565,122 +2566,122 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm9 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm10 ; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm8 ; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm7 -; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm4 +; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm12 -; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm18 -; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm26 -; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm28 +; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm17 +; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm22 +; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm23 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm15 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm25 -; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,1,9,u,2,10> -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] -; AVX512-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm10 -; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] -; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm22 = <5,u,14,6,u,15,7,u> -; AVX512-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] -; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm13 -; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 -; AVX512-NEXT: vpermt2q %zmm25, %zmm19, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm15 -; AVX512-NEXT: vpermt2q %zmm1, %zmm17, %zmm15 -; AVX512-NEXT: vpermt2q %zmm25, %zmm21, %zmm15 -; AVX512-NEXT: vpermt2q %zmm20, %zmm22, %zmm1 -; AVX512-NEXT: vpermt2q %zmm25, %zmm23, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm20 -; AVX512-NEXT: vpermt2q %zmm29, %zmm19, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm25 -; AVX512-NEXT: vpermt2q %zmm2, %zmm17, %zmm25 -; AVX512-NEXT: vpermt2q %zmm29, %zmm21, %zmm25 -; AVX512-NEXT: vpermt2q %zmm27, %zmm22, %zmm2 -; AVX512-NEXT: vpermt2q %zmm29, %zmm23, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm27 -; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm27 -; AVX512-NEXT: vpermt2q %zmm31, %zmm19, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512-NEXT: vpermt2q %zmm6, %zmm17, %zmm29 -; AVX512-NEXT: vpermt2q %zmm31, %zmm21, %zmm29 -; AVX512-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512-NEXT: vpermt2q %zmm31, %zmm23, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm30 -; AVX512-NEXT: vpermt2q %zmm28, %zmm19, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm31 -; AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm31 -; AVX512-NEXT: vpermt2q %zmm28, %zmm21, %zmm31 -; AVX512-NEXT: vpermt2q %zmm24, %zmm22, %zmm4 -; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm24 -; AVX512-NEXT: vpermt2q %zmm26, %zmm19, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm28 -; AVX512-NEXT: vpermt2q %zmm7, %zmm17, %zmm28 -; AVX512-NEXT: vpermt2q %zmm26, %zmm21, %zmm28 -; AVX512-NEXT: vpermt2q %zmm16, %zmm22, %zmm7 -; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm16 -; AVX512-NEXT: vpermt2q %zmm18, %zmm19, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm26 -; AVX512-NEXT: vpermt2q %zmm18, %zmm21, %zmm26 -; AVX512-NEXT: vpermt2q %zmm11, %zmm22, %zmm8 -; AVX512-NEXT: vpermt2q %zmm18, %zmm23, %zmm8 -; AVX512-NEXT: vpermi2q %zmm9, %zmm5, %zmm14 -; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm14 -; AVX512-NEXT: vpermi2q %zmm9, %zmm5, %zmm17 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm17 -; AVX512-NEXT: vpermt2q %zmm5, %zmm22, %zmm9 -; AVX512-NEXT: vpermt2q %zmm12, %zmm23, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm9, 1472(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm17, 1408(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm14, 1344(%rcx) +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm25 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm26 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,8,u,1,9,u,2,10> +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,8,3,4,9,6,7] +; AVX512-NEXT: vpermt2q %zmm15, %zmm27, %zmm4 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512-NEXT: vpermt2q %zmm0, %zmm28, %zmm9 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm9 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = <5,u,14,6,u,15,7,u> +; AVX512-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm14 +; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm14 +; AVX512-NEXT: vpermt2q %zmm24, %zmm27, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm15 +; AVX512-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 +; AVX512-NEXT: vpermt2q %zmm24, %zmm29, %zmm15 +; AVX512-NEXT: vpermt2q %zmm19, %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512-NEXT: vpermt2q %zmm25, %zmm27, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm24 +; AVX512-NEXT: vpermt2q %zmm25, %zmm29, %zmm24 +; AVX512-NEXT: vpermt2q %zmm20, %zmm30, %zmm2 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 +; AVX512-NEXT: vpermt2q %zmm26, %zmm27, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512-NEXT: vpermt2q %zmm6, %zmm28, %zmm25 +; AVX512-NEXT: vpermt2q %zmm26, %zmm29, %zmm25 +; AVX512-NEXT: vpermt2q %zmm21, %zmm30, %zmm6 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512-NEXT: vpermt2q %zmm5, %zmm16, %zmm21 +; AVX512-NEXT: vpermt2q %zmm23, %zmm27, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512-NEXT: vpermt2q %zmm5, %zmm28, %zmm26 +; AVX512-NEXT: vpermt2q %zmm23, %zmm29, %zmm26 +; AVX512-NEXT: vpermt2q %zmm18, %zmm30, %zmm5 +; AVX512-NEXT: vpermt2q %zmm23, %zmm31, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512-NEXT: vpermt2q %zmm7, %zmm16, %zmm18 +; AVX512-NEXT: vpermt2q %zmm22, %zmm27, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512-NEXT: vpermt2q %zmm7, %zmm28, %zmm23 +; AVX512-NEXT: vpermt2q %zmm22, %zmm29, %zmm23 +; AVX512-NEXT: vpermt2q %zmm13, %zmm30, %zmm7 +; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512-NEXT: vpermt2q %zmm8, %zmm16, %zmm13 +; AVX512-NEXT: vpermt2q %zmm17, %zmm27, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm22 +; AVX512-NEXT: vpermt2q %zmm17, %zmm29, %zmm22 +; AVX512-NEXT: vpermt2q %zmm11, %zmm30, %zmm8 +; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm8 +; AVX512-NEXT: vpermi2q %zmm10, %zmm3, %zmm16 +; AVX512-NEXT: vpermt2q %zmm12, %zmm27, %zmm16 +; AVX512-NEXT: vpermi2q %zmm10, %zmm3, %zmm28 +; AVX512-NEXT: vpermt2q %zmm12, %zmm29, %zmm28 +; AVX512-NEXT: vpermt2q %zmm3, %zmm30, %zmm10 +; AVX512-NEXT: vpermt2q %zmm12, %zmm31, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm10, 1472(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm28, 1408(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 1344(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm8, 1280(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm26, 1216(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm16, 1152(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm22, 1216(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm13, 1152(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, 1088(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm28, 1024(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm24, 960(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm4, 896(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm31, 832(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm30, 768(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm23, 1024(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm18, 960(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm5, 896(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm26, 832(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm21, 768(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm6, 704(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm29, 640(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm27, 
576(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm25, 640(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm20, 576(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm2, 512(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm20, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm24, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, 384(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm15, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm13, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm14, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm4, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll index c0200ee6f6ebcf..645fede6baef27 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll @@ -216,58 +216,58 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm14 +; SSE-NEXT: movaps (%rsi), %xmm9 +; SSE-NEXT: movaps 16(%rsi), %xmm10 ; SSE-NEXT: movaps 32(%rsi), %xmm11 ; SSE-NEXT: movaps (%rdx), %xmm2 ; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps 32(%rdx), %xmm7 -; SSE-NEXT: movaps 48(%rdx), %xmm9 -; SSE-NEXT: movaps (%rcx), %xmm8 +; SSE-NEXT: movaps 32(%rdx), %xmm8 +; SSE-NEXT: movaps 48(%rdx), %xmm12 +; SSE-NEXT: movaps (%rcx), %xmm7 ; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps 32(%rcx), %xmm15 -; SSE-NEXT: movaps 48(%rcx), %xmm12 +; SSE-NEXT: movaps 32(%rcx), %xmm14 +; SSE-NEXT: movaps 48(%rcx), %xmm15 ; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] -; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm11[0] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm14[1] +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] ; SSE-NEXT: unpckhpd {{.*#+}} 
xmm3 = xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: movaps 48(%rsi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm15[1] +; SSE-NEXT: movaps 48(%rsi), %xmm15 ; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movaps %xmm9, 240(%r8) +; SSE-NEXT: movaps %xmm12, 240(%r8) ; SSE-NEXT: movaps %xmm6, 192(%r8) ; SSE-NEXT: movaps %xmm11, 208(%r8) ; SSE-NEXT: movaps %xmm3, 160(%r8) -; SSE-NEXT: movaps %xmm7, 176(%r8) -; SSE-NEXT: movaps %xmm15, 128(%r8) -; SSE-NEXT: movaps %xmm14, 144(%r8) +; SSE-NEXT: movaps %xmm8, 176(%r8) +; SSE-NEXT: movaps %xmm14, 128(%r8) +; SSE-NEXT: movaps %xmm10, 144(%r8) ; SSE-NEXT: movaps %xmm1, 96(%r8) ; SSE-NEXT: movaps %xmm4, 112(%r8) ; SSE-NEXT: movaps %xmm13, 64(%r8) -; SSE-NEXT: movaps %xmm10, 80(%r8) +; SSE-NEXT: movaps %xmm9, 80(%r8) ; SSE-NEXT: movaps %xmm5, 32(%r8) ; SSE-NEXT: movaps %xmm2, 48(%r8) -; SSE-NEXT: movaps %xmm8, (%r8) +; SSE-NEXT: movaps %xmm7, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: retq @@ -457,66 +457,66 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps (%rsi), %xmm3 -; SSE-NEXT: movaps 16(%rsi), %xmm2 -; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps 48(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm11 +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps 48(%rdi), %xmm13 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rsi), %xmm1 +; SSE-NEXT: movaps 32(%rsi), %xmm2 +; SSE-NEXT: movaps 48(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm9 ; SSE-NEXT: movaps 16(%rdx), %xmm12 ; SSE-NEXT: movaps 32(%rdx), %xmm14 ; SSE-NEXT: movaps 48(%rdx), %xmm15 ; SSE-NEXT: movaps (%rcx), %xmm4 ; SSE-NEXT: movaps 16(%rcx), %xmm5 ; SSE-NEXT: movaps 32(%rcx), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: movaps %xmm9, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rdi), %xmm13 ; SSE-NEXT: movaps 64(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 64(%rdx), %xmm12 ; SSE-NEXT: movaps 64(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm12, %xmm15 @@ -524,48 +524,48 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdi), %xmm11 ; SSE-NEXT: movaps 80(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: movaps %xmm11, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdx), %xmm7 -; SSE-NEXT: movaps 80(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: 
movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdi), %xmm8 +; SSE-NEXT: movaps 80(%rdx), %xmm8 +; SSE-NEXT: movaps 80(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm6 ; SSE-NEXT: movaps 96(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movaps %xmm6, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdx), %xmm5 -; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 96(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps 112(%rdi), %xmm1 ; SSE-NEXT: movaps 112(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movaps 112(%rdx), %xmm3 -; SSE-NEXT: movaps 112(%rcx), %xmm1 +; SSE-NEXT: movaps 112(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm3, 496(%r8) -; SSE-NEXT: movaps %xmm2, 480(%r8) +; SSE-NEXT: movaps %xmm1, 480(%r8) ; SSE-NEXT: movaps %xmm0, 464(%r8) -; SSE-NEXT: movaps %xmm4, 448(%r8) +; SSE-NEXT: movaps %xmm2, 448(%r8) ; SSE-NEXT: movaps %xmm5, 432(%r8) -; SSE-NEXT: movaps %xmm8, 416(%r8) -; SSE-NEXT: movaps %xmm6, 400(%r8) +; SSE-NEXT: movaps %xmm6, 416(%r8) +; SSE-NEXT: movaps %xmm7, 400(%r8) ; SSE-NEXT: movaps %xmm9, 384(%r8) -; SSE-NEXT: movaps %xmm7, 368(%r8) +; SSE-NEXT: movaps %xmm8, 368(%r8) ; SSE-NEXT: movaps %xmm11, 352(%r8) ; SSE-NEXT: movaps %xmm10, 336(%r8) -; SSE-NEXT: movaps %xmm13, 320(%r8) +; SSE-NEXT: movaps %xmm14, 320(%r8) ; SSE-NEXT: movaps %xmm12, 304(%r8) -; SSE-NEXT: movaps %xmm14, 288(%r8) +; SSE-NEXT: movaps %xmm13, 288(%r8) ; SSE-NEXT: movaps %xmm15, 272(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r8) @@ -659,35 +659,35 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] ; AVX1-ONLY-NEXT: 
vmovaps 32(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm13[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm15[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm15[0],xmm14[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm14[0],xmm12[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm15[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm15[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm14[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm15[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm14[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r8) @@ -696,10 +696,10 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm7, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 176(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm11, 304(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm10, 288(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm9, 272(%r8) @@ -728,86 +728,86 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride4_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm11 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rsi), 
%ymm12 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm14 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm10, %ymm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm10, %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm4, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm6, %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm6, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm15[2,3],ymm10[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] 
-; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm14[0],ymm9[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm13[1],ymm5[3],ymm13[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 
= ymm13[1],ymm9[1],ymm13[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovaps %ymm3, 480(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -959,60 +959,60 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $664, %rsp # imm = 0x298 ; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps (%rsi), %xmm3 -; SSE-NEXT: movaps 16(%rsi), %xmm2 -; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps 48(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm11 +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps 48(%rdi), %xmm13 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rsi), %xmm1 +; SSE-NEXT: movaps 32(%rsi), %xmm2 +; SSE-NEXT: movaps 48(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm9 ; SSE-NEXT: movaps 16(%rdx), %xmm12 -; SSE-NEXT: movaps 32(%rdx), %xmm13 -; SSE-NEXT: movaps 48(%rdx), %xmm14 +; SSE-NEXT: movaps 32(%rdx), %xmm14 +; SSE-NEXT: movaps 48(%rdx), %xmm15 ; SSE-NEXT: movaps (%rcx), %xmm4 ; SSE-NEXT: movaps 16(%rcx), %xmm5 ; SSE-NEXT: movaps 32(%rcx), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: movaps %xmm9, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movaps %xmm12, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm2 ; SSE-NEXT: movaps 64(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm1 @@ -1124,62 +1124,62 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 192(%rdi), %xmm12 +; SSE-NEXT: movaps 192(%rdi), %xmm13 ; SSE-NEXT: movaps 192(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 192(%rdx), %xmm15 ; SSE-NEXT: movaps 192(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdi), %xmm10 -; SSE-NEXT: movaps 208(%rsi), %xmm0 -; 
SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdx), %xmm7 +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 208(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps 208(%rdx), %xmm9 ; SSE-NEXT: movaps 208(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm6 ; SSE-NEXT: movaps 224(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdx), %xmm4 -; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdx), %xmm5 +; SSE-NEXT: movaps 224(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps 240(%rdi), %xmm1 ; SSE-NEXT: movaps 240(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movaps 240(%rdx), %xmm3 -; SSE-NEXT: movaps 240(%rcx), %xmm1 +; SSE-NEXT: movaps 240(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm3, 1008(%r8) -; SSE-NEXT: movaps %xmm2, 992(%r8) +; SSE-NEXT: movaps %xmm1, 992(%r8) ; SSE-NEXT: movaps %xmm0, 976(%r8) -; SSE-NEXT: movaps %xmm5, 960(%r8) -; SSE-NEXT: movaps %xmm4, 944(%r8) -; SSE-NEXT: movaps %xmm8, 928(%r8) -; SSE-NEXT: movaps %xmm6, 912(%r8) -; SSE-NEXT: movaps %xmm9, 896(%r8) -; SSE-NEXT: movaps %xmm7, 880(%r8) -; SSE-NEXT: movaps %xmm10, 864(%r8) -; SSE-NEXT: movaps %xmm11, 848(%r8) -; SSE-NEXT: movaps %xmm13, 832(%r8) +; SSE-NEXT: movaps %xmm2, 960(%r8) +; SSE-NEXT: movaps %xmm5, 944(%r8) +; SSE-NEXT: movaps %xmm6, 928(%r8) +; SSE-NEXT: movaps %xmm7, 912(%r8) +; SSE-NEXT: movaps %xmm8, 896(%r8) +; SSE-NEXT: movaps %xmm9, 880(%r8) +; SSE-NEXT: movaps %xmm11, 864(%r8) +; SSE-NEXT: movaps %xmm10, 848(%r8) +; SSE-NEXT: movaps %xmm12, 832(%r8) ; SSE-NEXT: movaps %xmm15, 816(%r8) -; SSE-NEXT: movaps %xmm12, 800(%r8) +; SSE-NEXT: movaps %xmm13, 800(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1760,109 +1760,109 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; 
AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm23 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm12 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm10 ; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm25 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm12 ; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm19 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm19 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm21 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,3,11,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm13, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,3,11,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm8, %zmm4 ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,8,u,u,1,9,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,7,15,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512F-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <4,12,u,u,5,13,u,u> -; AVX512F-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm14, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm22, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,1,9,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm14, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm23, %zmm24 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <6,14,u,u,7,15,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm25, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512F-NEXT: vpermt2q %zmm19, %zmm24, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <4,12,u,u,5,13,u,u> +; AVX512F-NEXT: vpermt2q %zmm15, 
%zmm19, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm13, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm16, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm11, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm18, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1} -; AVX512F-NEXT: vpermt2q %zmm26, %zmm20, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm14, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm22, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm27 {%k1} +; AVX512F-NEXT: vpermt2q %zmm20, %zmm24, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm13, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm22, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm14, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm23, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm25, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2q %zmm21, %zmm24, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} +; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm22 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm14 {%k1} +; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm25 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm16, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm11, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm18, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512F-NEXT: vpermt2q %zmm19, %zmm20, %zmm13 -; AVX512F-NEXT: vpermt2q 
%zmm12, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1} -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1} -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 832(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 960(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 768(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 832(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm20, 704(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm18, 512(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 576(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 256(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1873,109 +1873,109 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm12 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm19 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm21 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,3,11,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,3,11,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, 
%zmm4 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm8, %zmm4 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,8,u,u,1,9,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,7,15,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <4,12,u,u,5,13,u,u> -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,1,9,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <6,14,u,u,7,15,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm24, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <4,12,u,u,5,13,u,u> +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm16, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm18, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm14, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 {%k1} +; 
AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm27 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm13, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm23, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm24, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm20, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 960(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 768(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 832(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 704(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 512(%r8) +; AVX512BW-NEXT: vmovdqa64 
%zmm16, 576(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 256(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1997,60 +1997,60 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $1688, %rsp # imm = 0x698 ; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps (%rsi), %xmm3 -; SSE-NEXT: movaps 16(%rsi), %xmm2 -; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps 48(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm11 +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps 48(%rdi), %xmm13 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rsi), %xmm1 +; SSE-NEXT: movaps 32(%rsi), %xmm2 +; SSE-NEXT: movaps 48(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm9 ; SSE-NEXT: movaps 16(%rdx), %xmm12 -; SSE-NEXT: movaps 32(%rdx), %xmm13 -; SSE-NEXT: movaps 48(%rdx), %xmm14 +; SSE-NEXT: movaps 32(%rdx), %xmm14 +; SSE-NEXT: movaps 48(%rdx), %xmm15 ; SSE-NEXT: movaps (%rcx), %xmm4 ; SSE-NEXT: movaps 16(%rcx), %xmm5 ; SSE-NEXT: movaps 32(%rcx), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm2 ; SSE-NEXT: movaps 64(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm1 @@ -2386,62 +2386,62 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 448(%rdi), %xmm12 +; SSE-NEXT: movaps 448(%rdi), %xmm11 ; SSE-NEXT: movaps 448(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 448(%rdx), %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 448(%rdx), %xmm14 ; SSE-NEXT: movaps 448(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 464(%rdi), %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdi), %xmm12 ; SSE-NEXT: movaps 464(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 464(%rdx), %xmm7 -; SSE-NEXT: movaps 464(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: 
movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 480(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdx), %xmm8 +; SSE-NEXT: movaps 464(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 480(%rdi), %xmm6 ; SSE-NEXT: movaps 480(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movaps %xmm6, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 480(%rdx), %xmm5 -; SSE-NEXT: movaps 480(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movaps 480(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps 496(%rdi), %xmm1 ; SSE-NEXT: movaps 496(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movaps 496(%rdx), %xmm3 -; SSE-NEXT: movaps 496(%rcx), %xmm1 +; SSE-NEXT: movaps 496(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm3, 2032(%r8) -; SSE-NEXT: movaps %xmm2, 2016(%r8) +; SSE-NEXT: movaps %xmm1, 2016(%r8) ; SSE-NEXT: movaps %xmm0, 2000(%r8) -; SSE-NEXT: movaps %xmm4, 1984(%r8) +; SSE-NEXT: movaps %xmm2, 1984(%r8) ; SSE-NEXT: movaps %xmm5, 1968(%r8) -; SSE-NEXT: movaps %xmm8, 1952(%r8) -; SSE-NEXT: movaps %xmm6, 1936(%r8) +; SSE-NEXT: movaps %xmm6, 1952(%r8) +; SSE-NEXT: movaps %xmm7, 1936(%r8) ; SSE-NEXT: movaps %xmm9, 1920(%r8) -; SSE-NEXT: movaps %xmm7, 1904(%r8) -; SSE-NEXT: movaps %xmm13, 1888(%r8) +; SSE-NEXT: movaps %xmm8, 1904(%r8) +; SSE-NEXT: movaps %xmm12, 1888(%r8) ; SSE-NEXT: movaps %xmm10, 1872(%r8) -; SSE-NEXT: movaps %xmm14, 1856(%r8) -; SSE-NEXT: movaps %xmm11, 1840(%r8) -; SSE-NEXT: movaps %xmm12, 1824(%r8) +; SSE-NEXT: movaps %xmm13, 1856(%r8) +; SSE-NEXT: movaps %xmm14, 1840(%r8) +; SSE-NEXT: movaps %xmm11, 1824(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1808(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2676,14 +2676,14 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1688, %rsp # imm = 0x698 ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 
160(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm10 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm11 @@ -2707,55 +2707,55 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm9[1],ymm5[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm10[1],xmm8[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps 144(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3659,237 +3659,245 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-LABEL: store_i64_stride4_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm8 +; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm9 +; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm10 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm14 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm23 -; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm3 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm12 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm11 +; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm18, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm16, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm18, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm16, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm18, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm16, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 -; 
AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm26, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm28 +; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm26 ; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm28, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm28, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm28, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm26, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm26, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm26, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm26 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm28 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,10,u,u,3,11,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,8,u,u,1,9,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <2,10,u,u,3,11,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <4,12,u,u,5,13,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <6,14,u,u,7,15,u,u> -; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <6,14,u,u,7,15,u,u> +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm28 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm27 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm20 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm18 +; AVX512F-NEXT: 
vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm27 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm25 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm18 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm22, %zmm19 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm25 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm21 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, 
%zmm23, %zmm3 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm9 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 
{%k1} +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -3897,56 +3905,56 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} 
+; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm1, 1984(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm2, 1920(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 1856(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1856(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 1792(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm3, 1664(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm6, 1664(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm8, 1600(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm10, 1536(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm6, 1472(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 1472(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm11, 1408(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm12, 1344(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm13, 1280(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 1216(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1216(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm14, 1152(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1088(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1024(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm19, 960(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 896(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, 832(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 768(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, 704(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm24, 640(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 512(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm20, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm30, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1088(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm19, 1024(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm21, 960(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm20, 896(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, 832(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 768(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 704(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 640(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 576(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm30, 512(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 448(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%r8) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm31, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, 128(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%r8) ; AVX512F-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3954,237 +3962,245 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-LABEL: store_i64_stride4_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm10 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), 
%zmm10 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm28 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm26, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm26, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm26, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm26 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,10,u,u,3,11,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,8,u,u,1,9,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <2,10,u,u,3,11,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <4,12,u,u,5,13,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm22 = <6,14,u,u,7,15,u,u> -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <6,14,u,u,7,15,u,u> +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm28 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm20 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm27 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm18 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm19 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm25 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm21 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; 
AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm9 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm31, (%rsp) # 
64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -4192,56 +4208,56 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, 1984(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 1920(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1856(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1856(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1792(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1664(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1664(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 1600(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 1536(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1472(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1472(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 1408(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 1344(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 1280(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1216(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1216(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 1152(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1088(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 960(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 832(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 768(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 704(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 640(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 512(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 1024(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 960(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 896(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 832(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 768(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 704(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 640(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 576(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 512(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 320(%r8) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 
256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 128(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4263,8 +4279,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} -; AVX512-FAST: {{.*}} -; AVX512-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index fcc1958a46ed49..1db697a1cc6d67 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -109,38 +109,38 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm4 -; SSE-NEXT: movaps 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm7 -; SSE-NEXT: movaps 16(%rcx), %xmm8 -; SSE-NEXT: movaps (%r8), %xmm9 -; SSE-NEXT: movaps 16(%r8), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps 16(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm6 +; SSE-NEXT: movaps 16(%rcx), %xmm7 +; SSE-NEXT: movaps (%r8), %xmm8 +; SSE-NEXT: movaps 16(%r8), %xmm9 +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: movaps %xmm5, 16(%r9) -; SSE-NEXT: movaps %xmm9, 32(%r9) -; SSE-NEXT: movaps %xmm6, 48(%r9) -; SSE-NEXT: movaps %xmm7, 64(%r9) -; SSE-NEXT: movaps %xmm0, 80(%r9) -; SSE-NEXT: movaps %xmm1, 96(%r9) -; SSE-NEXT: movaps %xmm3, 112(%r9) +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = 
xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm0, (%r9) +; SSE-NEXT: movaps %xmm4, 16(%r9) +; SSE-NEXT: movaps %xmm8, 32(%r9) +; SSE-NEXT: movaps %xmm3, 48(%r9) +; SSE-NEXT: movaps %xmm6, 64(%r9) +; SSE-NEXT: movaps %xmm1, 80(%r9) +; SSE-NEXT: movaps %xmm5, 96(%r9) +; SSE-NEXT: movaps %xmm9, 112(%r9) ; SSE-NEXT: movaps %xmm10, 128(%r9) -; SSE-NEXT: movaps %xmm8, 144(%r9) +; SSE-NEXT: movaps %xmm7, 144(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf4: @@ -255,16 +255,16 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i64_stride5_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movapd (%rdi), %xmm0 -; SSE-NEXT: movapd 16(%rdi), %xmm2 -; SSE-NEXT: movapd 32(%rdi), %xmm5 +; SSE-NEXT: movapd 16(%rdi), %xmm3 +; SSE-NEXT: movapd 32(%rdi), %xmm7 ; SSE-NEXT: movapd (%rsi), %xmm1 -; SSE-NEXT: movapd 16(%rsi), %xmm9 -; SSE-NEXT: movapd 32(%rsi), %xmm8 -; SSE-NEXT: movapd (%rdx), %xmm3 +; SSE-NEXT: movapd 16(%rsi), %xmm8 +; SSE-NEXT: movapd 32(%rsi), %xmm9 +; SSE-NEXT: movapd (%rdx), %xmm2 ; SSE-NEXT: movapd 16(%rdx), %xmm6 -; SSE-NEXT: movapd 32(%rdx), %xmm10 -; SSE-NEXT: movapd (%rcx), %xmm7 -; SSE-NEXT: movapd 16(%rcx), %xmm11 +; SSE-NEXT: movapd 32(%rdx), %xmm11 +; SSE-NEXT: movapd (%rcx), %xmm5 +; SSE-NEXT: movapd 16(%rcx), %xmm10 ; SSE-NEXT: movapd 32(%rcx), %xmm12 ; SSE-NEXT: movapd (%r8), %xmm13 ; SSE-NEXT: movapd 16(%r8), %xmm14 @@ -274,23 +274,23 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] -; SSE-NEXT: movapd %xmm2, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm14[0],xmm2[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm6[1] -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm14[1] -; SSE-NEXT: movapd %xmm5, %xmm14 -; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm8[0] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm15[0],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] +; SSE-NEXT: movapd %xmm3, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1] +; SSE-NEXT: movapd %xmm7, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm9[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm15[0],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm15[1] ; SSE-NEXT: movapd 48(%rdi), %xmm15 ; SSE-NEXT: movapd 48(%rsi), %xmm2 @@ -309,17 +309,17 @@ 
define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movapd %xmm1, 256(%r9) ; SSE-NEXT: movapd %xmm3, 240(%r9) ; SSE-NEXT: movapd %xmm12, 224(%r9) -; SSE-NEXT: movapd %xmm8, 208(%r9) -; SSE-NEXT: movapd %xmm5, 192(%r9) -; SSE-NEXT: movapd %xmm10, 176(%r9) +; SSE-NEXT: movapd %xmm9, 208(%r9) +; SSE-NEXT: movapd %xmm7, 192(%r9) +; SSE-NEXT: movapd %xmm11, 176(%r9) ; SSE-NEXT: movapd %xmm14, 160(%r9) -; SSE-NEXT: movapd %xmm11, 144(%r9) -; SSE-NEXT: movapd %xmm9, 128(%r9) +; SSE-NEXT: movapd %xmm10, 144(%r9) +; SSE-NEXT: movapd %xmm8, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movapd %xmm6, 96(%r9) ; SSE-NEXT: movapd %xmm13, 80(%r9) -; SSE-NEXT: movapd %xmm7, 64(%r9) +; SSE-NEXT: movapd %xmm5, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -332,48 +332,48 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm6 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm5[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm5[2],ymm8[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[2],ymm9[3] 
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm6[2],ymm9[3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm5[0],ymm9[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm6[0],ymm8[1,2,3] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm3[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm10[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 @@ -385,13 +385,13 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %xmm10, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 192(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 256(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 192(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 256(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 288(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 288(%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -404,35 +404,35 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm9 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 ; 
AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1],ymm9[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm11, %ymm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1],ymm11[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm2[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 @@ -450,10 +450,10 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 256(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r9) @@ -465,119 +465,119 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512F-LABEL: store_i64_stride5_vf8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; 
AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512F-NEXT: movb $8, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: movb $-116, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm9, %zmm8 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 
(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride5_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: movb $8, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; 
AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -600,51 +600,51 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $280, %rsp # imm = 0x118 ; SSE-NEXT: movapd (%rdi), %xmm3 -; SSE-NEXT: movapd 16(%rdi), %xmm4 -; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd (%rsi), %xmm5 -; SSE-NEXT: movapd 16(%rsi), %xmm8 -; SSE-NEXT: movapd 32(%rsi), %xmm10 -; SSE-NEXT: movapd (%rdx), %xmm7 +; SSE-NEXT: movapd 16(%rdi), %xmm6 +; SSE-NEXT: movapd 32(%rdi), %xmm10 +; SSE-NEXT: movapd (%rsi), %xmm4 +; SSE-NEXT: movapd 16(%rsi), %xmm7 +; SSE-NEXT: movapd 32(%rsi), %xmm12 +; SSE-NEXT: movapd (%rdx), %xmm5 ; SSE-NEXT: movapd 16(%rdx), %xmm9 -; SSE-NEXT: movapd 32(%rdx), %xmm12 -; SSE-NEXT: movapd (%rcx), %xmm11 +; SSE-NEXT: movapd 32(%rdx), %xmm14 +; SSE-NEXT: movapd (%rcx), %xmm8 ; SSE-NEXT: movapd 16(%rcx), %xmm13 ; SSE-NEXT: movapd 32(%rcx), %xmm15 ; SSE-NEXT: movapd (%r8), %xmm0 ; SSE-NEXT: movapd 16(%r8), %xmm1 ; SSE-NEXT: movapd 32(%r8), %xmm2 -; SSE-NEXT: movapd %xmm3, %xmm14 -; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm5[0] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm3, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm4[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm11[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm4, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm6, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: movapd 
%xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm13[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm6, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: movapd %xmm10, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm12[0] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 48(%rdi), %xmm1 @@ -685,17 +685,17 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 80(%r8), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd 80(%rdx), %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: movapd 80(%rdx), %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm11[1] ; SSE-NEXT: movapd 80(%rcx), %xmm9 -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movapd 96(%rdi), %xmm11 +; SSE-NEXT: movapd 96(%rdi), %xmm10 ; SSE-NEXT: movapd 96(%rsi), %xmm8 -; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: movapd %xmm10, %xmm12 ; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] ; SSE-NEXT: movapd 96(%r8), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] ; SSE-NEXT: movapd 96(%rdx), %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] ; SSE-NEXT: movapd 96(%rcx), %xmm3 @@ -719,13 +719,13 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm5, 560(%r9) ; SSE-NEXT: movapd %xmm3, 544(%r9) ; SSE-NEXT: movapd %xmm8, 528(%r9) -; SSE-NEXT: movapd %xmm11, 512(%r9) +; SSE-NEXT: movapd %xmm10, 512(%r9) ; SSE-NEXT: movapd %xmm6, 496(%r9) ; SSE-NEXT: movapd %xmm12, 480(%r9) ; SSE-NEXT: movapd %xmm9, 464(%r9) ; SSE-NEXT: movapd %xmm13, 448(%r9) ; SSE-NEXT: movapd %xmm14, 432(%r9) -; SSE-NEXT: movapd %xmm10, 416(%r9) +; SSE-NEXT: movapd %xmm11, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%r9) ; SSE-NEXT: movapd %xmm15, 384(%r9) @@ -783,136 +783,136 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $216, %rsp -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), 
%ymm11 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm2[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0],ymm13[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm8[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm8[2],ymm5[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm2 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm11[0],ymm14[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0,1],ymm11[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm5[0],ymm1[0],ymm5[2],ymm1[3] ; 
AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm0 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0],ymm9[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, (%r9) +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm14, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, (%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 480(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 320(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 576(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 512(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 
320(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 576(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 512(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 352(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 256(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -921,11 +921,11 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 608(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 544(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 608(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 544(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -938,160 +938,162 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],mem[0],ymm6[2],mem[2] -; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm5, %ymm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm5 +; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm15[0],mem[0],ymm15[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],mem[0],ymm15[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm8 ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm9[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm6[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm15 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: 
# ymm1 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 576(%r9) +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm6 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm6[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm14 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm14[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm6, 576(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 512(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 384(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 352(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 256(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 256(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%r9) ; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 608(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 608(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%r9) -; AVX2-ONLY-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1100,97 +1102,97 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm10 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm13 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm7, %zmm4 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} ; AVX512F-NEXT: movb $8, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 {%k2} -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 ; AVX512F-NEXT: movb $-116, %al -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} 
zmm14 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 -; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm15 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm16, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm18, %zmm19 +; AVX512F-NEXT: movb $24, %al +; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm19 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} +; AVX512F-NEXT: vpermt2q %zmm8, %zmm22, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k3} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512F-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm25 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm26, %zmm27 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k3} -; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm13, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 {%k3} +; AVX512F-NEXT: vpermt2q %zmm6, %zmm17, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 {%k2} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 {%k3} +; AVX512F-NEXT: vpermt2q %zmm6, %zmm21, %zmm22 +; AVX512F-NEXT: vpermt2q 
%zmm3, %zmm24, %zmm5 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 256(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm27, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 448(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm23, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, (%r9) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1199,97 +1201,97 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm7, %zmm4 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} ; AVX512BW-NEXT: movb $8, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k2} -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 ; AVX512BW-NEXT: movb $-116, %al -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 -; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm15 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm19 +; AVX512BW-NEXT: movb $24, %al +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm19 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k3} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm25 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 
{%k3} +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 448(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -1312,53 +1314,53 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $920, %rsp # imm = 0x398 ; SSE-NEXT: movapd (%rdi), %xmm3 -; SSE-NEXT: movapd 16(%rdi), %xmm4 -; SSE-NEXT: movapd 32(%rdi), %xmm5 -; SSE-NEXT: movapd (%rsi), %xmm6 +; SSE-NEXT: movapd 16(%rdi), %xmm6 +; SSE-NEXT: movapd 32(%rdi), %xmm10 +; SSE-NEXT: movapd (%rsi), %xmm4 ; SSE-NEXT: movapd 16(%rsi), %xmm7 -; SSE-NEXT: movapd 32(%rsi), %xmm8 -; SSE-NEXT: movapd (%rdx), %xmm9 -; SSE-NEXT: movapd 16(%rdx), %xmm10 -; SSE-NEXT: movapd 32(%rdx), %xmm11 -; SSE-NEXT: movapd (%rcx), %xmm12 +; SSE-NEXT: movapd 32(%rsi), %xmm12 +; SSE-NEXT: movapd (%rdx), %xmm5 +; SSE-NEXT: movapd 16(%rdx), %xmm9 +; SSE-NEXT: movapd 32(%rdx), %xmm14 +; SSE-NEXT: movapd (%rcx), %xmm8 ; SSE-NEXT: movapd 16(%rcx), %xmm13 -; SSE-NEXT: movapd 32(%rcx), %xmm14 +; SSE-NEXT: movapd 32(%rcx), %xmm15 ; SSE-NEXT: movapd (%r8), %xmm0 ; SSE-NEXT: movapd 16(%r8), %xmm1 ; SSE-NEXT: movapd 32(%r8), %xmm2 -; SSE-NEXT: movapd %xmm3, %xmm15 -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm6[0] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm3, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm4[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm12[0] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm4, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; 
SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm6, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm13[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm13[0] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm5, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm14[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: movapd %xmm10, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm12[0] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm15[0] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 48(%rdi), %xmm1 ; SSE-NEXT: movapd 48(%rsi), %xmm2 ; SSE-NEXT: movapd %xmm1, %xmm0 @@ -1525,17 +1527,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 208(%r8), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd 208(%rdx), %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: movapd 208(%rdx), %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm11[1] ; SSE-NEXT: movapd 208(%rcx), %xmm9 -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movapd 224(%rdi), %xmm11 +; SSE-NEXT: movapd 224(%rdi), %xmm10 ; SSE-NEXT: movapd 224(%rsi), %xmm8 -; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: movapd %xmm10, %xmm12 ; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] ; SSE-NEXT: movapd 224(%r8), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] ; SSE-NEXT: movapd 224(%rdx), %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] ; SSE-NEXT: movapd 224(%rcx), %xmm3 @@ -1559,13 +1561,13 @@ define void 
@store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm5, 1200(%r9) ; SSE-NEXT: movapd %xmm3, 1184(%r9) ; SSE-NEXT: movapd %xmm8, 1168(%r9) -; SSE-NEXT: movapd %xmm11, 1152(%r9) +; SSE-NEXT: movapd %xmm10, 1152(%r9) ; SSE-NEXT: movapd %xmm6, 1136(%r9) ; SSE-NEXT: movapd %xmm12, 1120(%r9) ; SSE-NEXT: movapd %xmm9, 1104(%r9) ; SSE-NEXT: movapd %xmm13, 1088(%r9) ; SSE-NEXT: movapd %xmm14, 1072(%r9) -; SSE-NEXT: movapd %xmm10, 1056(%r9) +; SSE-NEXT: movapd %xmm11, 1056(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1040(%r9) ; SSE-NEXT: movapd %xmm15, 1024(%r9) @@ -1703,16 +1705,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1048, %rsp # imm = 0x418 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] @@ -1721,7 +1722,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] @@ -1733,13 +1734,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -1752,258 +1755,256 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm5 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; 
AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm2[0],ymm8[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm15[1],xmm8[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3] -; 
AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm15[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, (%rsp), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm8[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm8[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm2[2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm5[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm5[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 
48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm5[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm15[0],ymm5[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0],ymm11[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 
= xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0],ymm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm5[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 
240(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0],ymm12[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0],ymm8[1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm7[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; 
AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, (%r9) +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, (%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 976(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 960(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 960(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1136(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 1120(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 816(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 800(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 1120(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 816(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 800(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 480(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 320(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 656(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 656(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 640(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2018,7 +2019,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) @@ -2026,7 +2027,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r9) @@ -2074,61 +2075,61 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1128, %rsp # imm = 0x468 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), 
%ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX2-ONLY-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 @@ -2139,11 +2140,12 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],mem[0],ymm10[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 @@ -2154,14 +2156,13 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 @@ -2190,14 +2191,13 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],mem[0],ymm10[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 @@ -2208,19 +2208,20 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm1 @@ -2241,93 +2242,95 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm15[1],ymm2[1],ymm15[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; 
AVX2-ONLY-NEXT: # ymm10 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = 
ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm9[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload ; 
AVX2-ONLY-NEXT: # ymm13 = ymm2[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm3 @@ -2335,49 +2338,49 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm3 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm6 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm6[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: 
vbroadcastsd 240(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm8[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1184(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 1152(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1120(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1056(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1056(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 1024(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 992(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 992(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 896(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 864(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 896(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 864(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 832(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 736(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 736(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 704(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 672(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2399,7 +2402,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r9) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r9) @@ -2414,424 +2417,426 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 1088(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1248(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1248(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r9) ; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-ONLY-NEXT: addq $1128, %rsp # imm = 0x468 +; AVX2-ONLY-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: store_i64_stride5_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm22 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm23 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm25 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm24, 
%zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm28, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm6 +; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm15 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm31 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm11 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm29 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, 
%zmm14, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = +; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm24, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm17, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm1, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm1, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm31 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm22, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm30, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm28 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm31, %zmm1, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm31, %zmm29 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm31, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm30, %zmm12 +; 
AVX512F-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm1, %zmm22 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm20, %zmm30 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm20, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm20 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 {%k1} ; AVX512F-NEXT: movb $-116, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm20 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm23 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm28 {%k3} ; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 ; AVX512F-NEXT: movb $8, %al ; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,1,2,3,4,13,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: 
vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k3} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 {%k3} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 {%k2} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm28 {%k2} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 1152(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm12 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm27 {%k3} +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm9 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; 
AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm27, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, 640(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 1216(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm22 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u> 
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <3,u,u,u,12,4,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm1, %zmm7 +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm29 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm22, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, 
%zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm20, %zmm30 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm20, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm20 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 {%k1} ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm28 {%k3} ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 ; AVX512BW-NEXT: movb $8, %al ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] -; 
AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm28 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 
%zmm20, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1152(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm27 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1216(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -2854,53 +2859,53 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $2200, %rsp # imm = 0x898 ; SSE-NEXT: movapd (%rdi), %xmm3 -; SSE-NEXT: movapd 16(%rdi), %xmm4 -; SSE-NEXT: movapd 32(%rdi), %xmm5 -; SSE-NEXT: movapd (%rsi), %xmm6 +; SSE-NEXT: movapd 16(%rdi), %xmm6 +; SSE-NEXT: movapd 32(%rdi), %xmm10 +; SSE-NEXT: movapd (%rsi), %xmm4 ; SSE-NEXT: movapd 16(%rsi), %xmm7 -; SSE-NEXT: movapd 32(%rsi), %xmm8 -; SSE-NEXT: movapd (%rdx), %xmm9 -; SSE-NEXT: movapd 16(%rdx), %xmm10 -; SSE-NEXT: movapd 32(%rdx), %xmm11 -; SSE-NEXT: movapd (%rcx), %xmm12 +; SSE-NEXT: movapd 32(%rsi), %xmm12 +; SSE-NEXT: movapd (%rdx), %xmm5 +; SSE-NEXT: movapd 16(%rdx), %xmm9 +; SSE-NEXT: movapd 32(%rdx), %xmm14 +; SSE-NEXT: movapd (%rcx), %xmm8 ; SSE-NEXT: movapd 16(%rcx), %xmm13 -; SSE-NEXT: movapd 32(%rcx), %xmm14 -; SSE-NEXT: movapd 16(%r8), %xmm1 -; SSE-NEXT: movapd 32(%r8), %xmm0 +; SSE-NEXT: movapd 32(%rcx), %xmm15 +; SSE-NEXT: movapd 16(%r8), %xmm0 +; SSE-NEXT: movapd 32(%r8), %xmm1 ; SSE-NEXT: movapd (%r8), %xmm2 -; SSE-NEXT: movapd %xmm3, %xmm15 -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm6[0] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm3, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm4[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd 
{{.*#+}} xmm3 = xmm2[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm12[0] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm2[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm4, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm13[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm5, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm8[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm14[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movapd %xmm6, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm13[0] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm10, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm12[0] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm15[0] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 48(%rdi), %xmm1 ; SSE-NEXT: movapd 48(%rsi), %xmm2 ; SSE-NEXT: movapd %xmm1, %xmm0 @@ -3323,17 +3328,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 464(%r8), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd 464(%rdx), %xmm10 
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: movapd 464(%rdx), %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm11[1] ; SSE-NEXT: movapd 464(%rcx), %xmm9 -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movapd 480(%rdi), %xmm11 +; SSE-NEXT: movapd 480(%rdi), %xmm10 ; SSE-NEXT: movapd 480(%rsi), %xmm8 -; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: movapd %xmm10, %xmm12 ; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] ; SSE-NEXT: movapd 480(%r8), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] ; SSE-NEXT: movapd 480(%rdx), %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] ; SSE-NEXT: movapd 480(%rcx), %xmm3 @@ -3357,13 +3362,13 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm5, 2480(%r9) ; SSE-NEXT: movapd %xmm3, 2464(%r9) ; SSE-NEXT: movapd %xmm8, 2448(%r9) -; SSE-NEXT: movapd %xmm11, 2432(%r9) +; SSE-NEXT: movapd %xmm10, 2432(%r9) ; SSE-NEXT: movapd %xmm6, 2416(%r9) ; SSE-NEXT: movapd %xmm12, 2400(%r9) ; SSE-NEXT: movapd %xmm9, 2384(%r9) ; SSE-NEXT: movapd %xmm13, 2368(%r9) ; SSE-NEXT: movapd %xmm14, 2352(%r9) -; SSE-NEXT: movapd %xmm10, 2336(%r9) +; SSE-NEXT: movapd %xmm11, 2336(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 2320(%r9) ; SSE-NEXT: movapd %xmm15, 2304(%r9) @@ -3660,48 +3665,46 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2264, %rsp # imm = 0x8D8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 +; AVX1-ONLY-NEXT: subq $2296, %rsp # imm = 0x8F8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -3721,8 +3724,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 288(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -3730,7 +3733,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), 
%ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3746,7 +3749,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3767,427 +3770,260 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vmovapd 480(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 
32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 264(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vbroadcastsd 264(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 328(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 328(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 360(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vbroadcastsd 360(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 392(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 392(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups 
%ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm13, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm12[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm12[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm8[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 
32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm12[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm7[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[0],ymm8[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm8[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 
32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm7[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm7[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0],mem[1,2,3] -; 
AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 304(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm5[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] 
-; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 368(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = 
ymm14[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm14[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 432(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm2[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 496(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm2 ; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4198,41 +4034,211 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 
32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 304(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 288(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 368(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 432(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 @@ -4243,7 +4249,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4265,20 +4271,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -4286,17 +4292,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm9, (%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm7, 1936(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 1920(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 1920(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 2256(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 2240(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 2240(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 2416(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 2400(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 2400(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 2096(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 2080(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 2080(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1616(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1600(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 1600(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 1776(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm15, 1760(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4311,13 +4317,13 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: 
vmovaps %xmm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%r9) @@ -4459,60 +4465,60 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $2264, %rsp # imm = 0x8D8 +; AVX1-ONLY-NEXT: addq $2296, %rsp # imm = 0x8F8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX2-ONLY-NEXT: subq $2728, %rsp # imm = 0xAA8 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] @@ -4763,7 +4769,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 184(%rsi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm1 @@ -4794,78 +4800,77 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm10 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm10[1],ymm1[1],ymm10[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm4 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 256(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm6 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 280(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 344(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd 280(%rsi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 344(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 408(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 408(%rsi), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4873,18 +4878,18 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 @@ -4910,12 +4915,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 @@ -4925,11 +4930,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm1 @@ -4946,7 +4951,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 @@ -4977,7 +4982,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm1 @@ -5008,7 +5013,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm1 @@ -5038,36 +5043,37 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm11 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm11[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm11[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm10[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm10[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm10[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm12 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm12[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm12[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm13 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm13[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[2],mem[2] -; 
AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm10[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7] @@ -5076,19 +5082,19 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 2464(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 2432(%r9) @@ -5105,8 +5111,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2080(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 2016(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1984(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 1952(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1984(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 1952(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 1856(%r9) @@ -5163,7 +5169,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 832(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; 
AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%r9) @@ -5171,7 +5177,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%r9) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9) @@ -5219,7 +5225,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 2528(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 2528(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5234,452 +5240,460 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-ONLY-NEXT: addq $2696, %rsp # imm = 0xA88 +; AVX2-ONLY-NEXT: addq $2728, %rsp # imm = 0xAA8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: store_i64_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512F-NEXT: subq $3080, %rsp # imm = 0xC08 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm24 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm2 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm12, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 
+; AVX512F-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: 
vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm28 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm28 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm31 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm26 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm16 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 -; 
AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm30 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm30 +; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm6 ; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm7 ; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 
%zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm7 +; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm8 ; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm11 -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm8 +; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm3 ; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm3 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512F-NEXT: movb $-116, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm22 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm21 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm23 {%k3} ; AVX512F-NEXT: movb $8, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [12,1,2,3,4,13,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm28 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm24 +; 
AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 {%k3} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k3} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm30 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm7 +; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; 
AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm8 +; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm15 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 {%k2} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm5, 2496(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 2432(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 2368(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 2304(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 2240(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm11, 2176(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 2112(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 2048(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 1984(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1920(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm3, 2496(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 2432(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 2368(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 2304(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 2240(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 2176(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 2112(%r9) +; AVX512F-NEXT: vmovdqa64 
%zmm14, 2048(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm1, 1984(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1920(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm7, 1856(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1792(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 1728(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1664(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 1536(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1472(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 1408(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 1344(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 1280(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm19, 1216(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 1792(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm27, 1728(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 1664(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 1600(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 1536(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1472(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 1408(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1344(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, 1280(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm30, 1216(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1152(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm31, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 1088(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1024(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 896(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm30, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 768(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 704(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 576(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 512(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5688,461 +5702,469 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512F-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3144, %rsp # 
imm = 0xC48 +; AVX512BW-NEXT: subq $3080, %rsp # imm = 0xC08 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm23 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, 
%zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), 
%zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm28, 
%zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm28 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm31 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm26 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm30 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm20 +; 
AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm30 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm7 +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm8 ; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 -; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm8 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm3 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm3 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} 
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k3} ; AVX512BW-NEXT: movb $8, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm19, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, 
%zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm30 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: 
vpermt2q %zmm2, %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm7 +; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqa64 
448(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm8 +; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm5, 2496(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 2432(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 2368(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 2304(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 2240(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 2176(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 2112(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 2048(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 1984(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1920(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm3, 2496(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 2432(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 2368(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 2304(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 2240(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 2176(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 2112(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 2048(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1984(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1920(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 1856(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1792(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1728(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1664(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1536(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1472(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 1408(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1344(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1280(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 1792(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1600(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1536(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1472(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1408(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512BW-NEXT: vmovaps %zmm0, 1344(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1280(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1216(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1152(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 1088(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1024(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 896(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 768(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 704(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 576(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 512(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6151,16 +6173,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512BW-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index 12c18c325e5dc2..11b8da85b09046 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -126,48 +126,48 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i64_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm5 -; SSE-NEXT: movaps 16(%rsi), %xmm6 +; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm3 +; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm7 -; SSE-NEXT: movaps 16(%rcx), %xmm8 -; SSE-NEXT: movaps (%r8), %xmm9 -; SSE-NEXT: movaps 16(%r8), %xmm10 -; SSE-NEXT: movaps (%r9), %xmm11 -; SSE-NEXT: movaps 
16(%r9), %xmm12 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm11[1] +; SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm6 +; SSE-NEXT: movaps 16(%rcx), %xmm7 +; SSE-NEXT: movaps (%r8), %xmm8 +; SSE-NEXT: movaps 16(%r8), %xmm9 +; SSE-NEXT: movaps (%r9), %xmm10 +; SSE-NEXT: movaps 16(%r9), %xmm11 +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm10[1] ; SSE-NEXT: movaps %xmm0, %xmm15 -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm8[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm11[0] -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm4, 112(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) -; SSE-NEXT: movaps %xmm10, 176(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; SSE-NEXT: movaps %xmm5, %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: movaps %xmm8, 32(%rax) +; SSE-NEXT: movaps %xmm1, 48(%rax) +; SSE-NEXT: movaps %xmm2, 96(%rax) +; SSE-NEXT: movaps %xmm5, 112(%rax) +; SSE-NEXT: movaps %xmm11, 160(%rax) +; SSE-NEXT: movaps %xmm9, 176(%rax) +; SSE-NEXT: movaps %xmm6, (%rax) ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps %xmm15, 64(%rax) ; SSE-NEXT: movaps %xmm14, 80(%rax) ; SSE-NEXT: movaps %xmm13, 128(%rax) -; SSE-NEXT: movaps %xmm3, 144(%rax) +; SSE-NEXT: movaps %xmm12, 144(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf4: @@ -310,92 +310,89 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rdi), %xmm3 -; SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps 32(%rdi), %xmm6 +; SSE-NEXT: movaps (%rsi), %xmm7 ; SSE-NEXT: movaps 16(%rsi), %xmm12 -; SSE-NEXT: movaps 32(%rsi), %xmm14 -; SSE-NEXT: movaps (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm6 -; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm10 +; SSE-NEXT: movaps 32(%rsi), %xmm15 +; SSE-NEXT: movaps (%rdx), %xmm1 +; 
SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps 32(%rdx), %xmm10 +; SSE-NEXT: movaps (%rcx), %xmm8 ; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps (%r8), %xmm7 +; SSE-NEXT: movaps (%r8), %xmm4 ; SSE-NEXT: movaps 16(%r8), %xmm9 ; SSE-NEXT: movaps (%r9), %xmm11 -; SSE-NEXT: movaps 16(%r9), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; SSE-NEXT: movaps 16(%r9), %xmm14 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm11[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: movaps %xmm2, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm13[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 
32(%r8), %xmm5 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 32(%r8), %xmm6 ; SSE-NEXT: movaps 32(%r9), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 48(%rdi), %xmm5 ; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 48(%rdx), %xmm1 -; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps 48(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 48(%r8), %xmm2 ; SSE-NEXT: movaps 48(%r9), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 368(%rax) +; SSE-NEXT: movaps %xmm2, 368(%rax) ; SSE-NEXT: movaps %xmm1, 352(%rax) -; SSE-NEXT: movaps %xmm6, 336(%rax) -; SSE-NEXT: movaps %xmm2, 320(%rax) +; SSE-NEXT: movaps %xmm5, 336(%rax) +; SSE-NEXT: movaps %xmm0, 320(%rax) ; SSE-NEXT: movaps %xmm4, 304(%rax) ; SSE-NEXT: movaps %xmm7, 288(%rax) -; SSE-NEXT: movaps %xmm5, 272(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 256(%rax) +; SSE-NEXT: movaps %xmm6, 272(%rax) +; SSE-NEXT: movaps %xmm10, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rax) ; SSE-NEXT: movaps %xmm8, 224(%rax) -; SSE-NEXT: movaps %xmm14, 208(%rax) -; SSE-NEXT: movaps %xmm15, 192(%rax) +; SSE-NEXT: movaps %xmm15, 208(%rax) +; SSE-NEXT: movaps %xmm14, 192(%rax) ; SSE-NEXT: movaps %xmm9, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) @@ -410,7 +407,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm10, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -424,30 +422,30 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps (%rdi), 
%xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm6 ; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm6[2,3],ymm1[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm7, %ymm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm6, %ymm10 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm12[1],xmm4[1] @@ -457,54 +455,54 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm15 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm11[0],ymm14[0],ymm11[2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm15[0],ymm13[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm12[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm9[0],xmm5[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm13, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) +; AVX1-ONLY-NEXT: 
vmovapd %ymm6, 160(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -514,84 +512,84 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-ONLY-LABEL: store_i64_stride6_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm5 ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm3[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm9[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[0,1],ymm2[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm10[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm11[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm11[1],xmm9[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm7[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[0,1],ymm8[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 -; 
AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = xmm4[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm7[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm5[0,1],ymm13[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm8[1],ymm13[1],ymm8[3],ymm13[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm13[0],ymm8[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm9 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm10 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -599,27 +597,27 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-LABEL: store_i64_stride6_vf8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,5,13,4,12,5,13] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512F-NEXT: movb $12, %r10b ; AVX512F-NEXT: kmovw %r10d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512F-NEXT: movb $16, %r10b ; AVX512F-NEXT: kmovw %r10d, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512F-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 @@ -627,56 +625,56 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: kmovw %r9d, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vpermi2q %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa 
(%rdi), %ymm2 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 256(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) @@ -686,27 +684,27 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-LABEL: store_i64_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,5,13,4,12,5,13] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: movb $12, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512BW-NEXT: movb $16, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 @@ -714,56 +712,56 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] ; 
AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; 
AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) @@ -791,61 +789,61 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps (%rsi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps 32(%rdi), %xmm13 +; SSE-NEXT: movaps (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm10 -; SSE-NEXT: movaps 16(%rdx), %xmm11 -; SSE-NEXT: movaps 32(%rdx), %xmm12 -; SSE-NEXT: movaps (%rcx), %xmm5 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps (%r8), %xmm13 -; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps (%r9), %xmm6 -; SSE-NEXT: movaps 16(%r9), %xmm4 -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps 32(%rsi), %xmm2 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps 32(%rdx), %xmm15 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps 16(%rcx), %xmm4 +; SSE-NEXT: movaps (%r8), %xmm11 +; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps (%r9), %xmm5 +; SSE-NEXT: movaps 16(%r9), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm2 ; SSE-NEXT: movaps 32(%r9), %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm1 @@ -907,56 +905,56 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 80(%r8), %xmm14 +; SSE-NEXT: movaps 80(%r8), %xmm13 ; SSE-NEXT: movaps 80(%r9), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm8 ; SSE-NEXT: movaps 96(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdx), %xmm10 ; SSE-NEXT: movaps 96(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm10, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 96(%r8), %xmm5 +; SSE-NEXT: movaps 96(%r8), %xmm7 ; SSE-NEXT: movaps 96(%r9), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = 
xmm7[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdi), %xmm5 ; SSE-NEXT: movaps 112(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 112(%rdx), %xmm1 -; SSE-NEXT: movaps 112(%rcx), %xmm0 +; SSE-NEXT: movaps 112(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%r8), %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 112(%r8), %xmm2 ; SSE-NEXT: movaps 112(%r9), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 752(%rax) +; SSE-NEXT: movaps %xmm2, 752(%rax) ; SSE-NEXT: movaps %xmm1, 736(%rax) -; SSE-NEXT: movaps %xmm6, 720(%rax) -; SSE-NEXT: movaps %xmm2, 704(%rax) +; SSE-NEXT: movaps %xmm5, 720(%rax) +; SSE-NEXT: movaps %xmm0, 704(%rax) ; SSE-NEXT: movaps %xmm4, 688(%rax) -; SSE-NEXT: movaps %xmm7, 672(%rax) -; SSE-NEXT: movaps %xmm5, 656(%rax) +; SSE-NEXT: movaps %xmm6, 672(%rax) +; SSE-NEXT: movaps %xmm7, 656(%rax) ; SSE-NEXT: movaps %xmm10, 640(%rax) -; SSE-NEXT: movaps %xmm9, 624(%rax) -; SSE-NEXT: movaps %xmm8, 608(%rax) +; SSE-NEXT: movaps %xmm8, 624(%rax) +; SSE-NEXT: movaps %xmm9, 608(%rax) ; SSE-NEXT: movaps %xmm11, 592(%rax) -; SSE-NEXT: movaps %xmm13, 576(%rax) -; SSE-NEXT: movaps %xmm14, 560(%rax) +; SSE-NEXT: movaps %xmm14, 576(%rax) +; SSE-NEXT: movaps %xmm13, 560(%rax) ; SSE-NEXT: movaps %xmm12, 544(%rax) ; SSE-NEXT: movaps %xmm15, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1031,183 +1029,183 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i64_stride6_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), 
%xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; 
AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm4[0],ymm7[0],ymm4[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd 
{{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 592(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 576(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm14, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 192(%rax) +; 
AVX1-ONLY-NEXT: vmovaps %xmm10, 400(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm12, 384(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 704(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 704(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1227,7 +1225,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1250,175 +1248,175 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] +; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm15[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm14[1],xmm3[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[0,1],ymm3[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm13 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm13[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm10[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm7[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm4[0,1],ymm6[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm12[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm7[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[0,1],ymm6[0,1] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = xmm8[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[0,1],ymm10[0,1] -; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm0 -; 
AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = xmm0[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm6[0,1] +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = xmm8[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm15, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 ; 
AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm6 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm5 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm4[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm10 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm12 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm11, 736(%rax) -; AVX2-ONLY-NEXT: 
vmovaps %ymm10, 704(%rax) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 704(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 672(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 576(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 544(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 512(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 480(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) @@ -1446,95 +1444,95 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $16, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm7, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 (%r9), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] ; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: movb $48, %r9b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r9d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] ; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] ; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm17, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm19, %zmm19 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm18, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] ; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] ; AVX512F-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 @@ -1542,43 +1540,43 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 ; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm20[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm21, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -1586,95 +1584,95 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512F-ONLY-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm11 ; AVX512F-ONLY-FAST-NEXT: movb $12, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $16, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm7, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 ; AVX512F-ONLY-FAST-NEXT: movb $48, %r9b ; AVX512F-ONLY-FAST-NEXT: kmovw %r9d, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] ; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] ; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm19, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm18, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] ; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] ; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermt2q 
%zmm14, %zmm22, %zmm13 @@ -1682,43 +1680,43 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm20[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm21, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 
576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -1726,95 +1724,95 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm11 ; AVX512DQ-SLOW-NEXT: movb $12, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: movb $16, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm7, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm16 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] ; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 ; AVX512DQ-SLOW-NEXT: movb $48, %r9b ; AVX512DQ-SLOW-NEXT: kmovw %r9d, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} 
zmm19 = <0,1,13,u,4,5,6,7> ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm9 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] ; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] ; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm17, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm16 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,8,1,9,0,8,1,9] +; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm19, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm18 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm21, 
%zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm18, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm18 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] ; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] ; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 @@ -1822,43 +1820,43 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] ; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) 
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -1866,95 +1864,95 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm11 ; AVX512DQ-FAST-NEXT: movb $12, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: movb $16, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} ; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm7, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, 
%zmm16 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] ; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 ; AVX512DQ-FAST-NEXT: movb $48, %r9b ; AVX512DQ-FAST-NEXT: kmovw %r9d, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> ; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm9 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] ; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm16 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm16 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,8,1,9,0,8,1,9] +; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm19 {%k1} +; 
AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm19, %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm18 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm18, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm18 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] ; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] ; AVX512DQ-FAST-NEXT: # ymm22 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 @@ -1962,43 +1960,43 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> ; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> ; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = 
zmm4[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -2006,95 +2004,95 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $16, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm7, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r9b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r9d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] ; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] ; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm17, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} ; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm19, %zmm19 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm18, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] ; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] ; AVX512BW-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 @@ -2102,43 +2100,43 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 
(%rdi), %ymm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm21, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; @@ -2146,95 +2144,95 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
(%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $16, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm7, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %r9b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r9d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] ; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] ; 
AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm19, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm18, %zmm18 +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] ; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] ; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 @@ -2242,43 +2240,43 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm21, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 
64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; @@ -2286,95 +2284,95 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm11 ; AVX512DQBW-SLOW-NEXT: movb $12, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $16, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm7, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm16 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] ; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 ; AVX512DQBW-SLOW-NEXT: movb $48, %r9b ; AVX512DQBW-SLOW-NEXT: kmovd %r9d, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm9 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] ; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] ; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm17, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm16 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm17 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 ; 
AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm19, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm18 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm18, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm18 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] ; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] ; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 @@ -2382,43 +2380,43 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] ; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm21, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; @@ -2426,95 +2424,95 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm11 ; AVX512DQBW-FAST-NEXT: movb $12, %r10b ; AVX512DQBW-FAST-NEXT: kmovd 
%r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: movb $16, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm7, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm16 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] ; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm9 ; AVX512DQBW-FAST-NEXT: movb $48, %r9b ; AVX512DQBW-FAST-NEXT: kmovd %r9d, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm9 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] ; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] ; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm20, 
%zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm19, %zmm19 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm18 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm18, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm18 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] ; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm21 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] ; AVX512DQBW-FAST-NEXT: # ymm22 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 @@ -2522,43 +2520,43 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] ; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa 
(%rdi), %ymm10 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm21, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -2583,61 +2581,61 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $1176, %rsp # imm = 0x498 ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps 32(%rdi), %xmm13 +; SSE-NEXT: movaps (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm10 -; SSE-NEXT: movaps 16(%rdx), %xmm11 -; SSE-NEXT: movaps 32(%rdx), %xmm12 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps 
16(%rcx), %xmm3 -; SSE-NEXT: movaps (%r8), %xmm13 +; SSE-NEXT: movaps 32(%rsi), %xmm2 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps 32(%rdx), %xmm15 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps 16(%rcx), %xmm4 +; SSE-NEXT: movaps (%r8), %xmm11 ; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps (%r9), %xmm6 -; SSE-NEXT: movaps 16(%r9), %xmm5 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps (%r9), %xmm5 +; SSE-NEXT: movaps 16(%r9), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm1 
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm2 ; SSE-NEXT: movaps 32(%r9), %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm1 @@ -2855,33 +2853,33 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm15 +; SSE-NEXT: movaps 208(%rdi), %xmm14 ; SSE-NEXT: movaps 208(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 208(%rdx), %xmm12 ; SSE-NEXT: movaps 208(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 208(%r8), %xmm11 -; SSE-NEXT: movaps 208(%r9), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm13 -; SSE-NEXT: movaps 224(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 208(%r8), %xmm15 +; SSE-NEXT: movaps 208(%r9), %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] +; SSE-NEXT: movaps 224(%rdi), %xmm10 +; SSE-NEXT: movaps 224(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps 224(%rdx), %xmm9 ; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 224(%r8), %xmm5 ; SSE-NEXT: movaps 224(%r9), %xmm0 @@ -2894,31 +2892,31 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps 
{{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps 240(%rdx), %xmm1 -; SSE-NEXT: movaps 240(%rcx), %xmm0 +; SSE-NEXT: movaps 240(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps 240(%r8), %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 240(%r8), %xmm2 ; SSE-NEXT: movaps 240(%r9), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 1520(%rax) +; SSE-NEXT: movaps %xmm2, 1520(%rax) ; SSE-NEXT: movaps %xmm1, 1504(%rax) ; SSE-NEXT: movaps %xmm6, 1488(%rax) -; SSE-NEXT: movaps %xmm2, 1472(%rax) +; SSE-NEXT: movaps %xmm0, 1472(%rax) ; SSE-NEXT: movaps %xmm4, 1456(%rax) ; SSE-NEXT: movaps %xmm7, 1440(%rax) ; SSE-NEXT: movaps %xmm5, 1424(%rax) ; SSE-NEXT: movaps %xmm9, 1408(%rax) -; SSE-NEXT: movaps %xmm13, 1392(%rax) +; SSE-NEXT: movaps %xmm10, 1392(%rax) ; SSE-NEXT: movaps %xmm8, 1376(%rax) -; SSE-NEXT: movaps %xmm10, 1360(%rax) -; SSE-NEXT: movaps %xmm14, 1344(%rax) -; SSE-NEXT: movaps %xmm11, 1328(%rax) +; SSE-NEXT: movaps %xmm11, 1360(%rax) +; SSE-NEXT: movaps %xmm13, 1344(%rax) +; SSE-NEXT: movaps %xmm15, 1328(%rax) ; SSE-NEXT: movaps %xmm12, 1312(%rax) -; SSE-NEXT: movaps %xmm15, 1296(%rax) +; SSE-NEXT: movaps %xmm14, 1296(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1280(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3086,342 +3084,344 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1608, %rsp # imm = 0x648 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm0 +; AVX1-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm13[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = 
ymm2[0],ymm4[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] 
; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm9[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm12[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm10[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm4[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} 
xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm14[0] +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm14[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm13[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = xmm0[0],mem[0] @@ -3552,7 +3552,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $1608, %rsp # imm = 0x648 +; AVX1-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3563,9 +3563,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm8 ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 @@ -3578,7 +3578,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[0,1],ymm3[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 @@ -3590,20 +3590,20 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm5[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm5[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3659,10 +3659,10 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3670,17 +3670,17 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] ; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; 
AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3688,37 +3688,37 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] ; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[0,1],ymm14[0,1] ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = xmm0[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3747,21 +3747,21 @@ define void 
@store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 @@ -3840,7 +3840,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -3864,39 +3864,39 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = 
ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%r9), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm2 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm11[0],ymm2[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm12 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] @@ -3922,7 +3922,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 928(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 896(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 864(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 864(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%rax) ; 
AVX2-ONLY-NEXT: vmovaps %ymm9, 736(%rax) @@ -3946,7 +3946,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3991,2233 +3991,2201 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,5,13,4,12,5,13] ; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm27, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm27, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm24, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm24, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm24, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm23, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, 
%zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm24, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm1 +; 
AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $12, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $48, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm29, %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, 
%zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm31, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm21, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm24, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm24 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm23[0,1,2,3],zmm13[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $16, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = 
ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} 
zmm4 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm23, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm23, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm23, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; 
AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,5,13,4,12,5,13] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm24, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm24, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm23, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm24, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm24, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $12, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $48, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm25 +; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm29, %zmm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm31, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm21, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm24, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, 
%zmm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm23[0,1,2,3],zmm13[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $16, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq 
{{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm23, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm17 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm23, %zmm3 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 
(%rdx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,5,13,4,12,5,13] ; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm6, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm27, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm27, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,9,2,10,1,9,2,10] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,13,6,14,5,13,6,14] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm24, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm24, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm24, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm23, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm12 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,1,9,0,8,1,9] +; AVX512DQ-SLOW-NEXT: # zmm24 = 
mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm24, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $12, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 {%k1} ; AVX512DQ-SLOW-NEXT: movb $48, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm29, %zmm29 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm29 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm31, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm21, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm24, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm24 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm23[0,1,2,3],zmm13[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $16, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; 
AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: 
vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,10,2,3,4,5,6,11] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,12,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,14,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm23, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm23, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm23, %zmm3 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm29, (%rax) +; AVX512DQ-SLOW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,5,13,4,12,5,13] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,9,2,10,1,9,2,10] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,13,6,14,5,13,6,14] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm24, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q 
%zmm17, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm17 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm16 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm23, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,1,9,0,8,1,9] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm24, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm24, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $12, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: movb $48, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 
(%r8), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, 
%zmm0, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm29, %zmm29 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm31, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm21, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm24, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm24 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm23[0,1,2,3],zmm13[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $16, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; 
AVX512DQ-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm23, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm23, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm23, %zmm3 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQ-FAST-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 192(%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,5,13,4,12,5,13] ; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm27, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm27, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm24, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm24, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm24, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm23, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm24, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; 
AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm29, %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm31, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm21, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm24, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm23[0,1,2,3],zmm13[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $16, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte 
Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,10,2,3,4,5,6,11] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm23, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm23, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm23, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm16 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,5,13,4,12,5,13] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, 
%zmm8, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm24, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm24, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm23, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm24, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm24, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $12, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $48, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: 
vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, 
%zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm29, %zmm29 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm31, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm21, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm24, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm24 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm23[0,1,2,3],zmm13[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $16, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,10,2,3,4,5,6,11] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm23, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm23, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm4, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,5,13,4,12,5,13] ; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm27, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm27, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, 
%zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm24, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm24, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm24, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm23, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm24, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm26, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $12, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 
64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $48, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm25 +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm29, %zmm29 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm29 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm31, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm21, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm24, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm24 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm23[0,1,2,3],zmm13[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $16, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,10,2,3,4,5,6,11] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm23, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm23, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm23, %zmm3 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm23 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), 
%zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,5,13,4,12,5,13] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 -; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm24, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512DQBW-FAST-NEXT: 
vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm24, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm23, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] 
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm24, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm24, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm26, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $12, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: movb $48, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm1, %zmm7 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; 
AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm29, %zmm29 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm29 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm31, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm21, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm24, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm24 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm23[0,1,2,3],zmm13[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $16, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,10,2,3,4,5,6,11] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm23, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm23, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm23, %zmm3 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQBW-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQBW-FAST-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -6242,61 +6210,61 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $2712, %rsp # imm = 0xA98 ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps 32(%rdi), %xmm13 +; SSE-NEXT: movaps (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm10 -; SSE-NEXT: movaps 16(%rdx), %xmm11 -; SSE-NEXT: movaps 32(%rdx), %xmm12 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: movaps 32(%rsi), %xmm2 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps 32(%rdx), %xmm15 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps 16(%rcx), %xmm4 ; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps (%r8), %xmm13 +; SSE-NEXT: movaps (%r8), %xmm11 ; SSE-NEXT: movaps 16(%r9), %xmm5 ; SSE-NEXT: movaps (%r9), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd 
{{.*#+}} xmm11 = xmm11[1],xmm3[1] +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm2 ; SSE-NEXT: movaps 32(%r9), %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm1 @@ -6856,32 +6824,32 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 464(%rdx), %xmm14 +; SSE-NEXT: movaps 464(%rdx), %xmm13 ; SSE-NEXT: movaps 464(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 464(%r8), %xmm11 -; SSE-NEXT: movaps 464(%r9), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdi), %xmm12 -; SSE-NEXT: movaps 480(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdx), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = 
xmm13[1],xmm0[1] +; SSE-NEXT: movaps 464(%r8), %xmm12 +; SSE-NEXT: movaps 464(%r9), %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] +; SSE-NEXT: movaps 480(%rdi), %xmm9 +; SSE-NEXT: movaps 480(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] +; SSE-NEXT: movaps 480(%rdx), %xmm10 ; SSE-NEXT: movaps 480(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 480(%r8), %xmm5 ; SSE-NEXT: movaps 480(%r9), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm6 ; SSE-NEXT: movaps 496(%rsi), %xmm1 @@ -6889,30 +6857,30 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps 496(%rdx), %xmm1 -; SSE-NEXT: movaps 496(%rcx), %xmm0 +; SSE-NEXT: movaps 496(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps 496(%r8), %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 496(%r8), %xmm2 ; SSE-NEXT: movaps 496(%r9), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 3056(%rax) +; SSE-NEXT: movaps %xmm2, 3056(%rax) ; SSE-NEXT: movaps %xmm1, 3040(%rax) ; SSE-NEXT: movaps %xmm6, 3024(%rax) -; SSE-NEXT: movaps %xmm2, 3008(%rax) +; SSE-NEXT: movaps %xmm0, 3008(%rax) ; SSE-NEXT: movaps %xmm4, 2992(%rax) ; SSE-NEXT: movaps %xmm7, 2976(%rax) ; SSE-NEXT: movaps %xmm5, 2960(%rax) -; SSE-NEXT: movaps %xmm8, 2944(%rax) -; SSE-NEXT: movaps %xmm12, 2928(%rax) -; SSE-NEXT: movaps %xmm9, 2912(%rax) -; SSE-NEXT: movaps %xmm10, 2896(%rax) -; SSE-NEXT: movaps %xmm13, 2880(%rax) -; SSE-NEXT: movaps %xmm11, 2864(%rax) -; SSE-NEXT: movaps %xmm14, 2848(%rax) +; SSE-NEXT: movaps %xmm10, 2944(%rax) +; SSE-NEXT: movaps %xmm9, 2928(%rax) +; SSE-NEXT: movaps %xmm8, 2912(%rax) +; SSE-NEXT: movaps %xmm11, 2896(%rax) +; SSE-NEXT: movaps %xmm14, 2880(%rax) +; SSE-NEXT: movaps %xmm12, 2864(%rax) +; SSE-NEXT: movaps %xmm13, 2848(%rax) ; SSE-NEXT: movaps %xmm15, 2832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 2816(%rax) @@ -7274,441 +7242,441 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i64_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $3464, %rsp # imm = 0xD88 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovups 
%ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; 
AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 
+; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 256(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; 
AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm10[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm10[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm12 
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm10[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm13 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm10[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 
416(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm14[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm2[1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm5[0,1],ymm14[2,3] +; 
AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 -; 
AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm12[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm9[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: 
vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm8[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm11[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 
320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm0 @@ -7724,9 +7692,11 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 @@ -7740,213 +7710,213 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: 
vbroadcastsd 176(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 272(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovaps 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 368(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 400(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 440(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] 
-; AVX1-ONLY-NEXT: vbroadcastsd 472(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 272(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 336(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 368(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 408(%r8), 
%ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 440(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 472(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -8755,7 +8725,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 @@ -8934,7 +8904,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = 
ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm2 @@ -8969,17 +8939,17 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],mem[1],ymm5[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 504(%r8), %ymm2 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -8987,13 +8957,13 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9003,18 +8973,18 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3040(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 3008(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 2976(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 2976(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2880(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 2848(%rax) @@ -9029,7 +8999,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2592(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 2464(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 2464(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2432(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9050,7 +9020,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2016(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1888(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1888(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1856(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9085,7 +9055,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 928(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 928(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9192,5421 +9162,1365 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf64: -; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 -; 
AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; 
AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: 
vpermi2q %zmm11, %zmm4, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: movb $12, %al -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: movb $48, %al -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, 
%zmm2, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: movb $16, %al -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, 
%zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte 
Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq -; -; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf64: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q 
%zmm2, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: movb $12, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $48, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: movb $16, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm10 -; 
AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} 
zmm30 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 2176(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; 
AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $3400, %rsp # imm = 0xD48 -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf64: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, 
%zmm7, %zmm28 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: movb $12, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: movb $48, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; 
AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = 
ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $16, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm4 = 
zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; 
AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) 
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: store_i64_stride6_vf64: 
-; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm10 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 -; 
AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: movb $12, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: movb $48, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm7 = 
zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $16, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512DQ-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; 
AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 2176(%rax) -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: addq $3400, %rsp # imm = 0xD48 -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq -; -; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf64: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), 
%zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q 
%zmm20, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: movb $12, %al -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: movb $16, %al -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
%zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) -; 
AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq -; -; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf64: -; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
256(%rcx), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 
-; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 
%zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q 
%zmm13, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: movb $12, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $48, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $16, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 
448(%rdx), %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 
# 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 2176(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 
# 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $3400, %rsp # imm = 0xD48 -; AVX512BW-ONLY-FAST-NEXT: vzeroupper -; AVX512BW-ONLY-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf64: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q 
%zmm22, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: 
vpermt2q %zmm21, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: movb $12, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: movb $48, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
<0,1,9,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $16, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; 
AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512DQBW-SLOW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq +; AVX512F-LABEL: store_i64_stride6_vf64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: subq $3336, %rsp # imm = 0xD08 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm16 +; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm15 +; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm9 +; 
AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm10 +; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm11 +; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm12 +; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm13, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,9,2,10,1,9,2,10] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,13,6,14,5,13,6,14] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 +; 
AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm13, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm13, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm13, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,12,5,13,4,12,5,13] +; AVX512F-NEXT: # 
zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm30 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm24 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,1,9,0,8,1,9] +; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [7,15,7,15] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm19 +; 
AVX512F-NEXT: vpermt2q %zmm15, %zmm8, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm8, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm8, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm31, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm4, %zmm12, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm12, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm12, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: movb $12, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 
%zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512F-NEXT: movb $48, %al +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm4 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm6 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm7 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm11 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm14 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm16 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm9 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: movb $16, %al +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm26 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm26 +; AVX512F-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa 448(%rdi), %ymm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm23 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k2} +; AVX512F-NEXT: 
vmovdqa64 (%r9), %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%r9), %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,13,4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm27 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm10 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa 256(%rdx), %xmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa 320(%rdx), %xmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa 448(%rdx), %xmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,8,6,7] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm25, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti32x4 $2, 128(%r8), %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti32x4 $2, 192(%r8), %zmm28, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti32x4 $2, 256(%r8), %zmm29, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm25, %zmm21 +; AVX512F-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm25, %zmm19 +; AVX512F-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm25, %zmm18 +; AVX512F-NEXT: vinserti32x4 $2, 448(%r8), %zmm31, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm25, %zmm31 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,10,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm25, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,2,3,4,12,6,7] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm28, 
%zmm24 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, %zmm25, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm29, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm28, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm28, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm28, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm28, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm29, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm25, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm28, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm29, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm28, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm23 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm23, 3008(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 2944(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 2880(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm22, 2816(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 2752(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 2624(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 2560(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 2432(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 2240(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 2048(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: 
vmovaps %zmm4, 1792(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 1472(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 2688(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 2304(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 1920(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 1536(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf64: -; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), 
%zmm21 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: movb $12, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: movb $48, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte 
Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $16, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte 
Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax) -; AVX512DQBW-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-FAST-NEXT: addq $3400, %rsp # imm = 0xD48 -; 
AVX512DQBW-FAST-NEXT: vzeroupper -; AVX512DQBW-FAST-NEXT: retq +; AVX512BW-LABEL: store_i64_stride6_vf64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: subq $3336, %rsp # imm = 0xD08 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm13, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,9,2,10,1,9,2,10] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,13,6,14,5,13,6,14] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; 
AVX512BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, 
%zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,12,5,13,4,12,5,13] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm30 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm24 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,1,9,0,8,1,9] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [7,15,7,15] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm10 +; AVX512BW-NEXT: 
vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm8, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm8, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: movb $12, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512BW-NEXT: movb $48, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, 
%zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm16 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm9 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: movb $16, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm26 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm26 +; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] 
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm23 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: 
vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm27 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm7 +; 
AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm28, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm29, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm21 +; AVX512BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm19 +; AVX512BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm18 +; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm31, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm28, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm29, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm28, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm29, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm23 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm23, 3008(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 2944(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 
2880(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 2816(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2752(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 2624(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2560(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 2432(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 2240(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 2048(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1472(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 2688(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 2304(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 1536(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <64 x i64>, ptr %in.vecptr2, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index 3d88b7b550dacd..c75bdf65cbf9ac 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -146,50 +146,50 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movapd (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movapd (%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm6 ; SSE-NEXT: movapd (%rsi), %xmm0 -; SSE-NEXT: movaps 16(%rsi), %xmm13 +; SSE-NEXT: movaps 16(%rsi), %xmm7 ; SSE-NEXT: movapd (%rdx), %xmm4 ; SSE-NEXT: movaps 16(%rdx), %xmm2 ; SSE-NEXT: movapd (%rcx), %xmm3 ; SSE-NEXT: movaps 16(%rcx), %xmm1 -; SSE-NEXT: movapd (%r8), %xmm10 -; SSE-NEXT: movaps 16(%r8), %xmm6 -; SSE-NEXT: movapd (%r9), %xmm9 -; SSE-NEXT: movaps 16(%r9), %xmm5 -; SSE-NEXT: movapd (%r10), %xmm14 -; SSE-NEXT: movaps 16(%r10), %xmm12 -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] +; SSE-NEXT: movapd (%r8), %xmm8 +; SSE-NEXT: movaps 16(%r8), %xmm9 +; SSE-NEXT: movapd (%r9), %xmm10 +; SSE-NEXT: movaps 16(%r9), %xmm11 +; SSE-NEXT: movapd (%r10), %xmm12 +; SSE-NEXT: movaps 16(%r10), %xmm13 +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm12[1] -; SSE-NEXT: movapd %xmm8, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] +; SSE-NEXT: movapd %xmm5, %xmm15 ; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm10, 32(%rax) -; SSE-NEXT: movapd %xmm8, 48(%rax) -; SSE-NEXT: movapd %xmm9, 96(%rax) -; SSE-NEXT: movaps %xmm7, 112(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) -; SSE-NEXT: movaps %xmm11, 176(%rax) +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm12[0],xmm5[1] +; SSE-NEXT: movapd %xmm8, 32(%rax) +; SSE-NEXT: movapd %xmm5, 48(%rax) +; SSE-NEXT: movapd %xmm10, 96(%rax) +; SSE-NEXT: movaps %xmm6, 112(%rax) +; SSE-NEXT: movaps %xmm13, 160(%rax) +; SSE-NEXT: movaps %xmm14, 176(%rax) ; SSE-NEXT: movapd %xmm15, (%rax) ; 
SSE-NEXT: movapd %xmm4, 16(%rax) ; SSE-NEXT: movapd %xmm0, 64(%rax) ; SSE-NEXT: movapd %xmm3, 80(%rax) ; SSE-NEXT: movaps %xmm2, 128(%rax) -; SSE-NEXT: movaps %xmm6, 144(%rax) +; SSE-NEXT: movaps %xmm9, 144(%rax) ; SSE-NEXT: movaps %xmm1, 192(%rax) -; SSE-NEXT: movaps %xmm5, 208(%rax) +; SSE-NEXT: movaps %xmm11, 208(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf4: @@ -248,16 +248,16 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm3 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 16(%r10), %xmm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm7 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 @@ -267,31 +267,31 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = 
ymm5[0],mem[0],ymm5[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm7[0],xmm2[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm5 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -300,43 +300,43 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <15,3,7,u> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-NEXT: vmovdqa (%r10), %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 +; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <15,3,7,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,4,8,12,u,u,u,1> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512F-NEXT: movb $112, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <5,9,13,u,u,u,2,6> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: movb $56, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,6,u,u,u,11,15,3> -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <2,6,u,u,u,11,15,3> +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: movb $28, %cl ; AVX512F-NEXT: 
kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-NEXT: vmovdqa %ymm0, 192(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -345,43 +345,43 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512BW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512BW-NEXT: vmovdqa (%r10), %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <15,3,7,u> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 +; AVX512BW-NEXT: vmovdqa (%r10), %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <15,3,7,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,4,8,12,u,u,u,1> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: movb $112, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <5,9,13,u,u,u,2,6> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,6,u,u,u,11,15,3> -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <2,6,u,u,u,11,15,3> +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: movb $28, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -411,54 +411,55 @@ define void 
@store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: subq $88, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd (%rdi), %xmm2 -; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd 32(%rdi), %xmm10 -; SSE-NEXT: movapd (%rsi), %xmm3 -; SSE-NEXT: movapd 16(%rsi), %xmm6 -; SSE-NEXT: movapd (%rdx), %xmm4 +; SSE-NEXT: movapd 16(%rdi), %xmm6 +; SSE-NEXT: movapd 32(%rdi), %xmm15 +; SSE-NEXT: movapd (%rsi), %xmm4 +; SSE-NEXT: movapd 16(%rsi), %xmm10 +; SSE-NEXT: movapd (%rdx), %xmm3 ; SSE-NEXT: movapd 16(%rdx), %xmm8 ; SSE-NEXT: movapd (%rcx), %xmm7 -; SSE-NEXT: movapd 16(%rcx), %xmm11 -; SSE-NEXT: movapd (%r8), %xmm9 -; SSE-NEXT: movapd 16(%r8), %xmm14 -; SSE-NEXT: movapd (%r9), %xmm12 +; SSE-NEXT: movapd 16(%rcx), %xmm14 +; SSE-NEXT: movapd (%r8), %xmm5 +; SSE-NEXT: movapd 16(%r8), %xmm12 +; SSE-NEXT: movapd (%r9), %xmm11 ; SSE-NEXT: movapd 16(%r9), %xmm13 ; SSE-NEXT: movapd (%rax), %xmm0 ; SSE-NEXT: movapd 16(%rax), %xmm1 -; SSE-NEXT: movapd %xmm2, %xmm15 -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm3[0] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm12[0] -; SSE-NEXT: movapd %xmm9, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm5, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm11[0] +; SSE-NEXT: movapd %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm6, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm14[0] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm14[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} 
xmm14 = xmm14[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm12[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] ; SSE-NEXT: movapd 32(%rsi), %xmm12 -; SSE-NEXT: movapd %xmm10, %xmm15 +; SSE-NEXT: movapd %xmm15, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm12[0] ; SSE-NEXT: movapd 32(%rax), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%rdx), %xmm11 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] ; SSE-NEXT: movapd 32(%rcx), %xmm8 @@ -476,19 +477,19 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 48(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 48(%rcx), %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movapd 48(%r8), %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movapd 48(%rcx), %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movapd 48(%r8), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movapd 48(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 432(%rax) -; SSE-NEXT: movapd %xmm2, 416(%rax) +; SSE-NEXT: movapd %xmm1, 416(%rax) ; SSE-NEXT: movapd %xmm4, 400(%rax) ; SSE-NEXT: movapd %xmm5, 384(%rax) -; SSE-NEXT: movapd %xmm1, 368(%rax) +; SSE-NEXT: movapd %xmm2, 368(%rax) ; SSE-NEXT: movapd %xmm3, 352(%rax) ; SSE-NEXT: movapd %xmm7, 336(%rax) ; SSE-NEXT: movapd %xmm6, 320(%rax) @@ -500,13 +501,13 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movapd %xmm11, 240(%rax) ; SSE-NEXT: movapd %xmm15, 224(%rax) ; SSE-NEXT: movapd %xmm13, 208(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) +; SSE-NEXT: movapd %xmm14, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movapd %xmm14, 144(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -532,20 +533,19 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: pushq %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovapd 32(%rax), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm6[1] @@ -553,33 +553,34 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm10 ; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm15[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm15[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm8[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm8, %ymm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm8, %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm9[1],ymm6[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm14 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = 
ymm9[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 @@ -592,19 +593,18 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm2[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],mem[0],ymm15[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm10[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] @@ -614,22 +614,23 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm5 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm14[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm8[0] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 384(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm11, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 
224(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) @@ -646,7 +647,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 @@ -655,16 +656,16 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm15 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm1[6,7] @@ -673,16 +674,16 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],mem[0],ymm9[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] @@ -712,18 +713,18 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm15[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) @@ -732,7 +733,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 384(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -754,75 +755,75 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] ; 
AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = 
[0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 @@ -847,14 +848,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl @@ -862,8 +863,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) @@ -881,73 +882,73 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $96, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 -; 
AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm10, %zmm9 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm11[0,1,2,3],zmm10[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 ; AVX512F-ONLY-FAST-NEXT: movb $48, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm9, %ymm12, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm9 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] ; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 @@ -955,42 +956,42 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm12 ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; 
AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movb $120, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -1000,78 +1001,78 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, 
%zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 ; AVX512DQ-SLOW-NEXT: movb $96, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX512DQ-SLOW-NEXT: movb $28, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm11[2,3,2,3],zmm4[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] ; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] ; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQ-SLOW-NEXT: 
vpbroadcastq 8(%rcx), %ymm11 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k2} ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] ; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 ; AVX512DQ-SLOW-NEXT: movb $56, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] ; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 @@ -1088,30 +1089,30 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm5 ; AVX512DQ-SLOW-NEXT: movb $120, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: movb $48, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper @@ -1124,119 +1125,119 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; 
AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm9[0,1,2,3],zmm3[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] ; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 ; AVX512DQ-FAST-NEXT: movb $48, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> -; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermi2q %ymm9, %ymm10, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, 
%zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm12 ; AVX512DQ-FAST-NEXT: movb $96, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX512DQ-FAST-NEXT: movb $28, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm9[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm9 ; AVX512DQ-FAST-NEXT: movb $56, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-FAST-NEXT: movb $12, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} ; AVX512DQ-FAST-NEXT: movb $112, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm11, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm1 ; AVX512DQ-FAST-NEXT: movb $120, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 
384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -1248,75 +1249,75 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; 
AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm10 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 @@ -1341,14 +1342,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl @@ -1356,8 +1357,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) @@ -1375,73 +1376,73 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; 
AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm10, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: 
# zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm11[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm9, %ymm12, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm9 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 @@ -1449,42 +1450,42 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; 
AVX512BW-ONLY-FAST-NEXT: movb $28, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: movb $120, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; @@ -1494,78 +1495,78 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 ; AVX512DQBW-SLOW-NEXT: movb $96, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm11[2,3,2,3],zmm4[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k2} ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 ; AVX512DQBW-SLOW-NEXT: movb $56, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 @@ -1582,30 +1583,30 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm5 ; AVX512DQBW-SLOW-NEXT: movb $120, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $48, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper @@ -1618,119 +1619,119 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = 
[6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm9[0,1,2,3],zmm3[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 ; AVX512DQBW-FAST-NEXT: movb $48, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm9, %ymm10, %ymm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm12 ; AVX512DQBW-FAST-NEXT: movb $96, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX512DQBW-FAST-NEXT: movb $28, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 
-; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm9[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm9 ; AVX512DQBW-FAST-NEXT: movb $56, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQBW-FAST-NEXT: movb $12, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} ; AVX512DQBW-FAST-NEXT: movb $112, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm11, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm1 ; AVX512DQBW-FAST-NEXT: movb $120, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -1759,66 +1760,67 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $536, %rsp # imm = 0x218 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd (%rdi), %xmm2 -; SSE-NEXT: movapd 16(%rdi), %xmm3 -; SSE-NEXT: movapd 32(%rdi), %xmm6 +; SSE-NEXT: movapd 16(%rdi), %xmm6 +; SSE-NEXT: movapd 
32(%rdi), %xmm13 ; SSE-NEXT: movapd (%rsi), %xmm4 -; SSE-NEXT: movapd 16(%rsi), %xmm7 -; SSE-NEXT: movapd (%rdx), %xmm5 +; SSE-NEXT: movapd 16(%rsi), %xmm10 +; SSE-NEXT: movapd (%rdx), %xmm3 ; SSE-NEXT: movapd 16(%rdx), %xmm8 -; SSE-NEXT: movapd (%rcx), %xmm9 -; SSE-NEXT: movapd 16(%rcx), %xmm11 -; SSE-NEXT: movapd (%r8), %xmm10 +; SSE-NEXT: movapd (%rcx), %xmm7 +; SSE-NEXT: movapd 16(%rcx), %xmm14 +; SSE-NEXT: movapd (%r8), %xmm5 ; SSE-NEXT: movapd 16(%r8), %xmm12 -; SSE-NEXT: movapd (%r9), %xmm13 +; SSE-NEXT: movapd (%r9), %xmm11 ; SSE-NEXT: movapd 16(%r9), %xmm15 ; SSE-NEXT: movapd (%rax), %xmm0 ; SSE-NEXT: movapd 16(%rax), %xmm1 -; SSE-NEXT: movapd %xmm2, %xmm14 -; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm4[0] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm11[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm13[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm3, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm6, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm10[0] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm14[0] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm12[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm15[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] ; SSE-NEXT: movapd 
%xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 32(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm6, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movapd 32(%rsi), %xmm2 +; SSE-NEXT: movapd %xmm13, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%rax), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 32(%rdx), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 32(%rcx), %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 32(%rdx), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 32(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%r8), %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%r9), %xmm2 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1912,19 +1914,19 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 112(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 112(%rcx), %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movapd 112(%r8), %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movapd 112(%rcx), %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movapd 112(%r8), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movapd 112(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 880(%rax) -; SSE-NEXT: movapd %xmm2, 864(%rax) +; SSE-NEXT: movapd %xmm1, 864(%rax) ; SSE-NEXT: movapd %xmm4, 848(%rax) ; SSE-NEXT: movapd %xmm5, 832(%rax) -; SSE-NEXT: movapd %xmm1, 816(%rax) +; SSE-NEXT: movapd %xmm2, 816(%rax) ; SSE-NEXT: movapd %xmm3, 800(%rax) ; SSE-NEXT: movapd %xmm7, 784(%rax) ; SSE-NEXT: movapd %xmm6, 768(%rax) @@ -2024,11 +2026,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $520, %rsp # imm = 0x208 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vmovaps (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 @@ -2052,7 +2054,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2060,7 +2062,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2114,9 +2116,9 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2130,30 +2132,30 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm14[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 
32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm1[1] ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] @@ -2209,7 +2211,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm15[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] @@ -2232,9 +2234,9 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm6, 864(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 800(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 736(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 704(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2269,197 +2271,198 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $552, %rsp # imm = 0x228 +; AVX2-ONLY-NEXT: subq $584, %rsp # imm = 0x248 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-ONLY-NEXT: 
vmovaps 64(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm14[1],ymm7[3],ymm14[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm13[1],ymm1[1],ymm13[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 80(%rax), %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm2[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; 
AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps 112(%rax), %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 80(%rax), %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovaps 112(%rax), %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm9 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm13[0],ymm10[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm12, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] -; AVX2-ONLY-NEXT: vmovaps 64(%rax), 
%xmm3 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX2-ONLY-NEXT: vbroadcastsd %xmm8, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm5[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm2, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: 
vbroadcastsd 112(%r9), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm4, 800(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 768(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 576(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 768(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 576(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 544(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -2484,237 +2487,236 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm4, 736(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 704(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 672(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 672(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 640(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 608(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 864(%rax) -; AVX2-ONLY-NEXT: addq $552, %rsp # imm = 0x228 +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 864(%rax) +; AVX2-ONLY-NEXT: addq $584, %rsp # imm = 0x248 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm6[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm17, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm8, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm20, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm9, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm31, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm28, %zmm14 
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm28, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm26, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm27, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm19[0],ymm23[2],ymm19[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k3} = zmm27[2,3,2,3],zmm29[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm30, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm19[0],ymm25[0],ymm19[2],ymm25[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = 
zmm22[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm22, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm24, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm26, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm29, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: movb $48, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm13[0],zmm14[0],zmm13[2],zmm14[2],zmm13[4],zmm14[4],zmm13[6],zmm14[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm30, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm12, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm18 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm3 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm17, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm20 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 {%k2} ; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm6 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm29 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm29 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm25[1],ymm19[3],ymm25[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 576(%rax) +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -2727,186 +2729,186 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm2 {%k1} +; 
AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm18, %zmm18 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 {%k2} ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm14, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm25 ; AVX512F-ONLY-FAST-NEXT: movb $96, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm14 ; AVX512F-ONLY-FAST-NEXT: movb $120, %sil -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 +; 
AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm13, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm18 ; AVX512F-ONLY-FAST-NEXT: movb $24, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %dil ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 {%k3} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm24[0,1,2,3],zmm22[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $-61, %dil ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k3} ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm14 {%k3} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm24 ; AVX512F-ONLY-FAST-NEXT: movb $48, %sil ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 {%k2} ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm16 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm26, %zmm16 {%k3} ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm26 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm7, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm24 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm26, %ymm9, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm25 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm27, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm20, %zmm27 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 {%k2} ; AVX512F-ONLY-FAST-NEXT: movb $28, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k2} = zmm7[2,3,2,3],zmm31[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm9[0],ymm26[0],ymm9[2],ymm26[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm13[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm12 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k2} ; AVX512F-ONLY-FAST-NEXT: movb $64, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k2} ; AVX512F-ONLY-FAST-NEXT: movb $56, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm21, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm19, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] @@ -2914,14 +2916,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) @@ -2932,209 +2934,209 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm7 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm6[0,1,2,3],zmm4[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $64, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: movb $96, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = 
[15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm16 ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm20, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1} ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] ; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm24, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm26, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm27, %zmm26 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] ; AVX512DQ-SLOW-NEXT: movb $28, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm27[2,3,2,3],zmm28[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] ; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm30, %zmm29 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] ; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %ymm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm21 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm10, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm21[0],ymm25[0],ymm21[2],ymm25[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm22[2,3,2,3],zmm30[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm22, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm24, %zmm22 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q 
%zmm28, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm26, %zmm24 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] ; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm20 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] ; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm14, %zmm30 ; AVX512DQ-SLOW-NEXT: movb $48, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm14[0],zmm15[0],zmm14[2],zmm15[2],zmm14[4],zmm15[4],zmm14[6],zmm15[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm9, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm10, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm11, %zmm28 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm10[0],zmm11[0],zmm10[2],zmm11[2],zmm10[4],zmm11[4],zmm10[6],zmm11[6] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, 
%zmm0, %zmm18 {%k5} ; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k7 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm18 {%k7} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm20 {%k4} ; AVX512DQ-SLOW-NEXT: movb $56, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm20 {%k6} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k5} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm30 {%k7} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k2} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k2} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 {%k6} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # 
ymm0 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm21[1],ymm25[1],ymm21[3],ymm25[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper @@ -3148,184 +3150,184 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm19 +; 
AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm18 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-FAST-NEXT: movb $12, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm12 ; AVX512DQ-FAST-NEXT: movb $112, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 ; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm25 ; AVX512DQ-FAST-NEXT: movb $96, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 +; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, 
%zmm27, %zmm12 ; AVX512DQ-FAST-NEXT: movb $120, %sil ; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm18[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm4 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm18 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm15, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm18 ; AVX512DQ-FAST-NEXT: movb $24, %dil -; AVX512DQ-FAST-NEXT: kmovw %edi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: kmovw %edi, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 {%k1} ; AVX512DQ-FAST-NEXT: movb $-31, %dil ; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 {%k3} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm16, %zmm28 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm28, %zmm29 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $-61, %dil ; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k3} ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm12 
{%k3} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] ; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm23 ; AVX512DQ-FAST-NEXT: movb $48, %sil ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm11, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm27, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm14 {%k3} ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm26 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512DQ-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm9 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm27, %ymm7, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm25 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQ-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm23 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQ-FAST-NEXT: vpermi2q %ymm26, %ymm9, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm25 {%k3} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm20 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm27, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm20, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 {%k2} ; AVX512DQ-FAST-NEXT: movb $28, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k2} = zmm7[2,3,2,3],zmm31[2,3,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm9[0],ymm26[0],ymm9[2],ymm26[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 
%zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k2} ; AVX512DQ-FAST-NEXT: movb $64, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k2} ; AVX512DQ-FAST-NEXT: movb $56, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm21, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm19, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] @@ -3334,14 +3336,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) @@ -3351,212 +3353,210 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 64(%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm6[0,1,2,3],zmm3[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm24 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm17, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm8, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm20, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm9, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm31, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm28, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm28, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm26, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm27, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm19[0],ymm23[2],ymm19[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k3} = zmm27[2,3,2,3],zmm29[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm30, %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # 
zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm19[0],ymm25[0],ymm19[2],ymm25[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm22[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm22, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm24, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm26, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm26 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm29 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm29, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm13[0],zmm14[0],zmm13[2],zmm14[2],zmm13[4],zmm14[4],zmm13[6],zmm14[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm30, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm12, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm18 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm17, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm20 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm29 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm29 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm25[1],ymm19[3],ymm25[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq @@ -3569,186 +3569,186 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm18, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm14, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm25 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm13, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %dil ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm24[0,1,2,3],zmm22[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %dil ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k3} ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = 
[3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm14 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm24 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm16 {%k3} ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm26 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm7, %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm24 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm26, 
%ymm9, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm25 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm27, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm20, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 {%k2} ; AVX512BW-ONLY-FAST-NEXT: movb $28, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k2} = zmm7[2,3,2,3],zmm31[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm9[0],ymm26[0],ymm9[2],ymm26[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm13[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = 
mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm12 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k2} ; AVX512BW-ONLY-FAST-NEXT: movb $64, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k2} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm21, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm19, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] @@ -3756,14 +3756,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm17, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) @@ -3774,209 +3774,209 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm7 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm6[0,1,2,3],zmm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $64, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: movb $96, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm16 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm20, %zmm12 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm24, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm26, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm27, %zmm26 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm27[2,3,2,3],zmm28[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] ; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm30, %zmm29 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm21 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm10, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm21[0],ymm25[0],ymm21[2],ymm25[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm22[2,3,2,3],zmm30[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm22, %zmm17 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm24, %zmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm1, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm26, %zmm24 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm20 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, 
%zmm30, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm14, %zmm30 ; AVX512DQBW-SLOW-NEXT: movb $48, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm14[0],zmm15[0],zmm14[2],zmm15[2],zmm14[4],zmm15[4],zmm14[6],zmm15[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm9, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm11, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm10[0],zmm11[0],zmm10[2],zmm11[2],zmm10[4],zmm11[4],zmm10[6],zmm11[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm18 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k7 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm18 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm20 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $56, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm20 {%k6} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; 
AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k5} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm30 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k2} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k2} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 {%k6} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm21[1],ymm25[1],ymm21[3],ymm25[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper @@ -3990,184 +3990,184 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm19 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm18 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQBW-FAST-NEXT: movb $12, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm12 ; AVX512DQBW-FAST-NEXT: movb $112, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 ; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm25 ; AVX512DQBW-FAST-NEXT: movb $96, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm12 ; AVX512DQBW-FAST-NEXT: movb $120, %sil ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm18[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm4 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm18 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = 
[6,13,14,7,6,13,14,7] ; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm15, %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm18 ; AVX512DQBW-FAST-NEXT: movb $24, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: kmovd %edi, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 {%k1} ; AVX512DQBW-FAST-NEXT: movb $-31, %dil ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 {%k3} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm16, %zmm28 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm28, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $-61, %dil ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k3} ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm12 {%k3} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm23 ; AVX512DQBW-FAST-NEXT: movb $48, %sil ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 {%k2} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm11, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm27, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm14 
{%k3} ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm26 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm27, %ymm7, %ymm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm25 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm23 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm26, %ymm9, %ymm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm25 {%k3} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm27, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm20, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 {%k2} ; AVX512DQBW-FAST-NEXT: movb $28, 
%al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k2} = zmm7[2,3,2,3],zmm31[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm9[0],ymm26[0],ymm9[2],ymm26[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k2} ; AVX512DQBW-FAST-NEXT: movb $64, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k2} ; AVX512DQBW-FAST-NEXT: movb $56, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 
%zmm11, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k2} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm21, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm19, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] @@ -4176,14 +4176,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) @@ -4215,66 +4215,67 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $1432, %rsp # imm = 0x598 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd (%rdi), %xmm2 -; SSE-NEXT: movapd 16(%rdi), %xmm3 -; SSE-NEXT: movapd 32(%rdi), %xmm4 -; SSE-NEXT: movapd (%rsi), %xmm5 -; SSE-NEXT: movapd 16(%rsi), %xmm6 -; SSE-NEXT: movapd (%rdx), %xmm7 +; SSE-NEXT: movapd 16(%rdi), %xmm6 +; SSE-NEXT: movapd 32(%rdi), %xmm13 +; SSE-NEXT: movapd (%rsi), %xmm4 +; SSE-NEXT: movapd 16(%rsi), %xmm10 +; SSE-NEXT: movapd (%rdx), %xmm3 ; SSE-NEXT: movapd 16(%rdx), %xmm8 -; SSE-NEXT: movapd (%rcx), %xmm9 -; SSE-NEXT: movapd 16(%rcx), %xmm10 -; SSE-NEXT: movapd (%r8), %xmm11 +; SSE-NEXT: movapd (%rcx), %xmm7 +; SSE-NEXT: movapd 16(%rcx), %xmm14 +; SSE-NEXT: movapd (%r8), %xmm5 ; SSE-NEXT: movapd 16(%r8), %xmm12 -; SSE-NEXT: movapd (%r9), %xmm13 -; SSE-NEXT: movapd 16(%r9), %xmm14 +; SSE-NEXT: movapd (%r9), %xmm11 +; SSE-NEXT: movapd 16(%r9), %xmm15 ; SSE-NEXT: movapd (%rax), %xmm0 ; SSE-NEXT: movapd 16(%rax), %xmm1 -; SSE-NEXT: movapd %xmm2, %xmm15 -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm5[0] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: 
movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm11[0] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm3, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm6, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm10[0] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm10[0] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm14[0] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm14[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm12[1] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 32(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm4, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm15[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 32(%rsi), %xmm2 +; SSE-NEXT: movapd %xmm13, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%rax), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 32(%rdx), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 32(%rcx), %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movsd 
{{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 32(%rdx), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 32(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%r8), %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%r9), %xmm2 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4544,19 +4545,19 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 240(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 240(%rcx), %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movapd 240(%r8), %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movapd 240(%rcx), %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movapd 240(%r8), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movapd 240(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 1776(%rax) -; SSE-NEXT: movapd %xmm2, 1760(%rax) +; SSE-NEXT: movapd %xmm1, 1760(%rax) ; SSE-NEXT: movapd %xmm4, 1744(%rax) ; SSE-NEXT: movapd %xmm5, 1728(%rax) -; SSE-NEXT: movapd %xmm1, 1712(%rax) +; SSE-NEXT: movapd %xmm2, 1712(%rax) ; SSE-NEXT: movapd %xmm3, 1696(%rax) ; SSE-NEXT: movapd %xmm7, 1680(%rax) ; SSE-NEXT: movapd %xmm6, 1664(%rax) @@ -4768,16 +4769,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1320, %rsp # imm = 0x528 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 @@ -4785,7 +4787,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm8 -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] @@ -4793,24 +4795,24 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4864,9 +4866,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] 
@@ -4957,8 +4958,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] @@ -4966,8 +4967,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] @@ -5047,97 +5048,97 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm3, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 ; 
AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 224(%rax), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 240(%rcx), %xmm15 ; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm10[0],xmm13[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm13[0],xmm12[0] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm15 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 @@ -5146,18 +5147,18 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm9, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm11, 1360(%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 1344(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 464(%rsi) ; AVX1-ONLY-NEXT: vmovaps 
%xmm14, 448(%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm15, 912(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 896(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 896(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 1760(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 1728(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 1696(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 1664(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 1696(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 1664(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 1632(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1600(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 1568(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 1536(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 1536(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5252,14 +5253,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1672, %rsp # imm = 0x688 +; AVX2-ONLY-NEXT: subq $1656, %rsp # imm = 0x678 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm10 +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 @@ -5272,47 +5274,47 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm5 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm4 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm9 +; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm5 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] +; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm4 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm4[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -5430,19 +5432,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm15 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5456,13 +5456,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm10 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -5476,9 +5476,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5490,8 +5490,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm5, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm0 @@ -5499,24 +5499,24 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm3 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 208(%rax), %xmm1 @@ -5525,140 +5525,141 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm2 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = 
mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: 
vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0,1],ymm8[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm9 ; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm4, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd 
{{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm8[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 
= ymm4[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm2, 1760(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 1728(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1664(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1664(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1600(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1568(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1536(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1536(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5667,10 +5668,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1376(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1376(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1344(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1312(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1312(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5681,8 +5682,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1120(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 1088(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1120(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 1088(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5691,10 +5692,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 928(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 928(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 864(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 864(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 832(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5705,7 +5706,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 672(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%rcx) ; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5716,7 +5718,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) @@ -5748,7 +5750,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-ONLY-NEXT: addq $1672, %rsp # imm = 0x688 +; AVX2-ONLY-NEXT: addq $1656, %rsp # imm = 0x678 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -5756,1347 +5758,1348 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm24 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm30 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm15 ; AVX512F-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm22, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm16, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm9, %zmm0 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm15[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
128(%rcx), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm19[0],ymm0[2],ymm19[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm20, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm24, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 +; 
AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm26, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm12[0],zmm13[0],zmm12[2],zmm13[2],zmm12[4],zmm13[4],zmm12[6],zmm13[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm26, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm10[0],zmm0[0],zmm10[2],zmm0[2],zmm10[4],zmm0[4],zmm10[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm0, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: 
vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm15, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = zmm19[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k3} ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} -; 
AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm21, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; 
AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm15, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm21, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm16, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm16[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = 
ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: movb $64, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k2} ; AVX512F-ONLY-SLOW-NEXT: movb $8, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm13, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm10, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm11, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm13, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm11, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1280(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1088(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1536(%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; 
AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512F-ONLY-FAST-NEXT: subq $2088, %rsp # imm = 0x828 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm12 ; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 
= [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm7[2,3,2,3],zmm12[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm9, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm28 
= mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm22[0],ymm6[0],ymm22[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm9, %ymm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm8, %ymm9, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 
%zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm20, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm6[2,3,2,3],zmm13[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm9 
= mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm27, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm27, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm21[0],zmm1[0],zmm21[2],zmm1[2],zmm21[4],zmm1[4],zmm21[6],zmm1[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm20, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm10, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm27, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm15[0],zmm4[0],zmm15[2],zmm4[2],zmm15[4],zmm4[4],zmm15[6],zmm4[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermi2q 
%zmm6, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm0 {%k3} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm4, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm16 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $120, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k3} ; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm18 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k5} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 {%k5} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] 
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm1 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm31 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm7 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm27 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} -; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm12, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm9 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm22, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm11, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm11[2,3,2,3],zmm5[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: movb $64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $8, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm6 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm3 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 640(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $2088, %rsp # imm = 0x828 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQ-SLOW-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm21 ; AVX512DQ-SLOW-NEXT: movb $96, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] ; AVX512DQ-SLOW-NEXT: movb $28, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm19 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm11[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm24 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] 
+; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm8 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm21[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm18[0],ymm0[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm22, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm17, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm22, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $48, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm20 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512DQ-SLOW-NEXT: # zmm2 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm28, %zmm21 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm23[0],zmm10[0],zmm23[2],zmm10[2],zmm23[4],zmm10[4],zmm23[6],zmm10[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm2, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm29, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm29, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm4[0],zmm29[0],zmm4[2],zmm29[2],zmm4[4],zmm29[4],zmm4[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm23[0],zmm0[0],zmm23[2],zmm0[2],zmm23[4],zmm0[4],zmm23[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm23, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 {%k3} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm18[0,1,2,3],zmm26[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 
%zmm27 {%k3} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k4} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 {%k4} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: movb $12, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k4} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: movb $12, %sil +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm19 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} +; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; 
AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4} -; AVX512DQ-SLOW-NEXT: movb $112, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm22 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm19 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm1, %zmm4 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm6 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, 
%zmm21 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k4} ; AVX512DQ-SLOW-NEXT: movb $56, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm16, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm22, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 
%zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm8 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,11,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm12[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512DQ-SLOW-NEXT: movb $64, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k2} ; AVX512DQ-SLOW-NEXT: movb $8, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512DQ-SLOW-NEXT: vpermq 
{{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm13, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm19, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm13, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm23, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm10, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm10 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1344(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 1024(%rax) +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm14, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1536(%rax) -; AVX512DQ-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1536(%rax) +; AVX512DQ-SLOW-NEXT: addq $2184, %rsp # imm = 0x888 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2056, %rsp # imm = 0x808 +; AVX512DQ-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; 
AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7104,420 +7107,420 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = 
[5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm5[2,3,2,3],zmm19[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm5, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm16, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm2[0],ymm25[2],ymm2[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm2, %ymm5, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm4[0],ymm24[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm4, %ymm5, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm11 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm16, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 
%zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: movb $48, %r10b +; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm26, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm28[0],zmm13[0],zmm28[2],zmm13[2],zmm28[4],zmm13[4],zmm28[6],zmm13[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: movb $48, %r10b -; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] ; 
AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm21[0],zmm10[0],zmm21[2],zmm10[2],zmm21[4],zmm10[4],zmm21[6],zmm10[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm21, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm19, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm19 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm7[0],zmm0[0],zmm7[2],zmm0[2],zmm7[4],zmm0[4],zmm7[6],zmm0[6] +; AVX512DQ-FAST-NEXT: 
vpermi2q %zmm0, %zmm7, %zmm24 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} ; AVX512DQ-FAST-NEXT: movb $120, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm25 {%k4} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm30[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k3} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k3} ; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k5 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 
{%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k5} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 {%k5} ; AVX512DQ-FAST-NEXT: movb $-31, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm21 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm2, %zmm4 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k4} ; AVX512DQ-FAST-NEXT: movb $112, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm4, %zmm15 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm23 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm14 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm24 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 
$0, %ymm2, %zmm0, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 {%k4} ; AVX512DQ-FAST-NEXT: movb $56, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm22, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,11,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm9[2,3,2,3],zmm2[2,3,2,3] ; AVX512DQ-FAST-NEXT: movb $64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: movb $8, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) -; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm1, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 384(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1728(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) -; AVX512DQ-FAST-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1536(%rax) +; AVX512DQ-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -7525,1347 +7528,1348 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
(%rdx), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm30 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm22, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm16, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; 
AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm15[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm19[0],ymm0[2],ymm19[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
128(%r9), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm20, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm24, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k3 -; 
AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm26, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm12[0],zmm13[0],zmm12[2],zmm13[2],zmm12[4],zmm13[4],zmm12[6],zmm13[6] +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm26, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm10[0],zmm0[0],zmm10[2],zmm0[2],zmm10[4],zmm0[4],zmm10[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm0, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, 
%zmm20 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 +; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm15, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = zmm19[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm0, %zmm27 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm21, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm15, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm21, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm16, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm16[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: movb $8, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; 
AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm13, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm13, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm13, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm10, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm11, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm13, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm11, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1280(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 
%zmm1, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1088(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1536(%rax) ; AVX512BW-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512BW-ONLY-FAST-NEXT: subq $2088, %rsp # imm = 0x828 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm12 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} 
zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm7[2,3,2,3],zmm12[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm9, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, 
%zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = 
ymm22[0],ymm6[0],ymm22[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm9, %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm8, %ymm9, %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm20, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, 
%zmm18, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm6[2,3,2,3],zmm13[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm27, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm21, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm27, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm21[0],zmm1[0],zmm21[2],zmm1[2],zmm21[4],zmm1[4],zmm21[6],zmm1[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm20, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm10, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm27, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm15[0],zmm4[0],zmm15[2],zmm4[2],zmm15[4],zmm4[4],zmm15[6],zmm4[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 
$0, %ymm0, %zmm0, %zmm25 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm4, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = 
xmm4[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm16 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k3} ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm18 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 {%k5} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 
8(%rcx), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm31 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm7 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm27 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload 
-; AVX512BW-ONLY-FAST-NEXT: # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm12, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm22, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm11[2,3,2,3],zmm5[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: movb $64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $8, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm6 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} 
zmm3 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 640(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $2088, %rsp # imm = 0x828 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf32: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQBW-SLOW-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm21 ; AVX512DQBW-SLOW-NEXT: movb $96, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm19 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm11[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm9 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm8 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm21[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, 
%zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm18[0],ymm0[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm22, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = 
[13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 -; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm17, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm22, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $48, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm2 = 
mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm28, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm23[0],zmm10[0],zmm23[2],zmm10[2],zmm23[4],zmm10[4],zmm23[6],zmm10[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm2, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm29, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm29, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm4[0],zmm29[0],zmm4[2],zmm29[2],zmm4[4],zmm29[4],zmm4[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm23[0],zmm0[0],zmm23[2],zmm0[2],zmm23[4],zmm0[4],zmm23[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm0, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm23, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 {%k3} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm18[0,1,2,3],zmm26[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 ; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-SLOW-NEXT: 
vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k4} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm19 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm22 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm19 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm1, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm6 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4} -; AVX512DQBW-SLOW-NEXT: 
vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $56, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm16, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm22, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 
= ymm12[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm12[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512DQBW-SLOW-NEXT: movb $64, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k2} ; AVX512DQBW-SLOW-NEXT: movb $8, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm13, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm19, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm13, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm23, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm10 ; AVX512DQBW-SLOW-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1344(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1536(%rax) -; AVX512DQBW-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: addq 
$2184, %rsp # imm = 0x888 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2056, %rsp # imm = 0x808 +; AVX512DQBW-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8873,420 +8877,420 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 
128(%r8), %ymm22 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm5[2,3,2,3],zmm19[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm5, %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 -; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm16, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm2[0],ymm25[2],ymm2[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm2, %ymm5, %ymm25 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm4[0],ymm24[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm4, %ymm5, %ymm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm11 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), 
%zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm16, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: movb $48, %r10b +; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm26, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm28[0],zmm13[0],zmm28[2],zmm13[2],zmm28[4],zmm13[4],zmm28[6],zmm13[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: movb $48, %r10b -; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm24 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] ; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm20 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm21[0],zmm10[0],zmm21[2],zmm10[2],zmm21[4],zmm10[4],zmm21[6],zmm10[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm19, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 
192(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm7[0],zmm0[0],zmm7[2],zmm0[2],zmm7[4],zmm0[4],zmm7[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm0, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} ; AVX512DQBW-FAST-NEXT: movb $120, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm25 {%k4} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm30[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k3} ; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k5} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = 
zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 {%k5} ; AVX512DQBW-FAST-NEXT: movb $-31, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm21 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k4} ; AVX512DQBW-FAST-NEXT: movb $112, 
%sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm4, %zmm15 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm23 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm14 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm24 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 {%k4} ; AVX512DQBW-FAST-NEXT: movb $56, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} -; 
AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm22, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm9[2,3,2,3],zmm2[2,3,2,3] ; AVX512DQBW-FAST-NEXT: movb $64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 
{%k1} ; AVX512DQBW-FAST-NEXT: movb $8, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 896(%rax) +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 384(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1728(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) -; AVX512DQBW-FAST-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1536(%rax) +; AVX512DQBW-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -9315,66 +9319,67 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $3224, %rsp # imm = 0xC98 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd (%rdi), %xmm2 -; SSE-NEXT: movapd 16(%rdi), %xmm3 -; SSE-NEXT: movapd 32(%rdi), %xmm4 -; SSE-NEXT: movapd (%rsi), %xmm5 -; SSE-NEXT: movapd 16(%rsi), %xmm6 -; SSE-NEXT: movapd (%rdx), %xmm7 +; SSE-NEXT: movapd 16(%rdi), %xmm5 +; SSE-NEXT: movapd 32(%rdi), %xmm13 +; SSE-NEXT: movapd (%rsi), %xmm4 +; SSE-NEXT: movapd 16(%rsi), %xmm10 +; SSE-NEXT: movapd (%rdx), %xmm3 ; SSE-NEXT: movapd 16(%rdx), %xmm8 -; SSE-NEXT: movapd (%rcx), %xmm9 -; SSE-NEXT: movapd 16(%rcx), %xmm10 +; SSE-NEXT: movapd (%rcx), %xmm7 +; SSE-NEXT: movapd 16(%rcx), %xmm14 ; SSE-NEXT: movapd 16(%r8), %xmm12 -; SSE-NEXT: movapd (%r8), %xmm11 -; SSE-NEXT: movapd 16(%r9), %xmm14 -; SSE-NEXT: movapd (%r9), %xmm13 +; SSE-NEXT: movapd (%r8), %xmm6 +; SSE-NEXT: movapd 16(%r9), %xmm15 +; SSE-NEXT: movapd (%r9), %xmm11 ; SSE-NEXT: movapd 16(%rax), %xmm0 ; SSE-NEXT: movapd (%rax), %xmm1 -; SSE-NEXT: movapd %xmm2, %xmm15 -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm5[0] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = 
xmm5[1],xmm7[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm3, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm5, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm10[0] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm10[0] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm14[0] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm14[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm12[1] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 32(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm4, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm15[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 32(%rsi), %xmm2 +; SSE-NEXT: movapd %xmm13, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%rax), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 32(%rdx), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 32(%rcx), %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 32(%rdx), %xmm1 +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 32(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%r8), %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%r9), %xmm2 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9996,19 +10001,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 496(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 496(%rcx), %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movapd 496(%r8), %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movapd 496(%rcx), %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movapd 496(%r8), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movapd 496(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 3568(%rax) -; SSE-NEXT: movapd %xmm2, 3552(%rax) +; SSE-NEXT: movapd %xmm1, 3552(%rax) ; SSE-NEXT: movapd %xmm4, 3536(%rax) ; SSE-NEXT: movapd %xmm5, 3520(%rax) -; SSE-NEXT: movapd %xmm1, 3504(%rax) +; SSE-NEXT: movapd %xmm2, 3504(%rax) ; SSE-NEXT: movapd %xmm3, 3488(%rax) ; SSE-NEXT: movapd %xmm7, 3472(%rax) ; SSE-NEXT: movapd %xmm6, 3456(%rax) @@ -10444,11 +10449,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $3816, %rsp # imm = 0xEE8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm5 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 @@ -10472,7 +10477,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10480,7 +10485,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11234,8 +11239,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm3, %ymm7 @@ -11244,37 +11249,37 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm7 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: 
vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 464(%rcx), %xmm1 @@ -11287,28 +11292,25 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0],ymm12[1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm3 +; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm0[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -11318,10 +11320,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] @@ -11331,36 +11336,36 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm15 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] 
+; AVX1-ONLY-NEXT: vmovaps %xmm7, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm9, 2704(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 2688(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 3152(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 3136(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 3152(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 3136(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm15, 2256(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 2240(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 1360(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 1344(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 1344(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 912(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 896(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 912(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 896(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm14, 1808(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 1792(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3520(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11575,21 +11580,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3880, %rsp # imm = 0xF28 +; AVX2-ONLY-NEXT: subq $3912, %rsp # imm = 0xF48 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11603,13 +11608,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 -; AVX2-ONLY-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = 
ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11994,18 +11999,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm11 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12024,9 +12029,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm6 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; AVX2-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -12040,10 +12045,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12070,19 +12076,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve 
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm15 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12100,9 +12105,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm12 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -12134,287 +12139,288 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm9 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[0,1],ymm0[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 448(%rax), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; 
AVX2-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 464(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 464(%rax), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm10 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm10[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 480(%rax), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; 
AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd (%rsp), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), 
%ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 280(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 280(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 288(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 288(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd %xmm10, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd %xmm11, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = 
ymm8[1],mem[1],ymm8[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm0 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded 
Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm13 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm14 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm14, %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-ONLY-NEXT: 
vbroadcastsd 504(%r9), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm14 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm12, 3552(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm12, 3520(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 3488(%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm10, 3552(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 3520(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 3488(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3456(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3424(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3392(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 3360(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 3328(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 3360(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 3328(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3296(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 3264(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 3232(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 3232(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3200(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12432,8 +12438,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2976(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2944(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 2912(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 2880(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 2912(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 2880(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2848(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12442,10 +12448,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2784(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2752(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 2720(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 2720(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2688(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 2656(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 2656(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2624(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12456,8 +12462,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2528(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 2464(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 2432(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 2464(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 2432(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2400(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12466,7 +12472,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2336(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2304(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 2272(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 2272(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2240(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12609,7 +12615,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-ONLY-NEXT: addq $3880, %rsp # imm = 0xF28 +; AVX2-ONLY-NEXT: addq $3912, %rsp # imm = 0xF48 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -12617,761 +12623,758 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: subq $6600, %rsp # imm = 0x19C8 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] +; 
AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm20, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm10[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm12 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 
-; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm10[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm25, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm17, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r9), 
%ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm30[0],ymm4[2],ymm30[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm23, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, 
%zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm21, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 
%zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm23, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm23[0],ymm3[2],ymm23[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm25, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 
%zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: 
vpermt2q %zmm12, %zmm30, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm30, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm30, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, 
%zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm5, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm5, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: 
vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm4[0],zmm6[0],zmm4[2],zmm6[2],zmm4[4],zmm6[4],zmm4[6],zmm6[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm31[0],zmm29[0],zmm31[2],zmm29[2],zmm31[4],zmm29[4],zmm31[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm24[0],zmm27[0],zmm24[2],zmm27[2],zmm24[4],zmm27[4],zmm24[6],zmm27[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm17[0],zmm22[0],zmm17[2],zmm22[2],zmm17[4],zmm22[4],zmm17[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, 
%zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm8, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = 
zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm7[0],zmm0[0],zmm7[2],zmm0[2],zmm7[4],zmm0[4],zmm7[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm21[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: movb $4, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,10,u,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm22, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm6, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k5} ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm1 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,9,u,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k5} ; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,2,3,9,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm15, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,8,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: movb $8, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k5} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k5} ; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm9 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa 192(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm29 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm20, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm21, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 {%k3} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -13379,171 +13382,170 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb 
$-61, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm30[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm18 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm18 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm18[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] 
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: 
vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2944(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2432(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1984(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1792(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 2816(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 2560(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2432(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 2368(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2112(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1984(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1600(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1344(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 640(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13552,16 +13554,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3520(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3520(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13572,7 +13574,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 3072(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $6600, %rsp # imm = 0x19C8 @@ -13581,811 +13583,812 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 +; AVX512F-ONLY-FAST-NEXT: subq $6632, %rsp # imm = 0x19E8 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} 
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm4[2,3,2,3],zmm8[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = 
[15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm16, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm21[0],ymm12[2],ymm21[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm13[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm27, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm18, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm17, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm10[0],ymm26[0],ymm10[2],ymm26[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm17, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm22[0],ymm5[2],ymm22[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm7[2,3,2,3],zmm23[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm27 +; AVX512F-ONLY-FAST-NEXT: 
vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm7[0],ymm27[2],ymm7[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm8[2,3,2,3],zmm23[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm16, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm21, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm30[0],ymm11[0],ymm30[2],ymm11[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm14[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm17, %ymm0, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm26, %ymm4, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm22, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm7, %ymm4, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm11, %ymm4, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; 
AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm31[0],zmm2[2],zmm31[2],zmm2[4],zmm31[4],zmm2[6],zmm31[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 
64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm1[0],zmm17[0],zmm1[2],zmm17[2],zmm1[4],zmm17[4],zmm1[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm30, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm29[0],zmm15[0],zmm29[2],zmm15[2],zmm29[4],zmm15[4],zmm29[6],zmm15[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm30, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm13[0],zmm2[2],zmm13[2],zmm2[4],zmm13[4],zmm2[6],zmm13[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm18, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm20, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm18, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm18, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm30, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm20, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm2, %zmm14 ; AVX512F-ONLY-FAST-NEXT: movb $4, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm14, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,10,u,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm10, %zmm4 +; 
AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm20 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm26 ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k5} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $64, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm3, %zmm30 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,9,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: movb $8, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: movb $8, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm24 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k3} +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa 64(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm3 ; AVX512F-ONLY-FAST-NEXT: movb $112, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm25 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm22 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm12 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -14393,42 +14396,46 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $120, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $-61, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] @@ -14437,98 +14444,92 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2944(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2880(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2816(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2752(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2432(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2368(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2112(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm10, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1792(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1664(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1152(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 960(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3520(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14539,943 +14540,940 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 +; AVX512F-ONLY-FAST-NEXT: addq $6632, %rsp # imm = 0x19E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $6472, %rsp # imm = 0x1948 +; AVX512DQ-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = 
[11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: movb $96, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512DQ-SLOW-NEXT: 
vmovdqa (%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] ; AVX512DQ-SLOW-NEXT: movb $28, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm8[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm19, %zmm1 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 
64(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm10[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm11, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm28, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r9), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), 
%ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm15, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm16, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm27, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm7 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm27, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm10[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm28, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r9), %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: 
vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm10 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm28, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 320(%rax), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r9), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm27, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm24[0],ymm2[2],ymm24[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm2[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm30, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, 
%zmm19, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm19, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm8, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm24 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm30 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm27 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm8, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 ; AVX512DQ-SLOW-NEXT: movb $48, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] +; 
AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k3} = zmm7[0],zmm8[0],zmm7[2],zmm8[2],zmm7[4],zmm8[4],zmm7[6],zmm8[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,0,1,0,8,0,1] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] ; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm27[0],zmm5[0],zmm27[2],zmm5[2],zmm27[4],zmm5[4],zmm27[6],zmm5[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm31[0],zmm7[0],zmm31[2],zmm7[2],zmm31[4],zmm7[4],zmm31[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] ; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm16[0],zmm15[0],zmm16[2],zmm15[2],zmm16[4],zmm15[4],zmm16[6],zmm15[6] +; AVX512DQ-SLOW-NEXT: 
vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm10[0],zmm13[0],zmm10[2],zmm13[2],zmm10[4],zmm13[4],zmm10[6],zmm13[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7] -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm15, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm15, %zmm11 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm7[0],zmm21[0],zmm7[2],zmm21[2],zmm7[4],zmm21[4],zmm7[6],zmm21[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,11,u,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm8, %zmm10 ; AVX512DQ-SLOW-NEXT: movb $4, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
<0,1,2,10,u,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <12,u,u,3,4,5,6,13> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm21 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm19 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm22 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm28 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 {%k5} ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k5} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = 
mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,9,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k5} ; AVX512DQ-SLOW-NEXT: movb $64, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 ; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r8), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm1 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,4,8,u,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,9,u,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r8), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm6[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm1, %zmm5, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,8,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm14, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm7 ; AVX512DQ-SLOW-NEXT: movb $8, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k5} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, 
%ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm5 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k4} ; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm16, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm19, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm20, %zmm5 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm22, %zmm9 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm28, %zmm25 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm4, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm0 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3} +; 
AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm5 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm10 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k3} ; AVX512DQ-SLOW-NEXT: movb $56, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm4, %zmm21 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512DQ-SLOW-NEXT: movb $-61, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm2 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm8 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} -; 
AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm30 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 3008(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2944(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 2560(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 2944(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 2880(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 2816(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 2688(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 2496(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2432(%rax) +; AVX512DQ-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 2368(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2304(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 2176(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1984(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) ; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 1216(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 640(%rax) -; 
AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3520(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 3520(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15486,815 +15484,817 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 3072(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQ-SLOW-NEXT: addq $6472, %rsp # imm = 0x1948 +; AVX512DQ-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 +; AVX512DQ-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 
(%rcx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm22, %zmm1 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu 
%ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm4[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 -; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm15, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm2[0],ymm21[0],ymm2[2],ymm21[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm13[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm25[0],ymm11[2],ymm25[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm22, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm16, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%r8), %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r8), %ymm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm27 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm7[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm22, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm9[0],ymm18[0],ymm9[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm7[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm8 +; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%r9), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%r8), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%r8), %ymm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm8[2,3,2,3],zmm24[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm28, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%r9), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %ymm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm29[0],ymm11[0],ymm29[2],ymm11[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm15[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm25, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q (%rsp), %ymm4, %ymm27 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm18, %ymm4, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm7, %ymm4, %ymm5 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm11, %ymm4, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 384(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%r8), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqu64 
%zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $48, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm1[0],zmm31[0],zmm1[2],zmm31[2],zmm1[4],zmm31[4],zmm1[6],zmm31[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm26, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm2, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm21[0],zmm2[2],zmm21[2],zmm2[4],zmm21[4],zmm2[6],zmm21[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm13[0],zmm2[2],zmm13[2],zmm2[4],zmm13[4],zmm2[6],zmm13[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 -; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm28 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm24[0],zmm12[0],zmm24[2],zmm12[2],zmm24[4],zmm12[4],zmm24[6],zmm12[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm18, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm23, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm23 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm24 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm21 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm6 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q 
%zmm8, %zmm4, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm13[0],zmm17[0],zmm13[2],zmm17[2],zmm13[4],zmm17[4],zmm13[6],zmm17[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm13, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,11,u,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,11,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm2, %zmm16 ; AVX512DQ-FAST-NEXT: movb $4, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,10,u,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm18 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm27 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm25 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm17 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k4} ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k5 -; AVX512DQ-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k5} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k4} ; AVX512DQ-FAST-NEXT: movb $64, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), 
%zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm29 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm9 {%k3} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm9, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,9,u,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 
{%k5} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512DQ-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm11 ; AVX512DQ-FAST-NEXT: movb $8, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k4} ; AVX512DQ-FAST-NEXT: movb $-31, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k3} +; 
AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm30 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k3} ; AVX512DQ-FAST-NEXT: movb $112, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm25, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm10, %zmm7 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: 
vinserti64x2 $3, 320(%rax), %zmm0, %zmm30 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm12, %zmm27 {%k2} ; AVX512DQ-FAST-NEXT: movb $56, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm21 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte 
Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm25 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -16308,135 +16308,134 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-FAST-NEXT: movb $120, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} ; AVX512DQ-FAST-NEXT: movb $-61, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm10 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2944(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 2880(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2816(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 2752(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 
2688(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 3008(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 2944(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 2880(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm10, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2688(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2560(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2368(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 2368(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 2240(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 2112(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 2112(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 1856(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1472(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 3520(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16447,10 +16446,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 3072(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQ-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 +; AVX512DQ-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -16458,761 +16457,758 @@ define void 
@store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: subq $6600, %rsp # imm = 0x19C8 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm9 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm10[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 256(%rdx), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm10[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm25, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm17, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm9[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
320(%rax), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm30[0],ymm4[2],ymm30[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm23, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm9[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm21, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 
%zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm23, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm23[0],ymm3[2],ymm23[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm25, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 
%zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm30, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm30, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm5, %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm5, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm4[0],zmm6[0],zmm4[2],zmm6[2],zmm4[4],zmm6[4],zmm4[6],zmm6[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm31[0],zmm29[0],zmm31[2],zmm29[2],zmm31[4],zmm29[4],zmm31[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm26 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 
%zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm24[0],zmm27[0],zmm24[2],zmm27[2],zmm24[4],zmm27[4],zmm24[6],zmm27[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm17[0],zmm22[0],zmm17[2],zmm22[2],zmm17[4],zmm22[4],zmm17[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm8, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
%zmm14, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
%zmm6, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm7[0],zmm0[0],zmm7[2],zmm0[2],zmm7[4],zmm0[4],zmm7[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movb $4, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 
64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,10,u,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm22, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm20 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm17 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm21 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: 
vpbroadcastq 456(%rcx), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,9,u,u,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512BW-ONLY-SLOW-NEXT: 
vpermi2q %zmm7, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,2,3,9,u,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm15, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,8,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: movb $8, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm0, %zmm26 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm29 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm20, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; 
AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm21, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -17220,171 +17216,170 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %cl ; AVX512BW-ONLY-SLOW-NEXT: 
kmovd %ecx, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} 
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm30[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm18 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm18 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm18[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; 
AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k1} +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2944(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2432(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1984(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1792(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 2816(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 2560(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2432(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 2368(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 
2112(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1984(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1600(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1344(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 640(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17393,16 +17388,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3520(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3520(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17413,7 +17408,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 3072(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) ; AVX512BW-ONLY-SLOW-NEXT: addq $6600, %rsp # imm = 0x19C8 @@ -17422,811 +17417,812 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 +; AVX512BW-ONLY-FAST-NEXT: subq $6632, %rsp # imm = 0x19E8 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm4[2,3,2,3],zmm8[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 
{%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm16, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm21[0],ymm12[2],ymm21[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm13[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm17, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 128(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm10[0],ymm26[0],ymm10[2],ymm26[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm17, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm22[0],ymm5[2],ymm22[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm7[2,3,2,3],zmm23[2,3,2,3] +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm16, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm27, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm18, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm19 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm7[0],ymm27[2],ymm7[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm8[2,3,2,3],zmm23[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm16, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm21, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %ymm30 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm30[0],ymm11[0],ymm30[2],ymm11[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm14[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm26, %ymm4, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm22, %ymm4, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm7, %ymm4, %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm11, %ymm4, %ymm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm1 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm17, %ymm0, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, 
%zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 
{%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm31[0],zmm2[2],zmm31[2],zmm2[4],zmm31[4],zmm2[6],zmm31[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm4, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm1[0],zmm17[0],zmm1[2],zmm17[2],zmm1[4],zmm17[4],zmm1[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm30, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm29[0],zmm15[0],zmm29[2],zmm15[2],zmm29[4],zmm15[4],zmm29[6],zmm15[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm30, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm13[0],zmm2[2],zmm13[2],zmm2[4],zmm13[4],zmm2[6],zmm13[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 
%zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm18, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm20, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm18, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm18, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm30, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: 
vpermi2q %zmm7, %zmm20, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm2, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: movb $4, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm14, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,10,u,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm20 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm26 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $64, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm3, %zmm30 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7> -; AVX512BW-ONLY-FAST-NEXT: 
vpermi2q %zmm25, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,9,u,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: movb $8, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 
%zmm10 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm24 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte 
Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %cl ; AVX512BW-ONLY-FAST-NEXT: 
kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm25 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm22 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: 
vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm12 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -18234,42 +18230,46 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $120, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] @@ -18278,98 +18278,92 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; 
AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2944(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2880(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2816(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2752(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2944(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2880(%rax) +; 
AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2688(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2432(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2368(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2112(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm10, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1792(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1664(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm9, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1152(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 960(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 576(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3520(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18380,943 +18374,940 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 +; AVX512BW-ONLY-FAST-NEXT: addq $6632, %rsp # imm = 0x19E8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $6472, %rsp # imm = 0x1948 +; AVX512DQBW-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: movb $96, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
(%r9), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm8[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm19, %zmm1 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 
%zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm10[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm28, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r9), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm15, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm16, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm27, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm27, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r9), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm10[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm28, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r9), %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm10 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm28, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r9), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q 
%zmm2, %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm27, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %ymm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm24[0],ymm2[2],ymm24[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm2[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm30, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm19, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm19, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm8, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm2 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm24 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm27 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm8, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 ; AVX512DQBW-SLOW-NEXT: movb $48, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k3} = zmm7[0],zmm8[0],zmm7[2],zmm8[2],zmm7[4],zmm8[4],zmm7[6],zmm8[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] ; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q 
%zmm4, %zmm9, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm27[0],zmm5[0],zmm27[2],zmm5[2],zmm27[4],zmm5[4],zmm27[6],zmm5[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm31[0],zmm7[0],zmm31[2],zmm7[2],zmm31[4],zmm7[4],zmm31[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm16[0],zmm15[0],zmm16[2],zmm15[2],zmm16[4],zmm15[4],zmm16[6],zmm15[6] +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm10[0],zmm13[0],zmm10[2],zmm13[2],zmm10[4],zmm13[4],zmm10[6],zmm13[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm15, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm15, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm7[0],zmm21[0],zmm7[2],zmm21[2],zmm7[4],zmm21[4],zmm7[6],zmm21[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm8, %zmm10 ; AVX512DQBW-SLOW-NEXT: movb $4, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm3, 
%zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm21 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm19 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm22 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm28 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] -; 
AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k5} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,9,u,u,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $64, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; 
AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r9), %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r8), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm1 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,4,8,u,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,9,u,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r9), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r8), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm6[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,8,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm14, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm7 ; AVX512DQBW-SLOW-NEXT: movb $8, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 
%zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm5 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k4} ; 
AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm16, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm19, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm20, %zmm5 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm22, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm28, %zmm25 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm4, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 
64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm5 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm10 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k3} ; AVX512DQBW-SLOW-NEXT: movb $56, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $-61, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 
-; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm2 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: 
vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; 
AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm30 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 3008(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2944(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 2560(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 2880(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 2816(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2432(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 2368(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2304(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 2176(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1984(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 
1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 1216(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3520(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 3520(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19327,815 +19318,817 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 3072(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQBW-SLOW-NEXT: addq $6472, %rsp # imm = 0x1948 +; AVX512DQBW-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 +; AVX512DQBW-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm26 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 
-; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm22, %zmm1 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm9 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %ymm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm4[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm2 -; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm15, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm2[0],ymm21[0],ymm2[2],ymm21[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm13[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %ymm25 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm25[0],ymm11[2],ymm25[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm22, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm16, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm3, %zmm14, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r9), %ymm9 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r8), %ymm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r8), %ymm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm13, %ymm27 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm7[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm22, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm9[0],ymm18[0],ymm9[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm7[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm10 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r9), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r8), %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r8), %ymm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm8[2,3,2,3],zmm24[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm28, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r9), %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %ymm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm29[0],ymm11[0],ymm29[2],ymm11[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm15[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm25, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q (%rsp), %ymm4, %ymm27 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm18, %ymm4, %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm7, %ymm4, %ymm5 ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm11, %ymm4, %ymm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r9), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r8), %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512DQBW-FAST-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r8), %ymm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $48, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm1[0],zmm31[0],zmm1[2],zmm31[2],zmm1[4],zmm31[4],zmm1[6],zmm31[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm26, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 
%zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm21[0],zmm2[2],zmm21[2],zmm2[4],zmm21[4],zmm2[6],zmm21[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm13[0],zmm2[2],zmm13[2],zmm2[4],zmm13[4],zmm2[6],zmm13[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm28 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm24[0],zmm12[0],zmm24[2],zmm12[2],zmm24[4],zmm12[4],zmm24[6],zmm12[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm18, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm28, 
%zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm23, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm6 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7> -; 
AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm13[0],zmm17[0],zmm13[2],zmm17[2],zmm13[4],zmm17[4],zmm13[6],zmm17[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm13, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm2, %zmm16 ; AVX512DQBW-FAST-NEXT: movb $4, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,10,u,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm18 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm25 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm17 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k4} ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k5} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 
%zmm20, %zmm12 {%k4} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k4} ; AVX512DQBW-FAST-NEXT: movb $64, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm29 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm9 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm9, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,9,u,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q 
%zmm26, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,9,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm11 ; AVX512DQBW-FAST-NEXT: movb $8, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k4} ; AVX512DQBW-FAST-NEXT: movb $-31, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k3} +; 
AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm30 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k3} ; AVX512DQBW-FAST-NEXT: movb $112, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm25, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm0 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} -; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm10, %zmm7 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm30 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm12, %zmm27 {%k2} ; AVX512DQBW-FAST-NEXT: movb $56, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm21 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm25 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -20149,135 +20142,134 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQBW-FAST-NEXT: movb $120, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} ; AVX512DQBW-FAST-NEXT: movb $-61, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm10 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 2944(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 2880(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2816(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 2752(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 3008(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 2944(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 2880(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm10, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2688(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2560(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2368(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 2368(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 2240(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 2112(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 2112(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 
1856(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1472(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; 
AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 3520(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -20288,10 +20280,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 3072(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQBW-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 +; AVX512DQBW-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index 293a3447577f76..e14ad35280cf12 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -153,61 +153,61 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm14 +; SSE-NEXT: movaps (%rsi), %xmm8 +; SSE-NEXT: movaps 16(%rsi), %xmm9 ; SSE-NEXT: movaps (%rdx), %xmm1 ; SSE-NEXT: movaps 16(%rdx), %xmm3 -; SSE-NEXT: movaps (%rcx), %xmm7 -; SSE-NEXT: movaps 16(%rcx), %xmm12 -; SSE-NEXT: movaps (%r8), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps 16(%rcx), %xmm10 +; SSE-NEXT: movaps (%r8), %xmm6 ; SSE-NEXT: movaps 16(%r8), %xmm0 -; SSE-NEXT: movaps (%r9), %xmm13 -; SSE-NEXT: movaps (%r10), %xmm6 -; SSE-NEXT: movaps 16(%r10), %xmm9 -; SSE-NEXT: movaps (%rax), %xmm15 -; SSE-NEXT: movaps 16(%rax), %xmm11 -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = 
xmm2[1],xmm14[1] +; SSE-NEXT: movaps (%r9), %xmm12 +; SSE-NEXT: movaps (%r10), %xmm7 +; SSE-NEXT: movaps 16(%r10), %xmm13 +; SSE-NEXT: movaps (%rax), %xmm14 +; SSE-NEXT: movaps 16(%rax), %xmm15 +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm5[0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] ; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] -; SSE-NEXT: movaps %xmm5, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: movaps 16(%r9), %xmm11 -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: movaps %xmm13, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm15[1] +; SSE-NEXT: movaps 16(%r9), %xmm15 +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 224(%rax) -; SSE-NEXT: movaps %xmm9, 240(%rax) -; SSE-NEXT: movaps %xmm8, 160(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm5, 96(%rax) -; SSE-NEXT: movaps %xmm6, 112(%rax) -; SSE-NEXT: movaps %xmm15, 32(%rax) -; SSE-NEXT: movaps %xmm14, 48(%rax) +; SSE-NEXT: movaps %xmm13, 240(%rax) +; SSE-NEXT: movaps %xmm11, 160(%rax) +; SSE-NEXT: movaps %xmm12, 176(%rax) +; SSE-NEXT: movaps %xmm6, 96(%rax) +; SSE-NEXT: movaps %xmm7, 112(%rax) +; SSE-NEXT: movaps %xmm14, 32(%rax) +; SSE-NEXT: movaps %xmm9, 48(%rax) ; SSE-NEXT: movaps %xmm2, 192(%rax) ; SSE-NEXT: movaps %xmm3, 208(%rax) -; SSE-NEXT: movaps %xmm12, 128(%rax) -; SSE-NEXT: movaps %xmm10, 144(%rax) +; SSE-NEXT: movaps %xmm10, 128(%rax) +; SSE-NEXT: movaps %xmm8, 144(%rax) ; SSE-NEXT: movaps %xmm4, 64(%rax) ; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: movaps %xmm5, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: retq @@ -386,58 +386,58 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm9 -; SSE-NEXT: movaps (%rsi), %xmm3 -; SSE-NEXT: movaps 16(%rsi), %xmm0 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rsi), %xmm1 ; SSE-NEXT: movaps (%rdx), %xmm8 -; SSE-NEXT: movaps 16(%rdx), %xmm11 -; SSE-NEXT: movaps (%rcx), 
%xmm4 -; SSE-NEXT: movaps 16(%rcx), %xmm1 +; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps (%rcx), %xmm2 +; SSE-NEXT: movaps 16(%rcx), %xmm3 ; SSE-NEXT: movaps (%r8), %xmm10 -; SSE-NEXT: movaps 16(%r8), %xmm13 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps 16(%r9), %xmm2 -; SSE-NEXT: movaps (%r10), %xmm12 +; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps (%r9), %xmm4 +; SSE-NEXT: movaps 16(%r9), %xmm5 +; SSE-NEXT: movaps (%r10), %xmm13 ; SSE-NEXT: movaps 16(%r10), %xmm15 ; SSE-NEXT: movaps (%rax), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte 
Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm13 @@ -446,21 +446,21 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 32(%rdx), %xmm11 +; SSE-NEXT: movaps 32(%rdx), %xmm12 ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 32(%r8), %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 32(%r10), %xmm8 -; SSE-NEXT: movaps 32(%rax), %xmm1 -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps 32(%rax), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 48(%rdi), %xmm6 ; SSE-NEXT: movaps 48(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm6, %xmm9 @@ -491,10 +491,10 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm7, 400(%rax) ; SSE-NEXT: movaps %xmm9, 384(%rax) ; SSE-NEXT: movaps %xmm8, 368(%rax) -; SSE-NEXT: movaps %xmm10, 352(%rax) -; SSE-NEXT: movaps %xmm11, 336(%rax) +; SSE-NEXT: movaps %xmm11, 352(%rax) +; SSE-NEXT: movaps %xmm12, 336(%rax) ; SSE-NEXT: movaps %xmm13, 320(%rax) -; SSE-NEXT: movaps %xmm12, 304(%rax) +; SSE-NEXT: movaps %xmm10, 304(%rax) ; SSE-NEXT: movaps %xmm14, 288(%rax) ; SSE-NEXT: movaps %xmm15, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -557,46 +557,46 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 +; 
AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm8[1],ymm2[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm7 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2],ymm10[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 ; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] @@ -611,36 +611,36 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm15[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) @@ -651,7 +651,7 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 
160(%rdx) ; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -664,106 +664,106 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm5 ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm8[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; 
AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm15[0],xmm14[0] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm8[0] ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm15, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm13[0],xmm11[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm15 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 ; 
AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm13 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) @@ -783,57 +783,57 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm8 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%r11), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%r11), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512F-NEXT: movb $-64, %r8b ; AVX512F-NEXT: kmovw %r8d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm17 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm9, %ymm9 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm17 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15] ; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; 
AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm13 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] ; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 @@ -841,54 +841,54 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: 
vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512F-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX512F-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 384(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -898,57 +898,57 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm3 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: movb $-64, %r8b ; AVX512BW-NEXT: kmovd %r8d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm17 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm9, %ymm9 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15] ; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: 
vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] ; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 @@ -956,54 +956,54 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512BW-NEXT: 
vmovdqa (%rdx), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1034,59 +1034,59 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movaps 16(%rdx), %xmm10 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps 16(%rcx), %xmm1 -; SSE-NEXT: movaps (%r8), %xmm11 -; SSE-NEXT: movaps 16(%r8), %xmm12 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps 16(%r9), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rsi), %xmm1 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps (%rcx), %xmm2 +; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm10 +; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps (%r9), %xmm4 +; SSE-NEXT: movaps 16(%r9), %xmm5 ; SSE-NEXT: movaps (%r10), %xmm13 ; SSE-NEXT: movaps 16(%r10), %xmm15 ; SSE-NEXT: movaps (%rax), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps %xmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm2 @@ -1194,43 +1194,43 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm2 +; SSE-NEXT: movaps 80(%r10), %xmm14 ; SSE-NEXT: movaps 80(%rax), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm12 ; SSE-NEXT: movaps 96(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdx), %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdx), %xmm15 ; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 96(%r8), %xmm11 -; SSE-NEXT: movaps 96(%r9), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 96(%r9), %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; SSE-NEXT: movaps 96(%r10), %xmm9 ; SSE-NEXT: movaps 96(%rax), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps 112(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 112(%rdx), %xmm5 ; SSE-NEXT: movaps 112(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 112(%r8), %xmm1 ; SSE-NEXT: movaps 112(%r9), %xmm2 @@ -1246,22 +1246,22 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, 1008(%rax) ; SSE-NEXT: movaps %xmm1, 992(%rax) ; SSE-NEXT: movaps %xmm5, 976(%rax) -; SSE-NEXT: movaps %xmm7, 960(%rax) +; SSE-NEXT: movaps %xmm6, 960(%rax) ; SSE-NEXT: movaps %xmm0, 944(%rax) ; SSE-NEXT: movaps %xmm4, 928(%rax) -; SSE-NEXT: movaps %xmm6, 912(%rax) +; SSE-NEXT: movaps %xmm7, 912(%rax) ; 
SSE-NEXT: movaps %xmm8, 896(%rax) ; SSE-NEXT: movaps %xmm9, 880(%rax) ; SSE-NEXT: movaps %xmm11, 864(%rax) -; SSE-NEXT: movaps %xmm10, 848(%rax) -; SSE-NEXT: movaps %xmm13, 832(%rax) -; SSE-NEXT: movaps %xmm12, 816(%rax) -; SSE-NEXT: movaps %xmm14, 800(%rax) -; SSE-NEXT: movaps %xmm15, 784(%rax) +; SSE-NEXT: movaps %xmm15, 848(%rax) +; SSE-NEXT: movaps %xmm12, 832(%rax) +; SSE-NEXT: movaps %xmm10, 816(%rax) +; SSE-NEXT: movaps %xmm13, 800(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 768(%rax) +; SSE-NEXT: movaps %xmm0, 784(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 752(%rax) +; SSE-NEXT: movaps %xmm0, 768(%rax) +; SSE-NEXT: movaps %xmm14, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 736(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1492,10 +1492,10 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -1504,38 +1504,38 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 
80(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; AVX1-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] @@ -1545,18 +1545,18 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%rax), %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -1573,9 +1573,9 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 704(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 672(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 640(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 704(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 672(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 640(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1584,8 +1584,8 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 448(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) @@ -1597,8 +1597,8 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1645,10 +1645,10 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm15 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -1662,13 +1662,13 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vmovaps 64(%rsi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm9 @@ -1696,43 +1696,43 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%r10), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%r10), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm15, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm11[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm13, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm14[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm12, %ymm12 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm7, %ymm7 @@ -1757,58 +1757,58 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; 
AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm7 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm8 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rax), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] @@ -1818,18 +1818,18 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rax), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm10[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm10 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm10[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -1838,16 +1838,16 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm12, 928(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 896(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 736(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 704(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 672(%rdx) -; 
AVX2-ONLY-NEXT: vmovaps %ymm6, 640(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 704(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 672(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 640(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 480(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1894,201 +1894,201 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm3 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm19 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm31 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm27 ; AVX512F-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%r11), %zmm30 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm29 +; AVX512F-NEXT: vmovdqa64 64(%r11), %zmm31 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm28 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm18, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm18, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm18, %zmm4 ; AVX512F-NEXT: movb $-64, %r8b ; AVX512F-NEXT: kmovw %r8d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm23, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm22, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm12, %zmm21 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm22 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm13 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm29[1],zmm6[3],zmm29[3],zmm6[5],zmm29[5],zmm6[7],zmm29[7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm2 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,13,5,13] +; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm23 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm14, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm15 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[4,12,4,12] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm24 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm13, %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm1 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm20 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm9 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] ; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm18, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm16, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm23, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm16, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, 
%zmm10, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vpermt2q %zmm19, %zmm7, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm22, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vpermi2q %zmm26, %zmm13, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm26, %zmm13, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm31[1],zmm28[1],zmm31[3],zmm28[3],zmm31[5],zmm28[5],zmm31[7],zmm28[7] ; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] -; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = 
zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] -; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm26, %zmm13, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm5, %zmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm11 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512F-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm23 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512F-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm14[1],ymm22[3],ymm14[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm9, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = 
ymm22[0],ymm14[0],ymm22[2],ymm14[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm25[1],ymm15[1],ymm25[3],ymm15[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm9, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 704(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 832(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm17, 384(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm20, 448(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) @@ -2101,201 +2101,201 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 ; 
AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r11), %zmm30 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm29 +; AVX512BW-NEXT: vmovdqa64 64(%r11), %zmm31 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm28 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm4 ; AVX512BW-NEXT: movb $-64, %r8b ; AVX512BW-NEXT: kmovd %r8d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm23, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm12, %zmm21 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; 
AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm22 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm13 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm29[1],zmm6[3],zmm29[3],zmm6[5],zmm29[5],zmm6[7],zmm29[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm2 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,13,5,13] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm23 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm15 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm24 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm13, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm1 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, 
%zmm5, %zmm20 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm20 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm9 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm23, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm16, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = 
zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm13, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm13, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm31[1],zmm28[1],zmm31[3],zmm28[3],zmm31[5],zmm28[5],zmm31[7],zmm28[7] ; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm13, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; 
AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm11 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm23 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm14[1],ymm22[3],ymm14[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm14[0],ymm22[2],ymm14[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm25[1],ymm15[1],ymm25[3],ymm15[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512BW-NEXT: 
vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm9, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 832(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) @@ -2328,61 +2328,61 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm1 -; SSE-NEXT: movaps 16(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movaps 16(%rdx), %xmm10 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps (%r8), %xmm11 -; SSE-NEXT: movaps 16(%r8), %xmm12 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps 16(%r9), %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rsi), %xmm1 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps (%rcx), %xmm2 +; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm10 +; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps (%r9), %xmm4 +; SSE-NEXT: movaps 16(%r9), %xmm5 ; SSE-NEXT: movaps (%r10), %xmm13 -; SSE-NEXT: movaps 16(%r10), %xmm14 +; SSE-NEXT: movaps 16(%r10), %xmm15 ; SSE-NEXT: movaps (%rax), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 32(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm1 @@ -2712,24 +2712,24 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r10), %xmm2 +; SSE-NEXT: movaps 208(%r10), %xmm15 ; SSE-NEXT: movaps 208(%rax), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm11 ; SSE-NEXT: movaps 224(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdx), %xmm14 +; SSE-NEXT: movaps 224(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdx), %xmm10 -; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 224(%r8), %xmm12 ; SSE-NEXT: movaps 224(%r9), %xmm0 ; SSE-NEXT: movaps %xmm12, %xmm13 @@ -2737,19 +2737,19 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 224(%r10), %xmm8 ; SSE-NEXT: movaps 224(%rax), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 240(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rdi), %xmm6 ; SSE-NEXT: movaps 240(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movaps %xmm6, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 240(%rdx), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdx), %xmm5 ; SSE-NEXT: movaps 240(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 240(%r8), %xmm1 ; SSE-NEXT: movaps 240(%r9), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 @@ -2763,23 +2763,23 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm2, 2032(%rax) ; SSE-NEXT: movaps %xmm1, 2016(%rax) -; SSE-NEXT: movaps %xmm6, 2000(%rax) -; SSE-NEXT: movaps %xmm5, 1984(%rax) +; SSE-NEXT: movaps %xmm5, 2000(%rax) +; SSE-NEXT: movaps %xmm6, 1984(%rax) ; SSE-NEXT: movaps %xmm0, 1968(%rax) ; SSE-NEXT: movaps %xmm4, 1952(%rax) ; SSE-NEXT: movaps %xmm7, 1936(%rax) ; SSE-NEXT: movaps %xmm9, 1920(%rax) ; SSE-NEXT: movaps %xmm8, 1904(%rax) ; SSE-NEXT: movaps %xmm12, 1888(%rax) -; SSE-NEXT: movaps %xmm10, 1872(%rax) -; SSE-NEXT: movaps %xmm14, 1856(%rax) -; SSE-NEXT: movaps %xmm11, 1840(%rax) +; 
SSE-NEXT: movaps %xmm14, 1872(%rax) +; SSE-NEXT: movaps %xmm11, 1856(%rax) +; SSE-NEXT: movaps %xmm10, 1840(%rax) ; SSE-NEXT: movaps %xmm13, 1824(%rax) -; SSE-NEXT: movaps %xmm15, 1808(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1792(%rax) +; SSE-NEXT: movaps %xmm0, 1808(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1776(%rax) +; SSE-NEXT: movaps %xmm0, 1792(%rax) +; SSE-NEXT: movaps %xmm15, 1776(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1760(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3007,7 +3007,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1672, %rsp # imm = 0x688 +; AVX1-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 @@ -3250,10 +3250,12 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -3274,7 +3276,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -3313,7 +3316,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3334,12 +3337,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = 
xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 152(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 152(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -3356,81 +3358,79 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm4[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 208(%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 208(%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 208(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 208(%rax), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 240(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%r9), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 240(%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = 
ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2016(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1984(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 1952(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 1920(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 1952(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 1920(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3439,10 +3439,9 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1824(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1792(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 1760(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 1728(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1696(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 1760(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 1728(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 1696(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3453,8 +3452,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1568(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1536(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 1504(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1472(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 1504(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 1472(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1440(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3467,7 +3466,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1312(%rdx) ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1280(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 1248(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 1248(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1216(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%rdx) @@ -3481,8 +3480,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 992(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 960(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 992(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 960(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3495,8 +3494,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 800(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 736(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 704(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 736(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 704(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3509,8 +3508,9 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 480(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3523,7 +3523,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) @@ -3539,7 +3539,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: addq $1672, %rsp # imm = 0x688 +; AVX1-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3848,7 +3848,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; 
AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -3865,7 +3865,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -3966,70 +3966,70 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 208(%rax), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm6 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vbroadcastsd 
248(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm4 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm2, 2016(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1984(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1984(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 1952(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1920(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1920(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 1760(%rdx) -; 
AVX2-ONLY-NEXT: vmovaps %ymm13, 1728(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 1728(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 1696(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -4041,24 +4041,24 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 1248(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1216(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1216(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 992(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 960(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 992(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 960(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 736(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 704(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 704(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 480(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) @@ -4142,461 +4142,463 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i64_stride8_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm25 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512F-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm21 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm24 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm21 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm22 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512F-NEXT: vmovaps 192(%rdx), %zmm1 +; AVX512F-NEXT: vmovups %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovaps 128(%rdx), %zmm1 +; AVX512F-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm31 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm29 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm19 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm18 ; AVX512F-NEXT: movb $-64, %r11b ; AVX512F-NEXT: kmovw %r11d, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm7, %zmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm4 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm19[1],zmm31[3],zmm19[3],zmm31[5],zmm19[5],zmm31[7],zmm19[7] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm9, %zmm4 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 +; 
AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm8[0],zmm10[2],zmm8[2],zmm10[4],zmm8[4],zmm10[6],zmm8[6] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm12 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] ; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; 
AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm6, %zmm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm6, %zmm11 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm7, %zmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm29[0],zmm23[0],zmm29[2],zmm23[2],zmm29[4],zmm23[4],zmm29[6],zmm23[6] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] +; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm9, %zmm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 = 
zmm29[1],zmm23[1],zmm29[3],zmm23[3],zmm29[5],zmm23[5],zmm29[7],zmm23[7] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,13,5,13] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm30[0],zmm18[0],zmm30[2],zmm18[2],zmm30[4],zmm18[4],zmm30[6],zmm18[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm6, %zmm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm18[1],zmm30[3],zmm18[3],zmm30[5],zmm18[5],zmm30[7],zmm18[7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm30 -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm16 +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm21 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm14 -; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm26 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm1 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm24 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm15, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm6, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm20 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512F-NEXT: 
vmovdqa64 %zmm16, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: 
vpermt2q %zmm19, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm25, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm5[0],zmm16[2],zmm5[2],zmm16[4],zmm5[4],zmm16[6],zmm5[6] +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm5[1],zmm16[3],zmm5[3],zmm16[5],zmm5[5],zmm16[7],zmm5[7] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm14[0],zmm26[0],zmm14[2],zmm26[2],zmm14[4],zmm26[4],zmm14[6],zmm26[6] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm14[1],zmm26[1],zmm14[3],zmm26[3],zmm14[5],zmm26[5],zmm14[7],zmm26[7] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm25, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm24, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm24, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm24, %zmm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm26 = zmm24[0],zmm27[0],zmm24[2],zmm27[2],zmm24[4],zmm27[4],zmm24[6],zmm27[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm27[1],zmm24[3],zmm27[3],zmm24[5],zmm27[5],zmm24[7],zmm27[7] +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512F-NEXT: vinserti128 $1, (%rcx), 
%ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512F-NEXT: vinserti32x4 $1, (%rdx), %ymm20, %ymm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm7, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm20[1],ymm0[1],ymm20[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} ; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 ; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm7, %ymm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqa 128(%rsi), %xmm0 ; AVX512F-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: 
vinserti64x4 $0, %ymm5, %zmm7, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512F-NEXT: vinserti128 $1, 128(%rdx), %ymm7, %ymm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm5, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rsi), %xmm7 -; AVX512F-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 -; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm20 ; AVX512F-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512F-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512F-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm31 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] -; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm20 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm7 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512F-NEXT: 
vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm5[0],ymm8[2],ymm5[2] +; AVX512F-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm21[0],ymm15[0],ymm21[2],ymm15[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %ymm16 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm20 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm13 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] -; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm20 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm21[1],ymm15[1],ymm21[3],ymm15[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm9 -; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = 
ymm11[2,3],ymm9[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm15[0],ymm21[2],ymm15[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm21[1],ymm15[1],ymm21[3],ymm15[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm10 +; AVX512F-NEXT: vmovdqa 128(%rdx), %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm19 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm21[0],ymm19[0],ymm21[2],ymm19[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm19[1],ymm21[3],ymm19[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm11 +; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512F-NEXT: vmovdqa 192(%rsi), %ymm15 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm24 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512F-NEXT: 
vmovaps %zmm10, 1920(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 1472(%rax) -; AVX512F-NEXT: vmovaps %zmm19, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1088(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 1984(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1920(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1856(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 1792(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 1600(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm30, 1536(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512F-NEXT: vmovaps %zmm23, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1344(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm22, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 1024(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -4605,8 +4607,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -4617,471 +4618,473 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm20 
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm24 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovaps 128(%rdx), %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm29 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm19 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm18 ; AVX512BW-NEXT: movb $-64, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 
= [4,12,4,12] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm4 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm19[1],zmm31[3],zmm19[3],zmm31[5],zmm19[5],zmm31[7],zmm19[7] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm9, %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm8[0],zmm10[2],zmm8[2],zmm10[4],zmm8[4],zmm10[6],zmm8[6] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm12 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] ; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm11 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm5 +; AVX512BW-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm29[0],zmm23[0],zmm29[2],zmm23[2],zmm29[4],zmm23[4],zmm29[6],zmm23[6] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm9, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm29[1],zmm23[1],zmm29[3],zmm23[3],zmm29[5],zmm23[5],zmm29[7],zmm23[7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,13,5,13] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm30[0],zmm18[0],zmm30[2],zmm18[2],zmm30[4],zmm18[4],zmm30[6],zmm18[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2q 
%zmm23, %zmm6, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm18[1],zmm30[3],zmm18[3],zmm30[5],zmm18[5],zmm30[7],zmm18[7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm30 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm21 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; 
AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm13 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm14 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm24 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm20 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm29 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 -; 
AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm5[0],zmm16[2],zmm5[2],zmm16[4],zmm5[4],zmm16[6],zmm5[6] +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm5[1],zmm16[3],zmm5[3],zmm16[5],zmm5[5],zmm16[7],zmm5[7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm14[0],zmm26[0],zmm14[2],zmm26[2],zmm14[4],zmm26[4],zmm14[6],zmm26[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm14[1],zmm26[1],zmm14[3],zmm26[3],zmm14[5],zmm26[5],zmm14[7],zmm26[7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm24, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm24, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm24, %zmm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm26 = zmm24[0],zmm27[0],zmm24[2],zmm27[2],zmm24[4],zmm27[4],zmm24[6],zmm27[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm27[1],zmm24[3],zmm27[3],zmm24[5],zmm27[5],zmm24[7],zmm27[7] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-NEXT: vinserti32x4 $1, (%rdx), %ymm20, %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm7, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm20[1],ymm0[1],ymm20[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} ; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 
64(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm7, %ymm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm7, %ymm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm5, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} 
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm17 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm20 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] -; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm7 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm5[0],ymm8[2],ymm5[2] +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm21[0],ymm15[0],ymm21[2],ymm15[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm16 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm13 -; AVX512BW-NEXT: 
vmovdqa64 128(%rdx), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm21[1],ymm15[1],ymm21[3],ymm15[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm9 -; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm15[0],ymm21[2],ymm15[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm21[1],ymm15[1],ymm21[3],ymm15[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm10 +; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm19 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm21[0],ymm19[0],ymm21[2],ymm19[2] +; 
AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm19[1],ymm21[3],ymm19[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm11 +; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512BW-NEXT: vmovdqa 192(%rsi), %ymm15 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512BW-NEXT: vmovaps %zmm10, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1472(%rax) -; AVX512BW-NEXT: vmovaps %zmm19, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1088(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1984(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1856(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512BW-NEXT: vmovaps %zmm23, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 1024(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5090,8 
+5093,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5102,11 +5104,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -5136,61 +5138,61 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm1 -; SSE-NEXT: movaps 16(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movaps 16(%rdx), %xmm10 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps 16(%r8), %xmm12 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rsi), %xmm1 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps (%rcx), %xmm2 +; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: movaps 16(%r8), %xmm14 ; SSE-NEXT: movaps (%r8), %xmm11 ; SSE-NEXT: movaps 16(%r9), %xmm4 ; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps 16(%r10), %xmm14 +; SSE-NEXT: movaps 16(%r10), %xmm15 ; SSE-NEXT: movaps (%r10), %xmm13 ; SSE-NEXT: movaps (%rax), %xmm6 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: movaps %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 32(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm1 @@ -5968,34 +5970,34 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%r10), %xmm2 +; SSE-NEXT: movaps 464(%r10), %xmm15 ; SSE-NEXT: movaps 464(%rax), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movaps 480(%rdi), %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 480(%rdi), %xmm12 ; SSE-NEXT: movaps 480(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdx), %xmm10 -; SSE-NEXT: movaps 480(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 480(%rdx), %xmm13 +; SSE-NEXT: movaps 480(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] ; SSE-NEXT: movaps 480(%r8), %xmm9 -; SSE-NEXT: movaps 480(%r9), %xmm0 +; SSE-NEXT: movaps 480(%r9), %xmm1 ; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 480(%r10), %xmm11 -; SSE-NEXT: movaps 480(%rax), %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps 480(%r10), %xmm10 +; SSE-NEXT: movaps 480(%rax), %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm7 ; SSE-NEXT: movaps 496(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm7, %xmm8 @@ -6025,17 +6027,17 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm4, 4000(%rcx) ; SSE-NEXT: movaps %xmm6, 3984(%rcx) ; SSE-NEXT: movaps %xmm8, 3968(%rcx) -; SSE-NEXT: movaps %xmm11, 3952(%rcx) +; SSE-NEXT: movaps %xmm10, 3952(%rcx) ; SSE-NEXT: movaps %xmm9, 3936(%rcx) -; SSE-NEXT: movaps %xmm10, 3920(%rcx) -; SSE-NEXT: movaps %xmm13, 3904(%rcx) -; SSE-NEXT: movaps %xmm12, 3888(%rcx) +; SSE-NEXT: movaps %xmm13, 3920(%rcx) +; SSE-NEXT: movaps %xmm12, 3904(%rcx) +; SSE-NEXT: movaps %xmm11, 3888(%rcx) ; SSE-NEXT: movaps %xmm14, 3872(%rcx) -; SSE-NEXT: movaps %xmm15, 3856(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 3840(%rcx) +; SSE-NEXT: movaps %xmm0, 3856(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 3824(%rcx) +; SSE-NEXT: movaps %xmm0, 3840(%rcx) +; SSE-NEXT: movaps %xmm15, 3824(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 3808(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6519,7 +6521,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3784, %rsp # imm = 0xEC8 +; AVX1-ONLY-NEXT: subq $3720, %rsp # imm = 0xE88 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 @@ -6550,58 +6552,58 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm6[0],xmm5[0] ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6986,10 +6988,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = 
xmm1[0],xmm0[0] @@ -7006,10 +7008,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7026,10 +7028,12 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7266,12 +7270,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 464(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7288,40 +7290,42 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 496(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 
496(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps 496(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 496(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps 496(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm5[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vbroadcastsd 504(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -7360,71 +7364,69 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = 
ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm3, 4064(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 4032(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 4000(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 3968(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3936(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3904(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3872(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3840(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 3808(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 3776(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3744(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3712(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3680(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3648(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3616(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 3584(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 3552(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 
3520(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 3488(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 3456(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 3424(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 3392(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 3360(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 3328(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 3296(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 3264(%rdx) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 4064(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 4032(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 4000(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 3968(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3936(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3904(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3872(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3840(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 3808(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 3776(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3744(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3712(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3680(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3648(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3616(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3584(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 3552(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 3520(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3488(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3456(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3424(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3392(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3360(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 3328(%rdx) 
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 3296(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 3264(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3232(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7437,8 +7439,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3104(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3072(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 3040(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 3008(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 3040(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 3008(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2976(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7451,8 +7453,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2848(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2816(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 2784(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 2752(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 2784(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 2752(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2720(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7465,8 +7467,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2592(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2560(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 2528(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 2496(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 2528(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 2496(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2464(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7623,7 +7625,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: addq $3784, %rsp # imm = 0xEC8 +; AVX1-ONLY-NEXT: addq $3720, %rsp # imm = 0xE88 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -7849,14 +7851,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 288(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8129,10 +8131,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 288(%rdx), %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -8293,7 +8295,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -8552,7 +8554,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] @@ -8572,7 +8574,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -8605,7 +8607,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = 
ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8634,7 +8636,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm9, 4064(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 4032(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 4000(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 4000(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm9, 3968(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3808(%rdx) @@ -8673,7 +8675,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2464(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2432(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 2272(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 2272(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2240(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8878,7905 +8880,7929 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512F-ONLY-SLOW-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), 
%zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: 
vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm9, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm31[1],zmm26[1],zmm31[3],zmm26[3],zmm31[5],zmm26[5],zmm31[7],zmm26[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm23 
= mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm22[0],zmm24[0],zmm22[2],zmm24[2],zmm22[4],zmm24[4],zmm22[6],zmm24[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
128(%rax), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm22[1],zmm24[1],zmm22[3],zmm24[3],zmm22[5],zmm24[5],zmm22[7],zmm24[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq 
{{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; 
AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = 
zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm20[0],zmm8[2],zmm20[2],zmm8[4],zmm20[4],zmm8[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm8[1],zmm20[1],zmm8[3],zmm20[3],zmm8[5],zmm20[5],zmm8[7],zmm20[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512F-ONLY-SLOW-NEXT: 
vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512F-ONLY-SLOW-NEXT: 
vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm0[0],zmm25[0],zmm0[2],zmm25[2],zmm0[4],zmm25[4],zmm0[6],zmm25[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm25[1],zmm0[3],zmm25[3],zmm0[5],zmm25[5],zmm0[7],zmm25[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q 
%zmm6, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm18[0],zmm10[0],zmm18[2],zmm10[2],zmm18[4],zmm10[4],zmm18[6],zmm10[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm18[1],zmm10[1],zmm18[3],zmm10[3],zmm18[5],zmm10[5],zmm18[7],zmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm27[0],zmm13[0],zmm27[2],zmm13[2],zmm27[4],zmm13[4],zmm27[6],zmm13[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm13[1],zmm27[3],zmm13[3],zmm27[5],zmm13[5],zmm27[7],zmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm14[0],zmm2[0],zmm14[2],zmm2[2],zmm14[4],zmm2[4],zmm14[6],zmm2[6] +; 
AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm14[1],zmm2[1],zmm14[3],zmm2[3],zmm14[5],zmm2[5],zmm14[7],zmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm16[0],zmm6[0],zmm16[2],zmm6[2],zmm16[4],zmm6[4],zmm16[6],zmm6[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm16[1],zmm6[1],zmm16[3],zmm6[3],zmm16[5],zmm6[5],zmm16[7],zmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm22[0],zmm8[0],zmm22[2],zmm8[2],zmm22[4],zmm8[4],zmm22[6],zmm8[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm23 {%k1} = zmm22[1],zmm8[1],zmm22[3],zmm8[3],zmm22[5],zmm8[5],zmm22[7],zmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; 
AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm6[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq 
{{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; 
AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm15[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 ; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm24 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm1, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm3, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm21 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rdx), %ymm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 
%zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rdx), %ymm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 384(%rcx), %ymm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm29 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm29, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm29[0],ymm8[0],ymm29[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm9, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm29[1],ymm8[1],ymm29[3],ymm8[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm29 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm29, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3776(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3712(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3264(%rax) +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm23, 3200(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 4032(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 3968(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 3904(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 3840(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3648(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 3136(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3072(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2560(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2432(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2368(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm28, 2112(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1792(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512F-ONLY-FAST-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
(%rsi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm9, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm1, %zmm8, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm31[1],zmm26[1],zmm31[3],zmm26[3],zmm31[5],zmm26[5],zmm31[7],zmm26[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] 
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm22[0],zmm24[0],zmm22[2],zmm24[2],zmm22[4],zmm24[4],zmm22[6],zmm24[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm22[1],zmm24[1],zmm22[3],zmm24[3],zmm22[5],zmm24[5],zmm22[7],zmm24[7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q 
%zmm26, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm20[0],zmm8[2],zmm20[2],zmm8[4],zmm20[4],zmm8[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm8[1],zmm20[1],zmm8[3],zmm20[3],zmm8[5],zmm20[5],zmm8[7],zmm20[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm0[0],zmm25[0],zmm0[2],zmm25[2],zmm0[4],zmm25[4],zmm0[6],zmm25[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm25[1],zmm0[3],zmm25[3],zmm0[5],zmm25[5],zmm0[7],zmm25[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm7, %zmm8, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm18[0],zmm10[0],zmm18[2],zmm10[2],zmm18[4],zmm10[4],zmm18[6],zmm10[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm18[1],zmm10[1],zmm18[3],zmm10[3],zmm18[5],zmm10[5],zmm18[7],zmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm27[0],zmm13[0],zmm27[2],zmm13[2],zmm27[4],zmm13[4],zmm27[6],zmm13[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm13[1],zmm27[3],zmm13[3],zmm27[5],zmm13[5],zmm27[7],zmm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded 
Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm14[0],zmm2[0],zmm14[2],zmm2[2],zmm14[4],zmm2[4],zmm14[6],zmm2[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm14[1],zmm2[1],zmm14[3],zmm2[3],zmm14[5],zmm2[5],zmm14[7],zmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm16[0],zmm6[0],zmm16[2],zmm6[2],zmm16[4],zmm6[4],zmm16[6],zmm6[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm16[1],zmm6[1],zmm16[3],zmm6[3],zmm16[5],zmm6[5],zmm16[7],zmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm22[0],zmm8[0],zmm22[2],zmm8[2],zmm22[4],zmm8[4],zmm22[6],zmm8[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm23 {%k1} = zmm22[1],zmm8[1],zmm22[3],zmm8[3],zmm22[5],zmm8[5],zmm22[7],zmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = 
ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] 
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm26 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = 
ymm15[0],ymm11[0],ymm15[2],ymm11[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm15[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 
{{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm24 ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: 
vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm1, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm3, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 
# 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 256(%rdx), %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 320(%rdx), %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 384(%rcx), %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm29 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm29, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = 
ymm29[0],ymm8[0],ymm29[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm9, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm29[1],ymm8[1],ymm29[3],ymm8[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm29 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm29, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3776(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3712(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3264(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 3200(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 4032(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 3968(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 3904(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 3840(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3648(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 3136(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3072(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2496(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2112(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512DQ-SLOW-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 ; AVX512DQ-SLOW-NEXT: movb $-64, %r11b ; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, 
%zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; 
AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm9, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm31[1],zmm26[1],zmm31[3],zmm26[3],zmm31[5],zmm26[5],zmm31[7],zmm26[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm22[0],zmm24[0],zmm22[2],zmm24[2],zmm22[4],zmm24[4],zmm22[6],zmm24[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm22[1],zmm24[1],zmm22[3],zmm24[3],zmm22[5],zmm24[5],zmm22[7],zmm24[7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm20[0],zmm8[2],zmm20[2],zmm8[4],zmm20[4],zmm8[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm8[1],zmm20[1],zmm8[3],zmm20[3],zmm8[5],zmm20[5],zmm8[7],zmm20[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} 
zmm5 {%k1} = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm0[0],zmm25[0],zmm0[2],zmm25[2],zmm0[4],zmm25[4],zmm0[6],zmm25[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm25[1],zmm0[3],zmm25[3],zmm0[5],zmm25[5],zmm0[7],zmm25[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm18[0],zmm10[0],zmm18[2],zmm10[2],zmm18[4],zmm10[4],zmm18[6],zmm10[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm18[1],zmm10[1],zmm18[3],zmm10[3],zmm18[5],zmm10[5],zmm18[7],zmm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q 
%zmm10, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm27[0],zmm13[0],zmm27[2],zmm13[2],zmm27[4],zmm13[4],zmm27[6],zmm13[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm13[1],zmm27[3],zmm13[3],zmm27[5],zmm13[5],zmm27[7],zmm13[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, 
%zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm23 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 ; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 
%zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm14[0],zmm2[0],zmm14[2],zmm2[2],zmm14[4],zmm2[4],zmm14[6],zmm2[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm14[1],zmm2[1],zmm14[3],zmm2[3],zmm14[5],zmm2[5],zmm14[7],zmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm16[0],zmm6[0],zmm16[2],zmm6[2],zmm16[4],zmm6[4],zmm16[6],zmm6[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm16[1],zmm6[1],zmm16[3],zmm6[3],zmm16[5],zmm6[5],zmm16[7],zmm6[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm22[0],zmm8[0],zmm22[2],zmm8[2],zmm22[4],zmm8[4],zmm22[6],zmm8[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm23 {%k1} = zmm22[1],zmm8[1],zmm22[3],zmm8[3],zmm22[5],zmm8[5],zmm22[7],zmm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: 
vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; 
AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm26 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, 
%ymm0, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm15[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 384(%rdi), %ymm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm24 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm1, %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm3, %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; 
AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 256(%rdx), %ymm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 
# 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 320(%rdx), %ymm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %xmm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 384(%rcx), %ymm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm29 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm29, %ymm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm29[0],ymm8[0],ymm29[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm9, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm29[1],ymm8[1],ymm29[3],ymm8[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm29 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm29, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 
704(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 
# 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 3776(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 3712(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3264(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 3200(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 2688(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 4032(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 3968(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 3904(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 3840(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 3648(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) +; 
AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 3136(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 3072(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 2560(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2496(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2432(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2368(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 2112(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm1, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride8_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512DQ-FAST-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 ; AVX512DQ-FAST-NEXT: movb $-64, %r11b ; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 
%zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm9, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm14 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm31[1],zmm26[1],zmm31[3],zmm26[3],zmm31[5],zmm26[5],zmm31[7],zmm26[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = 
zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm22[0],zmm24[0],zmm22[2],zmm24[2],zmm22[4],zmm24[4],zmm22[6],zmm24[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm22[1],zmm24[1],zmm22[3],zmm24[3],zmm22[5],zmm24[5],zmm22[7],zmm24[7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 
128(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, 
%zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm20[0],zmm8[2],zmm20[2],zmm8[4],zmm20[4],zmm8[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm8[1],zmm20[1],zmm8[3],zmm20[3],zmm8[5],zmm20[5],zmm8[7],zmm20[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; 
AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm0[0],zmm25[0],zmm0[2],zmm25[2],zmm0[4],zmm25[4],zmm0[6],zmm25[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm25[1],zmm0[3],zmm25[3],zmm0[5],zmm25[5],zmm0[7],zmm25[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), 
%zmm7 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm18[0],zmm10[0],zmm18[2],zmm10[2],zmm18[4],zmm10[4],zmm18[6],zmm10[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm18[1],zmm10[1],zmm18[3],zmm10[3],zmm18[5],zmm10[5],zmm18[7],zmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm27[0],zmm13[0],zmm27[2],zmm13[2],zmm27[4],zmm13[4],zmm27[6],zmm13[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = 
zmm27[1],zmm13[1],zmm27[3],zmm13[3],zmm27[5],zmm13[5],zmm27[7],zmm13[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, 
%zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm24 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded 
Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: 
vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm14[0],zmm2[0],zmm14[2],zmm2[2],zmm14[4],zmm2[4],zmm14[6],zmm2[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm14[1],zmm2[1],zmm14[3],zmm2[3],zmm14[5],zmm2[5],zmm14[7],zmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm16[0],zmm6[0],zmm16[2],zmm6[2],zmm16[4],zmm6[4],zmm16[6],zmm6[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm16[1],zmm6[1],zmm16[3],zmm6[3],zmm16[5],zmm6[5],zmm16[7],zmm6[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm22[0],zmm8[0],zmm22[2],zmm8[2],zmm22[4],zmm8[4],zmm22[6],zmm8[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm23 {%k1} = zmm22[1],zmm8[1],zmm22[3],zmm8[3],zmm22[5],zmm8[5],zmm22[7],zmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; 
AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm15[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512DQ-FAST-NEXT: vperm2i128 
{{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm24 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm1, %ymm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm3, %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %xmm6 +; AVX512DQ-FAST-NEXT: vinserti128 
$1, 256(%rcx), %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 256(%rdx), %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512DQ-FAST-NEXT: 
vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %xmm6 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 320(%rdx), %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %xmm8 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 384(%rcx), %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm29 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm29, %ymm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm29[0],ymm8[0],ymm29[2],ymm8[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm9, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm29[1],ymm8[1],ymm29[3],ymm8[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm29 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm29, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 
2176(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; 
AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 3776(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 3712(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3264(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 3200(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 2688(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 4032(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte 
Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 3968(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 3904(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 3840(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 3648(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 3136(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 3072(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2496(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2432(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2368(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 2112(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1792(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512BW-ONLY-SLOW-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 128(%r9), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] ; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm18 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm9, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = 
zmm31[1],zmm26[1],zmm31[3],zmm26[3],zmm31[5],zmm26[5],zmm31[7],zmm26[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm22[0],zmm24[0],zmm22[2],zmm24[2],zmm22[4],zmm24[4],zmm22[6],zmm24[6] +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm22[1],zmm24[1],zmm22[3],zmm24[3],zmm22[5],zmm24[5],zmm22[7],zmm24[7] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; 
AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm20[0],zmm8[2],zmm20[2],zmm8[4],zmm20[4],zmm8[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm8[1],zmm20[1],zmm8[3],zmm20[3],zmm8[5],zmm20[5],zmm8[7],zmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm6 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm0[0],zmm25[0],zmm0[2],zmm25[2],zmm0[4],zmm25[4],zmm0[6],zmm25[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} 
zmm3 {%k1} = zmm0[1],zmm25[1],zmm0[3],zmm25[3],zmm0[5],zmm25[5],zmm0[7],zmm25[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm18[0],zmm10[0],zmm18[2],zmm10[2],zmm18[4],zmm10[4],zmm18[6],zmm10[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm18[1],zmm10[1],zmm18[3],zmm10[3],zmm18[5],zmm10[5],zmm18[7],zmm10[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm27[0],zmm13[0],zmm27[2],zmm13[2],zmm27[4],zmm13[4],zmm27[6],zmm13[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm13[1],zmm27[3],zmm13[3],zmm27[5],zmm13[5],zmm27[7],zmm13[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm6 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm30, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm3, 
%zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 
{%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm14[0],zmm2[0],zmm14[2],zmm2[2],zmm14[4],zmm2[4],zmm14[6],zmm2[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm14[1],zmm2[1],zmm14[3],zmm2[3],zmm14[5],zmm2[5],zmm14[7],zmm2[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm16[0],zmm6[0],zmm16[2],zmm6[2],zmm16[4],zmm6[4],zmm16[6],zmm6[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm16[1],zmm6[1],zmm16[3],zmm6[3],zmm16[5],zmm6[5],zmm16[7],zmm6[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm22[0],zmm8[0],zmm22[2],zmm8[2],zmm22[4],zmm8[4],zmm22[6],zmm8[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm23 {%k1} = zmm22[1],zmm8[1],zmm22[3],zmm8[3],zmm22[5],zmm8[5],zmm22[7],zmm8[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded 
Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; 
AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm26 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm15[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm6[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm1, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 
{%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm3, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm6, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm8, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm6, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rdx), %ymm8, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm6, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rdx), %ymm8, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 384(%rcx), %ymm8, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm29, %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm29[0],ymm8[0],ymm29[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm9, 
%zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm29[1],ymm8[1],ymm29[3],ymm8[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm29, %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: 
vmovaps %zmm0, 3520(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3776(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3712(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3264(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 3200(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 4032(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 3968(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 3904(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 3840(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3648(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 3136(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3072(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2560(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2432(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2368(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 2112(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1792(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 384(%rax) +; 
AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512BW-ONLY-FAST-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] ; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm9, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm31[1],zmm26[1],zmm31[3],zmm26[3],zmm31[5],zmm26[5],zmm31[7],zmm26[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm10, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm22[0],zmm24[0],zmm22[2],zmm24[2],zmm22[4],zmm24[4],zmm22[6],zmm24[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm22[1],zmm24[1],zmm22[3],zmm24[3],zmm22[5],zmm24[5],zmm22[7],zmm24[7] ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 
-; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm20[0],zmm8[2],zmm20[2],zmm8[4],zmm20[4],zmm8[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm8[1],zmm20[1],zmm8[3],zmm20[3],zmm8[5],zmm20[5],zmm8[7],zmm20[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), 
%zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm0[0],zmm25[0],zmm0[2],zmm25[2],zmm0[4],zmm25[4],zmm0[6],zmm25[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm25[1],zmm0[3],zmm25[3],zmm0[5],zmm25[5],zmm0[7],zmm25[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: 
vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm1 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm18[0],zmm10[0],zmm18[2],zmm10[2],zmm18[4],zmm10[4],zmm18[6],zmm10[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm18[1],zmm10[1],zmm18[3],zmm10[3],zmm18[5],zmm10[5],zmm18[7],zmm10[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 
%zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm27[0],zmm13[0],zmm27[2],zmm13[2],zmm27[4],zmm13[4],zmm27[6],zmm13[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm13[1],zmm27[3],zmm13[3],zmm27[5],zmm13[5],zmm27[7],zmm13[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), 
%zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm24 +; 
AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, 
%zmm3, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm14[0],zmm2[0],zmm14[2],zmm2[2],zmm14[4],zmm2[4],zmm14[6],zmm2[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm14[1],zmm2[1],zmm14[3],zmm2[3],zmm14[5],zmm2[5],zmm14[7],zmm2[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, 
%zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm16[0],zmm6[0],zmm16[2],zmm6[2],zmm16[4],zmm6[4],zmm16[6],zmm6[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm16[1],zmm6[1],zmm16[3],zmm6[3],zmm16[5],zmm6[5],zmm16[7],zmm6[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm22[0],zmm8[0],zmm22[2],zmm8[2],zmm22[4],zmm8[4],zmm22[6],zmm8[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm23 {%k1} = zmm22[1],zmm8[1],zmm22[3],zmm8[3],zmm22[5],zmm8[5],zmm22[7],zmm8[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd 
$240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm26 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 
{%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm15[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 384(%rdi), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm24 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} +; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm1, %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm3, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 
%zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm6, %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm8, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), 
%xmm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm6, %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 256(%rdx), %ymm8, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm6, %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 320(%rdx), %ymm8, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 384(%rcx), %ymm8, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm29 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm29, %ymm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm29[0],ymm8[0],ymm29[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm9, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm29[1],ymm8[1],ymm29[3],ymm8[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm29 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm29, %ymm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2880(%rax) -; 
AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3776(%rax) +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3712(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3264(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 3200(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2688(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 4032(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 3968(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 3904(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 3840(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3648(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 3136(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3072(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2496(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2432(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: 
vmovaps %zmm4, 2368(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2112(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512DQBW-SLOW-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 ; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b ; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] ; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; 
AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm9 +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm9, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; 
AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm31[1],zmm26[1],zmm31[3],zmm26[3],zmm31[5],zmm26[5],zmm31[7],zmm26[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm22[0],zmm24[0],zmm22[2],zmm24[2],zmm22[4],zmm24[4],zmm22[6],zmm24[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm22[1],zmm24[1],zmm22[3],zmm24[3],zmm22[5],zmm24[5],zmm22[7],zmm24[7] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: 
vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = 
zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm20[0],zmm8[2],zmm20[2],zmm8[4],zmm20[4],zmm8[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm8[1],zmm20[1],zmm8[3],zmm20[3],zmm8[5],zmm20[5],zmm8[7],zmm20[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, 
%zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm0[0],zmm25[0],zmm0[2],zmm25[2],zmm0[4],zmm25[4],zmm0[6],zmm25[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm25[1],zmm0[3],zmm25[3],zmm0[5],zmm25[5],zmm0[7],zmm25[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 
-; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm18[0],zmm10[0],zmm18[2],zmm10[2],zmm18[4],zmm10[4],zmm18[6],zmm10[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm18[1],zmm10[1],zmm18[3],zmm10[3],zmm18[5],zmm10[5],zmm18[7],zmm10[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm27[0],zmm13[0],zmm27[2],zmm13[2],zmm27[4],zmm13[4],zmm27[6],zmm13[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm13[1],zmm27[3],zmm13[3],zmm27[5],zmm13[5],zmm27[7],zmm13[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, 
%zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermi2q 
%zmm1, %zmm12, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm23 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, 
%zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm14[0],zmm2[0],zmm14[2],zmm2[2],zmm14[4],zmm2[4],zmm14[6],zmm2[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm14[1],zmm2[1],zmm14[3],zmm2[3],zmm14[5],zmm2[5],zmm14[7],zmm2[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm16[0],zmm6[0],zmm16[2],zmm6[2],zmm16[4],zmm6[4],zmm16[6],zmm6[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm16[1],zmm6[1],zmm16[3],zmm6[3],zmm16[5],zmm6[5],zmm16[7],zmm6[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm22[0],zmm8[0],zmm22[2],zmm8[2],zmm22[4],zmm8[4],zmm22[6],zmm8[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm23 {%k1} = zmm22[1],zmm8[1],zmm22[3],zmm8[3],zmm22[5],zmm8[5],zmm22[7],zmm8[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), 
%ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm26 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; 
AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm17 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm15[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm20 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm8 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 ; 
AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm24 ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; 
AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm1, %ymm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm3, %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdx), 
%ymm12, %ymm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm6, %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm8, %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %xmm6 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm6, %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 256(%rdx), %ymm8, %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; 
AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm28 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %xmm6 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm6, %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 320(%rdx), %ymm8, %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 384(%rcx), %ymm8, %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm29 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm29, %ymm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm29[0],ymm8[0],ymm29[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm9, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm29[1],ymm8[1],ymm29[3],ymm8[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm29 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm29, %ymm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 3776(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 3712(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3264(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 3200(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 4032(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 3968(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 3904(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 3840(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 3648(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 3136(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 3072(%rax) +; 
AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 2560(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2432(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2368(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 2112(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 -; AVX512DQBW-FAST-NEXT: movb $-64, %r11b -; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm17, %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; 
AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} 
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: subq $5448, %rsp # imm = 0x1548 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512DQBW-FAST-NEXT: movb $-64, %r11b +; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm9, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm14 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm31[1],zmm26[1],zmm31[3],zmm26[3],zmm31[5],zmm26[5],zmm31[7],zmm26[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: 
vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm22[0],zmm24[0],zmm22[2],zmm24[2],zmm22[4],zmm24[4],zmm22[6],zmm24[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm22[1],zmm24[1],zmm22[3],zmm24[3],zmm22[5],zmm24[5],zmm22[7],zmm24[7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm18[0],zmm17[0],zmm18[2],zmm17[2],zmm18[4],zmm17[4],zmm18[6],zmm17[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm18[1],zmm17[1],zmm18[3],zmm17[3],zmm18[5],zmm17[5],zmm18[7],zmm17[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm20[0],zmm8[2],zmm20[2],zmm8[4],zmm20[4],zmm8[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 
192(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm8[1],zmm20[1],zmm8[3],zmm20[3],zmm8[5],zmm20[5],zmm8[7],zmm20[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm28[1],zmm13[3],zmm28[3],zmm13[5],zmm28[5],zmm13[7],zmm28[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} 
zmm5 {%k1} = zmm0[0],zmm25[0],zmm0[2],zmm25[2],zmm0[4],zmm25[4],zmm0[6],zmm25[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm25[1],zmm0[3],zmm25[3],zmm0[5],zmm25[5],zmm0[7],zmm25[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm1, %zmm13 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm18[0],zmm10[0],zmm18[2],zmm10[2],zmm18[4],zmm10[4],zmm18[6],zmm10[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm18[1],zmm10[1],zmm18[3],zmm10[3],zmm18[5],zmm10[5],zmm18[7],zmm10[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm27[0],zmm13[0],zmm27[2],zmm13[2],zmm27[4],zmm13[4],zmm27[6],zmm13[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm13[1],zmm27[3],zmm13[3],zmm27[5],zmm13[5],zmm27[7],zmm13[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 
%zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = 
zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm23 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512DQBW-FAST-NEXT: 
vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: 
vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm14[0],zmm2[0],zmm14[2],zmm2[2],zmm14[4],zmm2[4],zmm14[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm14[1],zmm2[1],zmm14[3],zmm2[3],zmm14[5],zmm2[5],zmm14[7],zmm2[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm16[0],zmm6[0],zmm16[2],zmm6[2],zmm16[4],zmm6[4],zmm16[6],zmm6[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm16[1],zmm6[1],zmm16[3],zmm6[3],zmm16[5],zmm6[5],zmm16[7],zmm6[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm22[0],zmm8[0],zmm22[2],zmm8[2],zmm22[4],zmm8[4],zmm22[6],zmm8[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm23 {%k1} = zmm22[1],zmm8[1],zmm22[3],zmm8[3],zmm22[5],zmm8[5],zmm22[7],zmm8[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; 
AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: 
vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm26 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %ymm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm15[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm8 ; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm23[0],ymm8[0],ymm23[2],ymm8[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm8[1],ymm23[3],ymm8[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 
384(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm24 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 
%zmm22, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm1, %ymm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm3, %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: 
vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm6, %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm8, %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %xmm6 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm6, %ymm6 +; AVX512DQBW-FAST-NEXT: 
vmovdqa 256(%rdi), %xmm8 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 256(%rdx), %ymm8, %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 
448(%rdi), %xmm25 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %xmm6 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm6, %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 320(%rdx), %ymm8, %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %xmm8 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 384(%rcx), %ymm8, %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm29 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm29, %ymm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm29[0],ymm8[0],ymm29[2],ymm8[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm9, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm29[1],ymm8[1],ymm29[3],ymm8[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm29 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm29, %ymm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps 
%zmm0, 2304(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 3776(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 3712(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3264(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 3200(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 2688(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; 
AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 4032(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 3968(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 3904(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 3840(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 3648(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 3136(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 3072(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2496(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2432(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2368(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 2112(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1792(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQBW-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index 035db822be5180..e5478386695cd9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -223,29 +223,29 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa (%rsi), %xmm3 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = 
xmm4[0,1,2,3,4,5,5,6] ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] @@ -261,14 +261,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; SSE-NEXT: pand %xmm2, %xmm0 @@ -279,7 +279,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm7, (%rcx) -; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm4, 16(%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i8_stride3_vf16: @@ -342,70 +342,70 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm10 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa 16(%rdx), %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: pandn 
%xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,6] ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,4,5,5,6] ; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 ; SSE-NEXT: pand %xmm5, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm10, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm0, %xmm12 ; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; SSE-NEXT: pand %xmm10, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm9, %xmm11 
+; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pandn %xmm13, %xmm11 ; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,2,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,6,6] @@ -414,35 +414,35 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm0, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,6,6] ; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm3[8],xmm8[9],xmm3[9],xmm8[10],xmm3[10],xmm8[11],xmm3[11],xmm8[12],xmm3[12],xmm8[13],xmm3[13],xmm8[14],xmm3[14],xmm8[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: pandn %xmm10, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,2,4,5,6,7] +; SSE-NEXT: por %xmm9, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,6] ; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] @@ -451,11 +451,11 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm1, %xmm0 ; 
SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm9, 32(%rcx) +; SSE-NEXT: movdqa %xmm10, 32(%rcx) ; SSE-NEXT: movdqa %xmm7, 48(%rcx) ; SSE-NEXT: movdqa %xmm11, 80(%rcx) ; SSE-NEXT: movdqa %xmm6, 16(%rcx) -; SSE-NEXT: movdqa %xmm3, 64(%rcx) +; SSE-NEXT: movdqa %xmm4, 64(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride3_vf32: @@ -590,19 +590,19 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm12 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa 32(%rsi), %xmm14 -; SSE-NEXT: movdqa 48(%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rdx), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rsi), %xmm7 +; SSE-NEXT: movdqa 32(%rsi), %xmm14 +; SSE-NEXT: movdqa 48(%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm12 +; SSE-NEXT: movdqa 48(%rdx), %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] ; SSE-NEXT: movdqa %xmm1, %xmm3 @@ -616,13 +616,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 @@ -634,37 +634,37 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm7[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: por %xmm3, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa (%rsi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3] +; SSE-NEXT: movdqa (%rsi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] @@ -685,12 +685,12 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] @@ -703,12 +703,12 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,5,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,5,5,6,6] ; SSE-NEXT: movdqa %xmm1, %xmm6 @@ -721,13 +721,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] ; SSE-NEXT: movdqa %xmm15, %xmm4 @@ -739,57 +739,57 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,5,5,6,6] ; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm9, %xmm5 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] ; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 ; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,1,2] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm3 ; 
SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,6,6] ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm9, %xmm7 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm9 @@ -807,8 +807,8 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] ; SSE-NEXT: pandn %xmm9, %xmm15 ; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,1,2] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] @@ -831,7 +831,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm6, 144(%rcx) ; SSE-NEXT: movdqa %xmm8, 176(%rcx) ; SSE-NEXT: movdqa %xmm10, 16(%rcx) -; SSE-NEXT: movdqa %xmm13, 64(%rcx) +; SSE-NEXT: movdqa %xmm11, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -840,119 +840,118 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $24, %rsp -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdx), 
%xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 
-; AVX1-ONLY-NEXT: vpor %xmm2, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = 
xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rcx) -; 
AVX1-ONLY-NEXT: vmovdqa %xmm5, 160(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 176(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 160(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 176(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm13, 96(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 128(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 144(%rcx) -; AVX1-ONLY-NEXT: addq $24, %rsp +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 144(%rcx) +; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride3_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll index 9e7d970a6abcde..7fb3ae5545705e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll @@ -257,44 +257,44 @@ define void @store_i8_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa (%rdx), %xmm7 -; SSE-NEXT: movdqa 16(%rdx), %xmm4 -; SSE-NEXT: movdqa (%rcx), %xmm8 -; SSE-NEXT: movdqa 16(%rcx), %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm5 +; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa 16(%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: movdqa %xmm1, 96(%r8) -; SSE-NEXT: movdqa %xmm6, 112(%r8) -; SSE-NEXT: movdqa %xmm8, 64(%r8) -; SSE-NEXT: movdqa %xmm10, 80(%r8) +; SSE-NEXT: movdqa %xmm3, 112(%r8) +; SSE-NEXT: movdqa %xmm6, 64(%r8) +; SSE-NEXT: movdqa %xmm8, 80(%r8) ; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm5, 48(%r8) -; SSE-NEXT: movdqa %xmm2, (%r8) -; SSE-NEXT: movdqa %xmm3, 16(%r8) +; SSE-NEXT: movdqa %xmm2, 48(%r8) +; SSE-NEXT: movdqa %xmm9, (%r8) +; SSE-NEXT: movdqa %xmm10, 16(%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride4_vf32: @@ -423,13 +423,13 @@ define void @store_i8_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride4_vf64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm0 +; SSE-NEXT: movdqa 32(%rsi), %xmm2 ; SSE-NEXT: movdqa (%rdx), %xmm7 ; SSE-NEXT: movdqa 16(%rdx), %xmm13 ; SSE-NEXT: movdqa 32(%rdx), %xmm10 @@ -438,72 +438,72 @@ define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa 32(%rcx), %xmm12 ; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] ; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = 
xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] ; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; SSE-NEXT: movdqa 48(%rdx), %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; SSE-NEXT: movdqa 48(%rdx), %xmm13 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] ; SSE-NEXT: movdqa 48(%rcx), %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckhwd 
{{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: movdqa %xmm13, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE-NEXT: movdqa 48(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: movdqa %xmm2, 224(%r8) -; SSE-NEXT: movdqa %xmm1, 240(%r8) -; SSE-NEXT: movdqa %xmm3, 192(%r8) +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: movdqa %xmm1, 224(%r8) +; SSE-NEXT: movdqa %xmm2, 240(%r8) +; SSE-NEXT: movdqa %xmm5, 192(%r8) ; SSE-NEXT: movdqa %xmm0, 208(%r8) -; SSE-NEXT: movdqa %xmm4, 160(%r8) -; SSE-NEXT: movdqa %xmm9, 176(%r8) -; SSE-NEXT: movdqa %xmm13, 128(%r8) -; SSE-NEXT: movdqa %xmm14, 144(%r8) +; SSE-NEXT: movdqa %xmm9, 160(%r8) +; SSE-NEXT: movdqa %xmm3, 176(%r8) +; SSE-NEXT: movdqa %xmm14, 128(%r8) +; SSE-NEXT: movdqa %xmm15, 144(%r8) ; SSE-NEXT: movdqa %xmm11, 96(%r8) ; 
SSE-NEXT: movdqa %xmm8, 112(%r8) ; SSE-NEXT: movdqa %xmm7, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movdqa %xmm4, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movdqa %xmm6, (%r8) @@ -516,71 +516,71 @@ define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 
32(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = 
xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpckhwd 
{{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 1f21eee473194b..240874d1fca4d4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -211,58 +211,58 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,0,65535,65535,0] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,65535,65535,0] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por 
%xmm6, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,1,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm8, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm10, %xmm5 ; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm4[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] -; SSE-NEXT: pandn %xmm10, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,1,3] -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm9, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,0,0] ; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,1,3] +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; 
SSE-NEXT: pand %xmm6, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,0,0] ; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm7, %xmm4 @@ -560,56 +560,55 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa (%rdx), %xmm9 +; SSE-NEXT: movdqa (%rcx), %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,2,2] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,1,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: por %xmm8, %xmm11 +; SSE-NEXT: 
movdqa %xmm6, %xmm10 ; SSE-NEXT: pandn %xmm11, %xmm10 ; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm10 ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,2,1] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,4,7] ; SSE-NEXT: movdqa %xmm12, %xmm14 @@ -619,98 +618,101 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm13, %xmm11 ; SSE-NEXT: pandn %xmm14, %xmm11 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: pandn %xmm15, %xmm11 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm8, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm9[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] ; SSE-NEXT: movdqa %xmm12, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pandn %xmm4, %xmm12 ; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,3] -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,3] +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: por %xmm7, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm13, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,6,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: pshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm1, %xmm0 +; 
SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, 64(%r9) -; SSE-NEXT: movdqa %xmm6, (%r9) +; SSE-NEXT: movdqa %xmm4, (%r9) ; SSE-NEXT: movdqa %xmm15, 16(%r9) ; SSE-NEXT: movdqa %xmm11, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -772,11 +774,11 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero @@ -787,7 +789,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 @@ -796,16 +798,16 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] ; AVX2-SLOW-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) @@ -859,11 +861,11 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero @@ -874,7 +876,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = 
<255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 @@ -883,16 +885,16 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) @@ -903,12 +905,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-LABEL: store_i8_stride5_vf16: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-SLOW-NEXT: 
vmovdqa (%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,7],zero,ymm6[u,u,u,8],zero,ymm6[u,u,u,9],zero,ymm6[u,u,u],zero,ymm6[26,u,u,u],zero,ymm6[27,u,u,u],zero,ymm6[28,u,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[7,u,u,u],zero,ymm8[8,u,u,u],zero,ymm8[9,u,u,u,26],zero,ymm8[u,u,u,27],zero,ymm8[u,u,u,28],zero,ymm8[u,u] @@ -926,15 +928,15 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm5, %zmm5 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm6, %zmm6 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[10,11,u],zero,zero,xmm2[12,13,u],zero,zero,xmm2[14,15,u] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u] -; AVX512F-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12],zero,zero,zero,zero,xmm1[13],zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,xmm1[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] ; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) @@ -943,13 +945,13 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm4 +; 
AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] @@ -963,18 +965,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 ; AVX512F-FAST-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u] -; AVX512F-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[10,11,u],zero,zero,xmm2[12,13,u],zero,zero,xmm2[14,15,u] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u] +; AVX512F-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-FAST-NEXT: vzeroupper @@ -1076,151 +1078,148 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: 
movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm7 +; SSE-NEXT: subq $136, %rsp +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 +; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rcx), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm3 ; SSE-NEXT: movdqa 16(%r8), %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,0] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa (%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,7] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte 
Reload +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm12, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm7, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 @@ -1230,9 +1229,9 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] @@ -1243,170 +1242,173 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm12, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,1,2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm14 ; SSE-NEXT: por %xmm7, %xmm14 -; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: pand %xmm10, %xmm14 ; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: por %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm14 ; SSE-NEXT: pandn %xmm4, %xmm14 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = 
xmm0[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm14, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,2] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm14 ; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm11, %xmm14 -; SSE-NEXT: por %xmm14, 
%xmm15 -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw $164, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: pand %xmm8, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm14 ; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,1,2,3,7,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,2] -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,0,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm14, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,0,0] +; SSE-NEXT: pand %xmm5, %xmm14 +; SSE-NEXT: por %xmm14, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: por %xmm15, %xmm3 
+; SSE-NEXT: pshufhw $237, (%rsp), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,3,2] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: pandn %xmm5, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[3,3,3,3] +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,2] +; SSE-NEXT: pandn %xmm8, %xmm11 ; SSE-NEXT: por %xmm5, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] ; SSE-NEXT: pandn %xmm5, %xmm13 ; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm8 -; SSE-NEXT: por %xmm13, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: pand %xmm12, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%r9) -; SSE-NEXT: movdqa %xmm11, 64(%r9) -; SSE-NEXT: movdqa %xmm0, 80(%r9) -; SSE-NEXT: movdqa %xmm15, 144(%r9) -; SSE-NEXT: movdqa %xmm7, 16(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm12, (%r9) +; SSE-NEXT: movdqa %xmm13, 64(%r9) +; SSE-NEXT: movdqa %xmm3, 80(%r9) +; SSE-NEXT: movdqa %xmm0, 144(%r9) +; SSE-NEXT: movdqa %xmm4, 16(%r9) +; 
SSE-NEXT: movdqa %xmm7, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1415,153 +1417,152 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) -; SSE-NEXT: addq $152, %rsp +; SSE-NEXT: addq $136, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4],zero,xmm3[6,7,8,9],zero,xmm3[11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; 
AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm15[9],zero,zero,zero,zero,xmm15[10],zero,zero,zero,zero,xmm15[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm1[9],zero,zero,zero,zero,xmm1[10],zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm15 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm14 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm13, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm14 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm14 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3],zero,xmm14[5,6,7,8],zero,xmm14[10,11,12,13],zero,xmm14[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm15[0],zero,zero,zero,zero,xmm15[1],zero,zero,zero,zero,xmm15[2],zero -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[1,2,3,4],zero,xmm5[6,7,8,9],zero,xmm5[11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,xmm1[2],zero +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[6],zero,xmm5[u,u,u,7],zero,xmm5[u,u,u,8],zero,xmm5[u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] ; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u],zero,xmm10[7,u,u,u],zero,xmm10[8,u,u,u],zero,xmm10[9,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX1-ONLY-NEXT: # xmm6 = 
mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,7],zero,xmm13[u,u,u,8],zero,xmm13[u,u,u,9],zero,xmm13[u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, 
%xmm1, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[0,1,2,3],zero,xmm9[5,6,7,8],zero,xmm9[10,11,12,13],zero,xmm9[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,xmm1[5,6,7,8],zero,xmm1[10,11,12,13],zero,xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm7[6,u,u,u],zero,xmm7[7,u,u,u],zero,xmm7[8,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm15[6,u,u,u],zero,xmm15[7,u,u,u],zero,xmm15[8,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1854,11 +1855,11 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512F-SLOW-LABEL: store_i8_stride5_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 @@ -1891,12 +1892,12 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u] ; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u,u],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u,u,19] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u,u,19] ; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm5, %ymm11, %ymm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: 
vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] ; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm10, %ymm5, %ymm9 @@ -1908,8 +1909,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[12],zero,zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,2,1,1,4,6,5,5] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX512F-SLOW-NEXT: vpandn %ymm9, %ymm11, %ymm9 @@ -1922,14 +1923,14 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm10, %ymm4, %ymm3 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[26],zero,ymm0[28],zero,zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero,zero,ymm0[30],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper @@ -2112,12 +2113,12 @@ define void 
@store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512BW-FAST-LABEL: store_i8_stride5_vf32: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] ; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero ; AVX512BW-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 @@ -2130,38 +2131,38 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero ; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,0,1,1,4,4,5,5] ; AVX512BW-FAST-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm2[21],zero,ymm2[21,20],zero,ymm2[22],zero,ymm2[24],zero,ymm2[22,23],zero,ymm2[25] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm2[12,13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16,17],zero,zero,zero,zero,ymm2[18],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm3[13],zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,ymm3[18],zero,zero ; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,3,3,u,4,4,4,4> -; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm8 ; AVX512BW-FAST-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k1} = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k1} = ymm1[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 ; AVX512BW-FAST-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 @@ -2171,27 +2172,27 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] -; 
AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,zero,zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512BW-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[26],zero,ymm0[28],zero,zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero,zero,ymm0[30],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27],zero,zero,ymm3[26],zero,ymm3[28],zero,ymm3[30],zero,zero,ymm3[29],zero,ymm3[31],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512BW-FAST-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] -; AVX512BW-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,6,6,6,7,7,7,7] +; AVX512BW-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: movl $-2078209982, %eax # imm = 0x84210842 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 64(%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -2213,34 +2214,33 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $504, %rsp # imm = 0x1F8 -; SSE-NEXT: movdqa (%rdi), %xmm7 -; SSE-NEXT: movdqa (%rsi), %xmm9 -; SSE-NEXT: movdqa 16(%rsi), %xmm14 -; SSE-NEXT: movdqa (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm8 +; SSE-NEXT: movdqa 16(%rsi), %xmm13 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa 16(%rdx), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm10 -; SSE-NEXT: movdqa 16(%rcx), %xmm6 -; 
SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%rcx), %xmm9 +; SSE-NEXT: movdqa 16(%rcx), %xmm12 +; SSE-NEXT: movdqa (%r8), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] @@ -2248,27 +2248,28 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] @@ -2280,12 +2281,12 @@ define void 
@store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa 16(%r8), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 @@ -2298,16 +2299,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 32(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 32(%rsi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,2,1] +; SSE-NEXT: movdqa 32(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] @@ -2319,9 +2321,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa 32(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2337,13 +2339,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa 48(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2359,520 +2361,522 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa 48(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa 
%xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; 
SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} 
xmm6 = xmm7[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[2,2,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; 
SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,2,2,2] +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm7, %xmm1 -; 
SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,3,2] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm5, %xmm13 -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm13 -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,2] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,3,3,3] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = 
mem[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,2] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm10 ; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: por %xmm10, %xmm13 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] ; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[1,0,2,3,4,5,6,7] +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm14[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pand %xmm8, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,0,2,3,4,5,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] +; SSE-NEXT: pandn %xmm10, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,1,3] +; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm14, %xmm11 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 ; SSE-NEXT: pand %xmm2, %xmm11 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,1] -; SSE-NEXT: pandn %xmm11, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,1,3] -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pand %xmm8, %xmm12 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; 
SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] ; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm9, 304(%r9) -; SSE-NEXT: movdqa %xmm0, 240(%r9) -; SSE-NEXT: movdqa %xmm6, 224(%r9) -; SSE-NEXT: movdqa %xmm14, 160(%r9) -; SSE-NEXT: movdqa %xmm7, 144(%r9) -; SSE-NEXT: movdqa %xmm10, 80(%r9) -; SSE-NEXT: movdqa %xmm13, 64(%r9) +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm8, 304(%r9) +; SSE-NEXT: movdqa %xmm10, 240(%r9) +; SSE-NEXT: movdqa %xmm5, 224(%r9) +; SSE-NEXT: movdqa %xmm13, 160(%r9) +; SSE-NEXT: movdqa %xmm6, 144(%r9) +; SSE-NEXT: movdqa %xmm9, 80(%r9) +; SSE-NEXT: movdqa %xmm12, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2906,295 +2910,297 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $104, %rsp ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = 
[255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm9 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1],zero,xmm6[3,4,5,6],zero,xmm6[8,9,10,11],zero,xmm6[13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm7[6],zero,zero,zero,zero,xmm7[7],zero,zero,zero,zero,xmm7[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm9[6],zero,zero,zero,zero,xmm9[7],zero,zero,zero,zero,xmm9[8],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7],zero,xmm4[9,10,11,12],zero,xmm4[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm7[3],zero,zero,zero,zero,xmm7[4],zero,zero,zero,zero,xmm7[5],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3],zero,zero,zero,zero,xmm9[4],zero,zero,zero,zero,xmm9[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = 
[7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[12],zero,zero,zero,zero,xmm7[13],zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,xmm7[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[12],zero,zero,zero,zero,xmm9[13],zero,zero,zero,zero,xmm9[14],zero,zero,zero,zero,xmm9[15] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm6[6,u,u,u],zero,xmm6[7,u,u,u],zero,xmm6[8,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[0,1],zero,xmm14[3,4,5,6],zero,xmm14[8,9,10,11],zero,xmm14[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm6[6],zero,zero,zero,zero,xmm6[7],zero,zero,zero,zero,xmm6[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm14[0,1],zero,xmm14[3,4,5,6],zero,xmm14[8,9,10,11],zero,xmm14[13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm4[6],zero,zero,zero,zero,xmm4[7],zero,zero,zero,zero,xmm4[8],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3],zero,zero,zero,zero,xmm6[4],zero,zero,zero,zero,xmm6[5],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm4[3],zero,zero,zero,zero,xmm4[4],zero,zero,zero,zero,xmm4[5],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 
%xmm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm12 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0],zero,xmm4[2,3,4,5],zero,xmm4[7,8,9,10],zero,xmm4[12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5],zero,xmm3[7,8,9,10],zero,xmm3[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 -; 
AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm12 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3],zero,xmm13[5,6,7,8],zero,xmm13[10,11,12,13],zero,xmm13[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[12],zero,zero,zero,zero,xmm13[13],zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,xmm13[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm11 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-ONLY-NEXT: 
vpshufb %xmm12, %xmm10, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm13, %ymm11 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,xmm11[1,2,3,4],zero,xmm11[6,7,8,9],zero,xmm11[11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[12],zero,zero,zero,zero,xmm11[13],zero,zero,zero,zero,xmm11[14],zero,zero,zero,zero,xmm11[15] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero,xmm8[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9],zero,xmm6[u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero,xmm9[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = 
[255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero,xmm13[2],zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, 
%xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3],zero,xmm2[5,6,7,8],zero,xmm2[10,11,12,13],zero,xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm11[0],zero,zero,zero,zero,xmm11[1],zero,zero,zero,zero,xmm11[2],zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u],zero,xmm7[7,u,u,u],zero,xmm7[8,u,u,u],zero,xmm7[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9],zero,xmm4[u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6],zero,xmm7[u,u,u,7],zero,xmm7[u,u,u,8],zero,xmm7[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm8[6,u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6],zero,xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm13[9],zero,zero,zero,zero,xmm13[10],zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm11[9],zero,zero,zero,zero,xmm11[10],zero,zero,zero,zero,xmm11[11],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm6 
-; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%r9) +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3229,18 +3235,18 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-SLOW-LABEL: store_i8_stride5_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $312, %rsp # imm = 0x138 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-SLOW-NEXT: subq $216, %rsp +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 @@ -3249,217 +3255,210 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm8 -; AVX2-SLOW-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX2-SLOW-NEXT: # ymm5 = 
mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm1, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm10 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm8 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[0,2,1,1,4,6,5,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm9, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,2,1,1,4,6,5,5] -; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm4 = ymm4[2,3,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm9, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,1,1,4,6,5,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,2,1,1,4,6,5,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm3, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,u,4,4,4,4> +; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm8, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm11 -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vpor %ymm8, %ymm13, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
<3,3,3,3,u,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd $80, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm7 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = 
[255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm7 +; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm14[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 256(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%r9) +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 256(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 288(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: addq $312, %rsp # imm = 0x138 +; AVX2-SLOW-NEXT: addq $216, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $168, %rsp -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 @@ -3500,135 +3499,138 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm12 -; 
AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm14, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufb 
%ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm1, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,2,3,3] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm9, %ymm15, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm15, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpor %ymm10, %ymm15, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpor %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,5,5,5,5,4,6] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm10, 
%ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm9 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm3 ; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = 
<255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] @@ -3641,23 +3643,23 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 256(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 256(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FAST-NEXT: addq $168, %rsp @@ -3666,10 +3668,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: subq $232, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> @@ -3695,7 +3697,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] @@ -3711,7 +3713,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 @@ -3722,589 +3724,596 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 
= <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm5, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,2,3,3,6,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,2,3,3,6,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm13, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; 
AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm15, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm12, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm5, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm14, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm8, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm15, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = 
ymm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm15, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd $80, (%rsp), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 224(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm30 -; 
AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm10 -; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm10, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm5 +; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm26 -; AVX512F-SLOW-NEXT: vporq %xmm4, %xmm12, %xmm31 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm22 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm23 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[27],zero,zero,ymm11[26],zero,ymm11[28],zero,ymm11[30],zero,zero,ymm11[29],zero,ymm11[31],zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm9[19],zero,ymm9[21],zero,zero,ymm9[20],zero,ymm9[22],zero,ymm9[24],zero,zero,ymm9[23],zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vporq %xmm2, %xmm3, %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 -; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm15, %xmm28 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm10 +; AVX512F-SLOW-NEXT: vporq %xmm2, %xmm10, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm19 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[26],zero,ymm0[28],zero,zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero,zero,ymm0[30],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm21 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm3 +; AVX512F-SLOW-NEXT: vporq %ymm1, %ymm3, %ymm23 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm10 +; AVX512F-SLOW-NEXT: vporq %ymm3, %ymm10, %ymm24 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX512F-SLOW-NEXT: vporq %xmm3, %xmm11, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-SLOW-NEXT: vporq %xmm11, %xmm6, %xmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = mem[1,1,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,1] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm0, %ymm25, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm14, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[19],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm12 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero -; AVX512F-SLOW-NEXT: vpunpcklbw 
{{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpandnq %ymm11, %ymm25, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm9, %ymm11, %ymm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm28 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[21],zero,zero,ymm7[20],zero,ymm7[22],zero,ymm7[24],zero,zero,ymm7[23],zero,ymm7[25],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,ymm7[27],zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,u,5,5,5,5,u,6,6,6,6,u,7,7,7,7> -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <6,6,6,u,7,7,7,7,u,16,16,16,16,u,17,17> -; AVX512F-SLOW-NEXT: vpermi2d %zmm15, %zmm29, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm9[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm11[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <4,u,5,5,5,5,u,6,6,6,6,u,7,7,7,7> +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm30, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <6,6,6,u,7,7,7,7,u,16,16,16,16,u,17,17> +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm29, %zmm31 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm13[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: 
vinserti32x4 $2, %xmm28, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm26, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm22[0,0,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm31[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm15, %zmm17, %zmm16 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm14, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm20, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm14, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm22[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm14, %ymm12, %ymm6 -; AVX512F-SLOW-NEXT: vpandq %ymm14, %ymm30, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm24[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vporq %zmm11, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm11, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm11, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm26 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm25 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm8[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = 
zmm9[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm12, %zmm16, %zmm15 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm13, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm23, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm12, %ymm14, %ymm11 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm24, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm16, %zmm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm19[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm20[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm12, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vpand %ymm6, %ymm12, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm21[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vporq %zmm4, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm25 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm30 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm28 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm10[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm5[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = ; AVX512F-SLOW-NEXT: vpermd %zmm29, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 192(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 128(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 192(%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; 
AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm4 -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm4 -; AVX512F-FAST-NEXT: vporq %xmm3, %xmm4, %xmm19 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm23 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[27],zero,zero,ymm8[26],zero,ymm8[28],zero,ymm8[30],zero,zero,ymm8[29],zero,ymm8[31],zero,zero +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: 
vmovdqa64 %ymm2, %ymm16 +; AVX512F-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm4 +; AVX512F-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm12 +; AVX512F-FAST-NEXT: vporq %xmm14, %xmm12, %xmm22 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm19 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[26],zero,ymm0[28],zero,zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero,zero,ymm0[30],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm15[27],zero,zero,ymm15[26],zero,ymm15[28],zero,ymm15[30],zero,zero,ymm15[29],zero,ymm15[31],zero,zero ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vporq %ymm0, 
%ymm1, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX512F-FAST-NEXT: vporq %xmm0, %xmm2, %xmm28 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX512F-FAST-NEXT: vporq %xmm0, %xmm14, %xmm29 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [1,1,2,2,2,2,2,2] -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX512F-FAST-NEXT: vporq %ymm7, %ymm2, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm13 +; AVX512F-FAST-NEXT: vporq %ymm2, %ymm13, %ymm24 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm6 +; AVX512F-FAST-NEXT: vporq %xmm11, %xmm6, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512F-FAST-NEXT: vporq %xmm9, %xmm1, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,2,2,2,2,2,2] +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm9 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm14, %ymm25, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <4,u,5,5,5,5,u,6,30,30,30,u,31,31,31,31> -; AVX512F-FAST-NEXT: vpermi2d %zmm25, %zmm0, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <4,u,5,5,5,5,u,6> -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm26, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm26 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm13 -; AVX512F-FAST-NEXT: 
vpshufb %ymm13, %ymm12, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm9[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-FAST-NEXT: vpandnq %ymm9, %ymm25, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <4,u,5,5,5,5,u,6,30,30,30,u,31,31,31,31> +; AVX512F-FAST-NEXT: vpermi2d %zmm26, %zmm1, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <4,u,5,5,5,5,u,6> +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm10, %ymm27, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm27 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm4 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero,zero +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm3[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero,ymm12[25],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[26],zero,ymm12[28],zero,zero,ymm12[27],zero,ymm12[29],zero,ymm12[31],zero,zero,ymm12[30],zero +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm15[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm28, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm29, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm0 -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm7 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded 
Reload +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm28, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm29, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm26, %zmm1 +; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm22[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm13 -; AVX512F-FAST-NEXT: vpor %ymm15, %ymm12, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm20, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm12, %ymm11, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm21, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm22[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm12, %ymm1, %ymm6 -; AVX512F-FAST-NEXT: vpandq %ymm12, %ymm30, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm24[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vporq %zmm7, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm7, 
%zmm4 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm27 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm31 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm26 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm16, %zmm15 +; AVX512F-FAST-NEXT: vpor %ymm2, %ymm10, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm23, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm10, %ymm8, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm16, %zmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm19[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm20[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm10, %ymm4, %ymm14 +; AVX512F-FAST-NEXT: vpandq %ymm10, %ymm30, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm21[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vporq %zmm4, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm8, %zmm4, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm25 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm13[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm6[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 256(%r9) 
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 192(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 192(%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride5_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] ; AVX512BW-ONLY-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm4, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] ; AVX512BW-ONLY-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm2 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; 
AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm15, %zmm2, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm3 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm23, %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm17 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm24, %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
{{.*#+}} xmm21 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm18, %xmm14, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm13, %xmm13 ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,1,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 @@ -4320,27 +4329,27 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm13, %zmm6 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <3,3,3,3,u,4,4,4> -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm16, %ymm13, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = mem[1,1,2,2] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm15, %ymm13, %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = mem[1,1,2,2] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,1,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm21, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; AVX512BW-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm21[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm21, %ymm25, %ymm28 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 ; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm15 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm27, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; 
AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] @@ -4355,101 +4364,101 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermd %zmm16, %zmm15, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermd %zmm15, %zmm16, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm15, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm16, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm22, %xmm19 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm16, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm15, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm18, %xmm16, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm15, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm19, %xmm18 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm15, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm11, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,1,1,4,4,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5] ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512BW-ONLY-SLOW-NEXT: vpermd %zmm3, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermd %zmm2, %zmm7, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm9 {%k1} -; 
AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm5, %ymm12, %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm4, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm5, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm4, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm21, %ymm5, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm0, %ymm13, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm0, %ymm13, %ymm2 ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride5_vf64: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm19 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm19, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %ymm17 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm17, %ymm3 ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm16, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm16, %xmm3 ; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm18, %xmm4 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm18, %xmm4 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm17 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm19 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm17, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm19, %xmm4 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm3[0,0,1,1] ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm21 @@ -4477,23 +4486,23 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[21],zero,zero,zmm21[20],zero,zmm21[22],zero,zmm21[24],zero,zero,zmm21[23],zero,zmm21[25],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm21[58],zero,zmm21[60],zero,zero,zmm21[59],zero,zmm21[61],zero,zmm21[63],zero,zero,zmm21[62],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm21 = zmm21[2,2,3,3,6,6,7,7] ; AVX512BW-FAST-NEXT: vporq %zmm22, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm19 = zmm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,zmm19[21],zero,zmm19[21,20],zero,zmm19[22],zero,zmm19[24],zero,zmm19[22,23],zero,zmm19[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm19[59],zero,zero,zmm19[58],zero,zmm19[60],zero,zmm19[62],zero,zero,zmm19[61],zero,zmm19[63],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm19 = zmm19[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm13 = 
zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm13[19],zero,zmm13[21],zero,zero,zmm13[20],zero,zmm13[22],zero,zmm13[24],zero,zero,zmm13[23],zero,zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm13[59],zero,zero,zmm13[58],zero,zmm13[60],zero,zmm13[62],zero,zero,zmm13[61],zero,zmm13[63],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm13[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vporq %zmm19, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,zmm10[21],zero,zmm10[21,20],zero,zmm10[22],zero,zmm10[24],zero,zmm10[22,23],zero,zmm10[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[59],zero,zero,zmm10[58],zero,zmm10[60],zero,zmm10[62],zero,zero,zmm10[61],zero,zmm10[63],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm17 = zmm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm17[19],zero,zmm17[21],zero,zero,zmm17[20],zero,zmm17[22],zero,zmm17[24],zero,zero,zmm17[23],zero,zmm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm17[59],zero,zero,zmm17[58],zero,zmm17[60],zero,zmm17[62],zero,zero,zmm17[61],zero,zmm17[63],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm17 = zmm17[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vporq %zmm10, %zmm17, %zmm10 ; AVX512BW-FAST-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm13 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm20, %zmm19 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm10 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm20, %zmm17 ; AVX512BW-FAST-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512BW-FAST-NEXT: kmovq %rax, %k4 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm19, %zmm13 {%k4} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm19 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm19[27],zero,zero,ymm19[26],zero,ymm19[28],zero,ymm19[30],zero,zero,ymm19[29],zero,ymm19[31],zero +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm17, %zmm10 {%k4} +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm17[27],zero,zero,ymm17[26],zero,ymm17[28],zero,ymm17[30],zero,zero,ymm17[29],zero,ymm17[31],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,3,3] ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm21 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm21[27],zero,zero,ymm21[26],zero,ymm21[28],zero,ymm21[30],zero,zero,ymm21[29],zero,ymm21[31],zero,zero @@ -4503,37 +4512,37 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> ; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm16, %xmm16 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,1,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = 
ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm16[26],zero,ymm16[28],zero,zero,zero,zero,ymm16[29],zero,ymm16[31],zero,zero,ymm16[30] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %ymm20 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm20[26],zero,ymm20[28],zero,zero,zero,zero,ymm20[29],zero,ymm20[31],zero,zero,ymm20[30] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm24 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm23[26],zero,ymm23[28],zero,zero,ymm23[27],zero,ymm23[29],zero,ymm23[31],zero,zero,ymm23[30],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,3,3] ; AVX512BW-FAST-NEXT: vporq %ymm22, %ymm24, %ymm22 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3],xmm15[4],xmm17[4],xmm15[5],xmm17[5],xmm15[6],xmm17[6],xmm15[7],xmm17[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm17 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm15, %xmm15 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm19[0],xmm15[1],xmm19[1],xmm15[2],xmm19[2],xmm15[3],xmm19[3],xmm15[4],xmm19[4],xmm15[5],xmm19[5],xmm15[6],xmm19[6],xmm15[7],xmm19[7] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm22, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm20, %zmm15 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm20, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm15 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm16, %zmm5 ; AVX512BW-FAST-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm15 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm5 -; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm8 -; AVX512BW-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm5 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm9 +; AVX512BW-FAST-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm6, %xmm6 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm6 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm7 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm6 +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm7 ; AVX512BW-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm7, %xmm7 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; 
AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm7, %xmm7 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm7, %zmm6 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] ; AVX512BW-FAST-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C @@ -4546,21 +4555,21 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm7, %zmm6 {%k3} ; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm21, %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm19, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm17, %ymm2 ; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm21[21],zero,ymm21[21,20],zero,ymm21[22],zero,ymm21[24],zero,ymm21[22,23],zero,ymm21[25] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[19],zero,ymm19[21],zero,zero,ymm19[20],zero,ymm19[22],zero,ymm19[24],zero,zero,ymm19[23],zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm17[19],zero,ymm17[21],zero,zero,ymm17[20],zero,ymm17[22],zero,ymm17[24],zero,zero,ymm17[23],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm16[21],zero,zero,ymm16[20],zero,ymm16[22],zero,ymm16[24],zero,zero,ymm16[23],zero,ymm16[25],zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm20[21],zero,zero,ymm20[20],zero,ymm20[22],zero,ymm20[24],zero,zero,ymm20[23],zero,ymm20[25],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm23[21],zero,zero,ymm23[20],zero,ymm23[22],zero,ymm23[24],zero,zero,ymm23[23],zero,ymm23[25],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2 ; AVX512BW-FAST-NEXT: vpermd %ymm23, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm16, %ymm3 {%k1} +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm20, %ymm3 {%k1} ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm2 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> @@ -4571,75 +4580,75 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 64(%r9) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 128(%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 256(%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 256(%r9) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 192(%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i8_stride5_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; 
AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] ; AVX512DQBW-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm10 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm4, %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] ; AVX512DQBW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm2 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 ; AVX512DQBW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm16 -; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm15, %zmm2, %zmm9 ; AVX512DQBW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm3 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm23, %ymm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm17 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm24, %ymm18 ; AVX512DQBW-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 ; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm18, %xmm14, %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm13, %xmm13 ; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,1,1] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 @@ -4655,27 +4664,27 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm13, %zmm6 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <3,3,3,3,u,4,4,4> -; AVX512DQBW-SLOW-NEXT: vpermd %ymm16, %ymm13, %ymm17 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = mem[1,1,2,2] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm18 = 
ymm18[0,1,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermd %ymm15, %ymm13, %ymm17 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = mem[1,1,2,2] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,1,1,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm21, %zmm17 ; AVX512DQBW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k6 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; AVX512DQBW-SLOW-NEXT: # ymm17 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm21 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm21[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm21, %ymm25, %ymm28 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm15 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 ; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm15 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm27, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] @@ -4690,69 +4699,69 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] -; AVX512DQBW-SLOW-NEXT: vpermd %zmm16, %zmm15, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] +; AVX512DQBW-SLOW-NEXT: vpermd %zmm15, %zmm16, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 ; AVX512DQBW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm15, %xmm20 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 ; 
AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm16, %xmm22 -; AVX512DQBW-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm22, %xmm19 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm16, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm15, %xmm20 -; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm18, %xmm16, %xmm18 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm15, %xmm19 +; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm19, %xmm18 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm15, %xmm11 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm11, %zmm11 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,1,1,4,4,5,5] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5] ; AVX512DQBW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512DQBW-SLOW-NEXT: vpermd %zmm3, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermd %zmm2, %zmm7, %zmm2 ; AVX512DQBW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm7 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vpermd %ymm5, %ymm12, %ymm1 ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm4, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm5, %ymm4 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm4, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm21, %ymm5, %ymm4 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; 
AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpermd %ymm0, %ymm13, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpermd %ymm0, %ymm13, %ymm2 ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r9) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 9c0dbdf03484b0..0d76ee96578ebd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -237,64 +237,64 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-LABEL: store_i8_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,1,0,1] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm10, %xmm6 -; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,3] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pshuflw 
{{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm5, 32(%rax) -; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm4, 32(%rax) +; SSE-NEXT: movdqa %xmm1, 16(%rax) +; SSE-NEXT: movdqa %xmm10, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride6_vf8: @@ -444,197 +444,197 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm13 -; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: movdqa (%r8), %xmm11 -; SSE-NEXT: movdqa (%r9), %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa (%rdx), %xmm11 +; SSE-NEXT: movdqa (%rcx), %xmm0 +; SSE-NEXT: movdqa (%r8), %xmm9 +; SSE-NEXT: movdqa (%r9), %xmm12 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa 
{{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm9, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm14, %xmm7 +; SSE-NEXT: por %xmm13, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = 
xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm10, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm15, %xmm12 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,3] -; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm15, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 ; SSE-NEXT: por %xmm14, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,0,2,2,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,6,7,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: por %xmm10, %xmm13 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm9, 16(%rax) -; SSE-NEXT: movdqa %xmm14, 32(%rax) -; SSE-NEXT: movdqa %xmm3, 48(%rax) -; SSE-NEXT: movdqa %xmm15, 80(%rax) -; SSE-NEXT: movdqa %xmm10, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movdqa %xmm13, 16(%rax) +; SSE-NEXT: movdqa %xmm15, 32(%rax) +; SSE-NEXT: movdqa %xmm2, 48(%rax) +; SSE-NEXT: movdqa %xmm14, 80(%rax) +; SSE-NEXT: movdqa %xmm12, 64(%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride6_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3,4],xmm9[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5],xmm10[6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1,2],xmm10[3],xmm7[4,5],xmm10[6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,2,3,3] -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] 
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -825,294 +825,295 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $200, %rsp -; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rsi), %xmm5 ; SSE-NEXT: movdqa 16(%rdx), %xmm12 ; SSE-NEXT: movdqa 16(%rcx), %xmm4 -; SSE-NEXT: movdqa 16(%r8), %xmm11 +; SSE-NEXT: movdqa 16(%r8), %xmm10 ; SSE-NEXT: movdqa 16(%r9), %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm11, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 
= xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm6 ; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm14 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa (%rdx), %xmm11 -; SSE-NEXT: movdqa (%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,7,7] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa (%r8), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm6, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa (%r9), %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa (%r9), %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: pand %xmm9, %xmm15 ; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[1,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa 
%xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,0,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm8, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; 
SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 ; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,2,2] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] ; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,0,2,2,4,5,6,7] +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,1,1] +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: pshuflw $161, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: pshufd 
$0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pand %xmm7, %xmm13 -; SSE-NEXT: pshufd $0, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: pshufd $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,3] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, 32(%rax) -; SSE-NEXT: movdqa %xmm7, 48(%rax) +; SSE-NEXT: movdqa %xmm9, 32(%rax) +; SSE-NEXT: movdqa %xmm11, 48(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) ; SSE-NEXT: movdqa %xmm5, 112(%rax) -; SSE-NEXT: movdqa %xmm15, 160(%rax) -; SSE-NEXT: movdqa %xmm6, 176(%rax) -; SSE-NEXT: movdqa %xmm11, (%rax) -; SSE-NEXT: movdqa %xmm12, 16(%rax) +; SSE-NEXT: movdqa %xmm14, 160(%rax) +; SSE-NEXT: movdqa %xmm12, 176(%rax) +; SSE-NEXT: movdqa %xmm13, (%rax) +; SSE-NEXT: movdqa %xmm15, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1128,178 +1129,179 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm12[8,u],zero,zero,zero,zero,xmm12[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm11[8,u],zero,zero,zero,zero,xmm11[9,u],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm13 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm13[8],zero,zero,zero,zero,zero,xmm13[9],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm12[5,u],zero,zero,zero,zero,xmm12[6,u],zero,zero,zero,zero,xmm12[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm11[5,u],zero,zero,zero,zero,xmm11[6,u],zero,zero,zero,zero,xmm11[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb 
{{.*#+}} xmm3 = zero,zero,zero,xmm13[5],zero,zero,zero,zero,zero,xmm13[6],zero,zero,zero,zero,zero,xmm13[7] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[10,u],zero,zero,zero,zero,xmm12[11,u],zero,zero,zero,zero,xmm12[12,u],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[10,u],zero,zero,zero,zero,xmm11[11,u],zero,zero,zero,zero,xmm11[12,u],zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm13[10],zero,zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero,zero,xmm13[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm13[10],zero,zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero,zero,xmm13[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm11[13,u],zero,zero,zero,zero,xmm11[14,u],zero,zero,zero,zero,xmm11[15,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm13[13],zero,zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,zero,xmm13[15] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 
(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1,2],xmm2[3],xmm15[4,5],xmm2[6],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6],zero,xmm2[8,9,10,11,12],zero,xmm2[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm15 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm12[0,u],zero,zero,zero,zero,xmm12[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = 
xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3],xmm0[4],xmm8[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm1[13],zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,xmm1[15] +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1,2],xmm8[3],xmm15[4,5],xmm8[6],xmm15[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0],zero,xmm8[2,3,4,5,6],zero,xmm8[8,9,10,11,12],zero,xmm8[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm11[0,u],zero,zero,zero,zero,xmm11[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1],xmm12[2],xmm0[3,4],xmm12[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm13, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1,2],xmm11[3],xmm0[4,5],xmm11[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm8, 
%xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm2[0,u],zero,zero,zero,zero,xmm2[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7,8],zero,xmm4[10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8],zero,xmm3[10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5],zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,xmm1[8],zero,zero,zero,zero,zero,xmm1[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 96(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1317,124 +1319,120 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: pushq %rax ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = 
[255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[4],ymm10[4],ymm7[5],ymm10[5],ymm7[6],ymm10[6],ymm7[7],ymm10[7],ymm7[16],ymm10[16],ymm7[17],ymm10[17],ymm7[18],ymm10[18],ymm7[19],ymm10[19],ymm7[20],ymm10[20],ymm7[21],ymm10[21],ymm7[22],ymm10[22],ymm7[23],ymm10[23] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[16],ymm15[16],ymm7[17],ymm15[17],ymm7[18],ymm15[18],ymm7[19],ymm15[19],ymm7[20],ymm15[20],ymm7[21],ymm15[21],ymm7[22],ymm15[22],ymm7[23],ymm15[23] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm14 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm8 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[16],ymm12[16],ymm8[17],ymm12[17],ymm8[18],ymm12[18],ymm8[19],ymm12[19],ymm8[20],ymm12[20],ymm8[21],ymm12[21],ymm8[22],ymm12[22],ymm8[23],ymm12[23] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = 
ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[4],ymm15[4],ymm8[5],ymm15[5],ymm8[6],ymm15[6],ymm8[7],ymm15[7],ymm8[16],ymm15[16],ymm8[17],ymm15[17],ymm8[18],ymm15[18],ymm8[19],ymm15[19],ymm8[20],ymm15[20],ymm8[21],ymm15[21],ymm8[22],ymm15[22],ymm8[23],ymm15[23] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm15, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm12, %ymm15, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm15, %ymm12 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm15, %ymm13 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm12, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = 
ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm15, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm15, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb 
%ymm15, %ymm9, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm10, %ymm15 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm12[8],ymm2[9],ymm12[9],ymm2[10],ymm12[10],ymm2[11],ymm12[11],ymm2[12],ymm12[12],ymm2[13],ymm12[13],ymm2[14],ymm12[14],ymm2[15],ymm12[15],ymm2[24],ymm12[24],ymm2[25],ymm12[25],ymm2[26],ymm12[26],ymm2[27],ymm12[27],ymm2[28],ymm12[28],ymm2[29],ymm12[29],ymm2[30],ymm12[30],ymm2[31],ymm12[31] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11],ymm1[12],ymm13[12],ymm1[13],ymm13[13],ymm1[14],ymm13[14],ymm1[15],ymm13[15],ymm1[24],ymm13[24],ymm1[25],ymm13[25],ymm1[26],ymm13[26],ymm1[27],ymm13[27],ymm1[28],ymm13[28],ymm1[29],ymm13[29],ymm1[30],ymm13[30],ymm1[31],ymm13[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1446,132 +1444,122 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%rax) -; AVX2-SLOW-NEXT: popq %rax ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $40, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FAST-NEXT: 
vmovdqa (%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm9 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[16],ymm12[16],ymm9[17],ymm12[17],ymm9[18],ymm12[18],ymm9[19],ymm12[19],ymm9[20],ymm12[20],ymm9[21],ymm12[21],ymm9[22],ymm12[22],ymm9[23],ymm12[23] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[4],ymm15[4],ymm9[5],ymm15[5],ymm9[6],ymm15[6],ymm9[7],ymm15[7],ymm9[16],ymm15[16],ymm9[17],ymm15[17],ymm9[18],ymm15[18],ymm9[19],ymm15[19],ymm9[20],ymm15[20],ymm9[21],ymm15[21],ymm9[22],ymm15[22],ymm9[23],ymm15[23] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm14 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[16],ymm12[16],ymm8[17],ymm12[17],ymm8[18],ymm12[18],ymm8[19],ymm12[19],ymm8[20],ymm12[20],ymm8[21],ymm12[21],ymm8[22],ymm12[22],ymm8[23],ymm12[23] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[4],ymm15[4],ymm8[5],ymm15[5],ymm8[6],ymm15[6],ymm8[7],ymm15[7],ymm8[16],ymm15[16],ymm8[17],ymm15[17],ymm8[18],ymm15[18],ymm8[19],ymm15[19],ymm8[20],ymm15[20],ymm8[21],ymm15[21],ymm8[22],ymm15[22],ymm8[23],ymm15[23] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm15, %ymm9 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm15, %ymm13 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] -; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm13 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} 
xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm15, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 -; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[4],ymm13[4],ymm1[5],ymm13[5],ymm1[6],ymm13[6],ymm1[7],ymm13[7],ymm1[16],ymm13[16],ymm1[17],ymm13[17],ymm1[18],ymm13[18],ymm1[19],ymm13[19],ymm1[20],ymm13[20],ymm1[21],ymm13[21],ymm1[22],ymm13[22],ymm1[23],ymm13[23] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1580,135 +1568,125 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm15, 160(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm14, 64(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; 
AVX2-FAST-PERLANE-NEXT: subq $40, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = 
[255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[16],ymm12[16],ymm9[17],ymm12[17],ymm9[18],ymm12[18],ymm9[19],ymm12[19],ymm9[20],ymm12[20],ymm9[21],ymm12[21],ymm9[22],ymm12[22],ymm9[23],ymm12[23] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[4],ymm15[4],ymm9[5],ymm15[5],ymm9[6],ymm15[6],ymm9[7],ymm15[7],ymm9[16],ymm15[16],ymm9[17],ymm15[17],ymm9[18],ymm15[18],ymm9[19],ymm15[19],ymm9[20],ymm15[20],ymm9[21],ymm15[21],ymm9[22],ymm15[22],ymm9[23],ymm15[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[16],ymm12[16],ymm8[17],ymm12[17],ymm8[18],ymm12[18],ymm8[19],ymm12[19],ymm8[20],ymm12[20],ymm8[21],ymm12[21],ymm8[22],ymm12[22],ymm8[23],ymm12[23] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[4],ymm15[4],ymm8[5],ymm15[5],ymm8[6],ymm15[6],ymm8[7],ymm15[7],ymm8[16],ymm15[16],ymm8[17],ymm15[17],ymm8[18],ymm15[18],ymm8[19],ymm15[19],ymm8[20],ymm15[20],ymm8[21],ymm15[21],ymm8[22],ymm15[22],ymm8[23],ymm15[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm12, %ymm15, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 -; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm15, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm15, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm15, %ymm9, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[4],ymm13[4],ymm1[5],ymm13[5],ymm1[6],ymm13[6],ymm1[7],ymm13[7],ymm1[16],ymm13[16],ymm1[17],ymm13[17],ymm1[18],ymm13[18],ymm1[19],ymm13[19],ymm1[20],ymm13[20],ymm1[21],ymm13[21],ymm1[22],ymm13[22],ymm1[23],ymm13[23] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1717,10 +1695,9 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1733,77 +1710,77 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: 
vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512F-SLOW-NEXT: vprold $16, %ymm9, %ymm9 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm14, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm12, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm13 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm14 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm15[8],xmm11[9],xmm15[9],xmm11[10],xmm15[10],xmm11[11],xmm15[11],xmm11[12],xmm15[12],xmm11[13],xmm15[13],xmm11[14],xmm15[14],xmm11[15],xmm15[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX512F-SLOW-NEXT: vprold $16, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm13[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, 
%zmm7, %zmm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,0,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,0,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm14, %zmm8 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[16],ymm9[16],ymm7[17],ymm9[17],ymm7[18],ymm9[18],ymm7[19],ymm9[19],ymm7[20],ymm9[20],ymm7[21],ymm9[21],ymm7[22],ymm9[22],ymm7[23],ymm9[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm12, %zmm7 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; 
AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[16],ymm8[16],ymm6[17],ymm8[17],ymm6[18],ymm8[18],ymm6[19],ymm8[19],ymm6[20],ymm8[20],ymm6[21],ymm8[21],ymm6[22],ymm8[22],ymm6[23],ymm8[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] @@ -1811,7 +1788,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512F-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $202, %zmm7, %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $202, %zmm6, %zmm2, %zmm3 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm4 ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm2 @@ -1822,8 +1799,8 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -1862,11 +1839,11 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm6[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm8 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm7 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] @@ -1880,44 +1857,44 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm13 ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm12 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] ; AVX512F-FAST-NEXT: vprold $16, %xmm13, %xmm13 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm12[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm12 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm11 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm15 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm15, %zmm14 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, %zmm15, %zmm14 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3],xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm14, %zmm13 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm13[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm14, %zmm13 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512F-FAST-NEXT: vprold $16, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] -; AVX512F-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,10,10,10,11] +; AVX512F-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm5 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm15, %zmm3 +; AVX512F-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm14, %zmm3 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm11[8],xmm15[9],xmm11[9],xmm15[10],xmm11[10],xmm15[11],xmm11[11],xmm15[12],xmm11[12],xmm15[13],xmm11[13],xmm15[14],xmm11[14],xmm15[15],xmm11[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -2159,26 +2136,27 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride6_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa (%rcx), %xmm9 -; SSE-NEXT: movdqa (%r8), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm8 +; SSE-NEXT: movdqa (%rdx), %xmm5 +; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa (%r8), %xmm4 ; SSE-NEXT: movdqa (%r9), %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: pandn %xmm7, %xmm12 ; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd 
{{.*#+}} xmm7 = xmm11[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm13 @@ -2186,132 +2164,130 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm12, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,0,0,0] +; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: pandn %xmm7, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: por %xmm7, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,2,2] -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: pandn %xmm7, %xmm15 +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: por %xmm13, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[1,1,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: por %xmm15, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn 
%xmm7, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = 
xmm6[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm7 ; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa 16(%rcx), %xmm2 +; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] ; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa 16(%rcx), %xmm5 -; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} 
xmm4 = xmm11[0,0,1,1] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa 16(%r8), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa 16(%r8), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: pandn %xmm7, %xmm13 ; SSE-NEXT: pand %xmm3, %xmm6 @@ -2319,132 +2295,129 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa 16(%r9), %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm15, %xmm9 ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: por %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm10[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm13 ; SSE-NEXT: por %xmm13, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm12 ; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: pand %xmm9, %xmm15 ; SSE-NEXT: por %xmm15, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm13, %xmm9 ; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm9 ; SSE-NEXT: por %xmm12, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm10, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm11 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pand %xmm9, %xmm7 ; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm2 ; SSE-NEXT: movdqa 32(%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; 
SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa 32(%r8), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa 32(%r8), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: pandn %xmm7, %xmm12 ; SSE-NEXT: pand %xmm3, %xmm6 @@ -2452,149 +2425,150 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa 32(%r9), %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm13, %xmm9 ; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm12 ; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm15 ; SSE-NEXT: pandn %xmm12, %xmm15 -; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pand %xmm9, %xmm13 ; SSE-NEXT: por %xmm13, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 
+; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: pandn %xmm12, %xmm9 ; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: por %xmm15, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: por %xmm12, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm10, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm11 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,2,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm5, %xmm8 ; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = 
xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pand %xmm9, %xmm7 ; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por 
%xmm4, %xmm14 +; SSE-NEXT: movdqa 48(%rdx), %xmm10 ; SSE-NEXT: movdqa 48(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa 48(%rsi), %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa 48(%r8), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa 48(%r8), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa 48(%r9), %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,0,0] +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: movdqa 48(%r9), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,0,0] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm12, %xmm15 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: por %xmm13, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm12 ; SSE-NEXT: por %xmm12, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,2,2] @@ -2603,94 +2577,94 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm13 ; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} 
xmm12 = xmm6[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: pandn %xmm12, %xmm13 ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm12, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,0,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,3,3] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm14, 368(%rax) -; SSE-NEXT: movdqa %xmm10, 352(%rax) -; SSE-NEXT: movdqa %xmm4, 336(%rax) +; SSE-NEXT: movdqa %xmm2, 368(%rax) +; SSE-NEXT: movdqa %xmm11, 
352(%rax) +; SSE-NEXT: movdqa %xmm5, 336(%rax) ; SSE-NEXT: movdqa %xmm8, 320(%rax) ; SSE-NEXT: movdqa %xmm13, 304(%rax) ; SSE-NEXT: movdqa %xmm15, 288(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 272(%rax) +; SSE-NEXT: movdqa %xmm14, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2730,193 +2704,192 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $200, %rsp +; AVX1-ONLY-NEXT: subq $184, %rsp ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,1,2,3,4],zero,xmm5[6,7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4],xmm7[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm3[8],zero,zero,zero,zero,zero,xmm3[9],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm3[5],zero,zero,zero,zero,zero,xmm3[6],zero,zero,zero,zero,zero,xmm3[7] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6],xmm6[7] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb 
{{.*#+}} xmm6 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[13],zero,zero,zero,zero,zero,xmm3[14],zero,zero,zero,zero,zero,xmm3[15] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6],zero,xmm2[8,9,10,11,12],zero,xmm2[14,15] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,1,2],zero,xmm8[4,5,6,7,8],zero,xmm8[10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = 
zero,zero,zero,xmm8[13],zero,zero,zero,zero,zero,xmm8[14],zero,zero,zero,zero,zero,xmm8[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[0,1,2],zero,xmm7[4,5,6,7,8],zero,xmm7[10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm7[13],zero,zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,zero,xmm7[15] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[10,u],zero,zero,zero,zero,xmm6[11,u],zero,zero,zero,zero,xmm6[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5],xmm10[6],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 
= xmm9[0],xmm3[1,2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2],xmm8[3],xmm5[4,5],xmm8[6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5,6],zero,xmm5[8,9,10,11,12],zero,xmm5[14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[2,u],zero,zero,zero,zero,xmm6[3,u],zero,zero,zero,zero,xmm6[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0],zero,xmm4[2,3,4,5,6],zero,xmm4[8,9,10,11,12],zero,xmm4[14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = 
xmm3[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u> +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; 
AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -2938,17 +2911,17 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,1,2,3,4],zero,xmm2[6,7,8,9,10],zero,xmm2[12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] @@ -2963,12 +2936,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128> @@ -2979,126 +2952,123 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm2[10],zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,xmm2[12],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm1[13,u],zero,zero,zero,zero,xmm1[14,u],zero,zero,zero,zero,xmm1[15,u] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5],xmm15[6],xmm12[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1,2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = 
zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm15, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm13, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm10, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm10[1,2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm10, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm6 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2],xmm3[3],xmm8[4,5],xmm3[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5,6],zero,xmm3[8,9,10,11,12],zero,xmm3[14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0],zero,xmm8[2,3,4,5,6],zero,xmm8[8,9,10,11,12],zero,xmm8[14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3,4],xmm9[5],xmm6[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = 
zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw $161, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8],zero,xmm1[10,11,12,13,14],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3111,9 +3081,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) @@ -3135,343 +3105,341 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rax) -; AVX1-ONLY-NEXT: addq $200, %rsp +; AVX1-ONLY-NEXT: addq $184, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm14 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = 
ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm5, 
%xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm12, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: 
vmovdqa 32(%r9), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm11, %ymm12, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,0,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[4],ymm14[4],ymm10[5],ymm14[5],ymm10[6],ymm14[6],ymm10[7],ymm14[7],ymm10[16],ymm14[16],ymm10[17],ymm14[17],ymm10[18],ymm14[18],ymm10[19],ymm14[19],ymm10[20],ymm14[20],ymm10[21],ymm14[21],ymm10[22],ymm14[22],ymm10[23],ymm14[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm14 = 
ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[16],ymm10[16],ymm9[17],ymm10[17],ymm9[18],ymm10[18],ymm9[19],ymm10[19],ymm9[20],ymm10[20],ymm9[21],ymm10[21],ymm9[22],ymm10[22],ymm9[23],ymm10[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm12, %ymm14, %ymm12 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[2],ymm13[2],ymm8[3],ymm13[3],ymm8[4],ymm13[4],ymm8[5],ymm13[5],ymm8[6],ymm13[6],ymm8[7],ymm13[7],ymm8[16],ymm13[16],ymm8[17],ymm13[17],ymm8[18],ymm13[18],ymm8[19],ymm13[19],ymm8[20],ymm13[20],ymm8[21],ymm13[21],ymm8[22],ymm13[22],ymm8[23],ymm13[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[2],mem[2],ymm11[3],mem[3],ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[16],mem[16],ymm11[17],mem[17],ymm11[18],mem[18],ymm11[19],mem[19],ymm11[20],mem[20],ymm11[21],mem[21],ymm11[22],mem[22],ymm11[23],mem[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm14, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[0],ymm15[1],mem[1],ymm15[2],mem[2],ymm15[3],mem[3],ymm15[4],mem[4],ymm15[5],mem[5],ymm15[6],mem[6],ymm15[7],mem[7],ymm15[16],mem[16],ymm15[17],mem[17],ymm15[18],mem[18],ymm15[19],mem[19],ymm15[20],mem[20],ymm15[21],mem[21],ymm15[22],mem[22],ymm15[23],mem[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[16],ymm13[16],ymm12[17],ymm13[17],ymm12[18],ymm13[18],ymm12[19],ymm13[19],ymm12[20],ymm13[20],ymm12[21],ymm13[21],ymm12[22],ymm13[22],ymm12[23],ymm13[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[16],mem[16],ymm3[17],mem[17],ymm3[18],mem[18],ymm3[19],mem[19],ymm3[20],mem[20],ymm3[21],mem[21],ymm3[22],mem[22],ymm3[23],mem[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm11, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm11 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm15, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm12, %ymm14, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm14, %ymm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm13, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm6, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm13, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm5 = 
ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15],ymm12[24],ymm13[24],ymm12[25],ymm13[25],ymm12[26],ymm13[26],ymm12[27],ymm13[27],ymm12[28],ymm13[28],ymm12[29],ymm13[29],ymm12[30],ymm13[30],ymm12[31],ymm13[31] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[8],mem[8],ymm5[9],mem[9],ymm5[10],mem[10],ymm5[11],mem[11],ymm5[12],mem[12],ymm5[13],mem[13],ymm5[14],mem[14],ymm5[15],mem[15],ymm5[24],mem[24],ymm5[25],mem[25],ymm5[26],mem[26],ymm5[27],mem[27],ymm5[28],mem[28],ymm5[29],mem[29],ymm5[30],mem[30],ymm5[31],mem[31] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15],ymm9[24],ymm10[24],ymm9[25],ymm10[25],ymm9[26],ymm10[26],ymm9[27],ymm10[27],ymm9[28],ymm10[28],ymm9[29],ymm10[29],ymm9[30],ymm10[30],ymm9[31],ymm10[31] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; 
AVX2-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15],ymm3[24],mem[24],ymm3[25],mem[25],ymm3[26],mem[26],ymm3[27],mem[27],ymm3[28],mem[28],ymm3[29],mem[29],ymm3[30],mem[30],ymm3[31],mem[31] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm11[8],mem[8],ymm11[9],mem[9],ymm11[10],mem[10],ymm11[11],mem[11],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15],ymm11[24],mem[24],ymm11[25],mem[25],ymm11[26],mem[26],ymm11[27],mem[27],ymm11[28],mem[28],ymm11[29],mem[29],ymm11[30],mem[30],ymm11[31],mem[31] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = 
[10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm9 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 352(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 352(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 320(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 256(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $664, %rsp # imm = 0x298 +; AVX2-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 @@ -3479,325 +3447,328 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm12 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm8 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: 
vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vpshufb %ymm4, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: 
vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: 
vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm11, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm14[8],xmm9[9],xmm14[9],xmm9[10],xmm14[10],xmm9[11],xmm14[11],xmm9[12],xmm14[12],xmm9[13],xmm14[13],xmm9[14],xmm14[14],xmm9[15],xmm14[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15],ymm0[24],ymm13[24],ymm0[25],ymm13[25],ymm0[26],ymm13[26],ymm0[27],ymm13[27],ymm0[28],ymm13[28],ymm0[29],ymm13[29],ymm0[30],ymm13[30],ymm0[31],ymm13[31] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15],ymm13[24],ymm14[24],ymm13[25],ymm14[25],ymm13[26],ymm14[26],ymm13[27],ymm14[27],ymm13[28],ymm14[28],ymm13[29],ymm14[29],ymm13[30],ymm14[30],ymm13[31],ymm14[31] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm12 -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm15[8],ymm3[9],ymm15[9],ymm3[10],ymm15[10],ymm3[11],ymm15[11],ymm3[12],ymm15[12],ymm3[13],ymm15[13],ymm3[14],ymm15[14],ymm3[15],ymm15[15],ymm3[24],ymm15[24],ymm3[25],ymm15[25],ymm3[26],ymm15[26],ymm3[27],ymm15[27],ymm3[28],ymm15[28],ymm3[29],ymm15[29],ymm3[30],ymm15[30],ymm3[31],ymm15[31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[16],ymm14[16],ymm13[17],ymm14[17],ymm13[18],ymm14[18],ymm13[19],ymm14[19],ymm13[20],ymm14[20],ymm13[21],ymm14[21],ymm13[22],ymm14[22],ymm13[23],ymm14[23] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[16],mem[16],ymm3[17],mem[17],ymm3[18],mem[18],ymm3[19],mem[19],ymm3[20],mem[20],ymm3[21],mem[21],ymm3[22],mem[22],ymm3[23],mem[23] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm15[0],mem[0],ymm15[1],mem[1],ymm15[2],mem[2],ymm15[3],mem[3],ymm15[4],mem[4],ymm15[5],mem[5],ymm15[6],mem[6],ymm15[7],mem[7],ymm15[16],mem[16],ymm15[17],mem[17],ymm15[18],mem[18],ymm15[19],mem[19],ymm15[20],mem[20],ymm15[21],mem[21],ymm15[22],mem[22],ymm15[23],mem[23] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 
= ymm5[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 
160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 288(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 288(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 352(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 320(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 224(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 256(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-PERLANE-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm5 @@ -3805,904 +3776,915 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 -; 
AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm12, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm14[8],xmm9[9],xmm14[9],xmm9[10],xmm14[10],xmm9[11],xmm14[11],xmm9[12],xmm14[12],xmm9[13],xmm14[13],xmm9[14],xmm14[14],xmm9[15],xmm14[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15],ymm0[24],ymm13[24],ymm0[25],ymm13[25],ymm0[26],ymm13[26],ymm0[27],ymm13[27],ymm0[28],ymm13[28],ymm0[29],ymm13[29],ymm0[30],ymm13[30],ymm0[31],ymm13[31] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15],ymm13[24],ymm14[24],ymm13[25],ymm14[25],ymm13[26],ymm14[26],ymm13[27],ymm14[27],ymm13[28],ymm14[28],ymm13[29],ymm14[29],ymm13[30],ymm14[30],ymm13[31],ymm14[31] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm15[8],ymm3[9],ymm15[9],ymm3[10],ymm15[10],ymm3[11],ymm15[11],ymm3[12],ymm15[12],ymm3[13],ymm15[13],ymm3[14],ymm15[14],ymm3[15],ymm15[15],ymm3[24],ymm15[24],ymm3[25],ymm15[25],ymm3[26],ymm15[26],ymm3[27],ymm15[27],ymm3[28],ymm15[28],ymm3[29],ymm15[29],ymm3[30],ymm15[30],ymm3[31],ymm15[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, 
%ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm14, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = 
ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = 
ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[16],ymm14[16],ymm13[17],ymm14[17],ymm13[18],ymm14[18],ymm13[19],ymm14[19],ymm13[20],ymm14[20],ymm13[21],ymm14[21],ymm13[22],ymm14[22],ymm13[23],ymm14[23] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[16],mem[16],ymm3[17],mem[17],ymm3[18],mem[18],ymm3[19],mem[19],ymm3[20],mem[20],ymm3[21],mem[21],ymm3[22],mem[22],ymm3[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0],mem[0],ymm15[1],mem[1],ymm15[2],mem[2],ymm15[3],mem[3],ymm15[4],mem[4],ymm15[5],mem[5],ymm15[6],mem[6],ymm15[7],mem[7],ymm15[16],mem[16],ymm15[17],mem[17],ymm15[18],mem[18],ymm15[19],mem[19],ymm15[20],mem[20],ymm15[21],mem[21],ymm15[22],mem[22],ymm15[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = 
ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm14, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-PERLANE-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride6_vf64: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512F-SLOW-NEXT: 
vmovdqa64 %xmm2, %xmm27 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm28 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm7, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm6 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: # 
ymm13 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm27 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm0 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm14, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15],ymm15[24],ymm14[24],ymm15[25],ymm14[25],ymm15[26],ymm14[26],ymm15[27],ymm14[27],ymm15[28],ymm14[28],ymm15[29],ymm14[29],ymm15[30],ymm14[30],ymm15[31],ymm14[31] +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15],ymm10[24],ymm11[24],ymm10[25],ymm11[25],ymm10[26],ymm11[26],ymm10[27],ymm11[27],ymm10[28],ymm11[28],ymm10[29],ymm11[29],ymm10[30],ymm11[30],ymm10[31],ymm11[31] +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] ; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm8[8],ymm0[8],ymm8[9],ymm0[9],ymm8[10],ymm0[10],ymm8[11],ymm0[11],ymm8[12],ymm0[12],ymm8[13],ymm0[13],ymm8[14],ymm0[14],ymm8[15],ymm0[15],ymm8[24],ymm0[24],ymm8[25],ymm0[25],ymm8[26],ymm0[26],ymm8[27],ymm0[27],ymm8[28],ymm0[28],ymm8[29],ymm0[29],ymm8[30],ymm0[30],ymm8[31],ymm0[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm5 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[16],ymm2[16],ymm5[17],ymm2[17],ymm5[18],ymm2[18],ymm5[19],ymm2[19],ymm5[20],ymm2[20],ymm5[21],ymm2[21],ymm5[22],ymm2[22],ymm5[23],ymm2[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm7 ; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[16],ymm0[16],ymm7[17],ymm0[17],ymm7[18],ymm0[18],ymm7[19],ymm0[19],ymm7[20],ymm0[20],ymm7[21],ymm0[21],ymm7[22],ymm0[22],ymm7[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15],ymm2[24],ymm15[24],ymm2[25],ymm15[25],ymm2[26],ymm15[26],ymm2[27],ymm15[27],ymm2[28],ymm15[28],ymm2[29],ymm15[29],ymm2[30],ymm15[30],ymm2[31],ymm15[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[16],ymm7[16],ymm12[17],ymm7[17],ymm12[18],ymm7[18],ymm12[19],ymm7[19],ymm12[20],ymm7[20],ymm12[21],ymm7[21],ymm12[22],ymm7[22],ymm12[23],ymm7[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm12 = 
ymm13[8],ymm0[8],ymm13[9],ymm0[9],ymm13[10],ymm0[10],ymm13[11],ymm0[11],ymm13[12],ymm0[12],ymm13[13],ymm0[13],ymm13[14],ymm0[14],ymm13[15],ymm0[15],ymm13[24],ymm0[24],ymm13[25],ymm0[25],ymm13[26],ymm0[26],ymm13[27],ymm0[27],ymm13[28],ymm0[28],ymm13[29],ymm0[29],ymm13[30],ymm0[30],ymm13[31],ymm0[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[16],ymm0[16],ymm13[17],ymm0[17],ymm13[18],ymm0[18],ymm13[19],ymm0[19],ymm13[20],ymm0[20],ymm13[21],ymm0[21],ymm13[22],ymm0[22],ymm13[23],ymm0[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm24 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[16],ymm15[16],ymm2[17],ymm15[17],ymm2[18],ymm15[18],ymm2[19],ymm15[19],ymm2[20],ymm15[20],ymm2[21],ymm15[21],ymm2[22],ymm15[22],ymm2[23],ymm15[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[4],ymm0[4],ymm8[5],ymm0[5],ymm8[6],ymm0[6],ymm8[7],ymm0[7],ymm8[16],ymm0[16],ymm8[17],ymm0[17],ymm8[18],ymm0[18],ymm8[19],ymm0[19],ymm8[20],ymm0[20],ymm8[21],ymm0[21],ymm8[22],ymm0[22],ymm8[23],ymm0[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[16],ymm14[16],ymm15[17],ymm14[17],ymm15[18],ymm14[18],ymm15[19],ymm14[19],ymm15[20],ymm14[20],ymm15[21],ymm14[21],ymm15[22],ymm14[22],ymm15[23],ymm14[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; AVX512F-SLOW-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-SLOW-NEXT: vprold $16, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm1, 
%xmm3, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm16 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm8 ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm2 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm14[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm19, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm12, %zmm1 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm29[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm30, %ymm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,0,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm11 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm20[0,0,0,1] +; AVX512F-SLOW-NEXT: vprold $16, %ymm29, %ymm20 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm15[0,0,0,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm20[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm7 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm14, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm7, %ymm20, %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm29, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512F-SLOW-NEXT: 
vpternlogq $184, %zmm0, %zmm5, %zmm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm31[0,0,0,1] +; AVX512F-SLOW-NEXT: vprold $16, %ymm22, %ymm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm28[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm21[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm18[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm13[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm16[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm21[0,0,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm7 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm30[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm18[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm16[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm9 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm8, %ymm6 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm9, %ymm4, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm26[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm6 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm14, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm6, %ymm20, %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm27[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = 
zmm27[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm11 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm29 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm8, %ymm30 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm28[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm7 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm7 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm6, %ymm31 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm14, %ymm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm8, %ymm31 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm31[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $64, (%rsp), %zmm8 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm8 -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm1 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm11 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm24[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm13 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm10 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm9, %ymm7 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm13, %ymm9, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, 
%zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm25[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm22[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm9, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm7, %ymm14, %ymm12 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm6, %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm12[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm7 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vpermq $234, (%rsp), %zmm0 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm23[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm25[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm26[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm10 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm20, %ymm8 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm20, %ymm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm9, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm6, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm3 = mem[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm24[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm2 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; 
AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512F-SLOW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride6_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm2 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm4 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm4 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm31 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11],ymm8[12],ymm14[12],ymm8[13],ymm14[13],ymm8[14],ymm14[14],ymm8[15],ymm14[15],ymm8[24],ymm14[24],ymm8[25],ymm14[25],ymm8[26],ymm14[26],ymm8[27],ymm14[27],ymm8[28],ymm14[28],ymm8[29],ymm14[29],ymm8[30],ymm14[30],ymm8[31],ymm14[31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15],ymm6[24],ymm10[24],ymm6[25],ymm10[25],ymm6[26],ymm10[26],ymm6[27],ymm10[27],ymm6[28],ymm10[28],ymm6[29],ymm10[29],ymm6[30],ymm10[30],ymm6[31],ymm10[31] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm6[8],ymm11[9],ymm6[9],ymm11[10],ymm6[10],ymm11[11],ymm6[11],ymm11[12],ymm6[12],ymm11[13],ymm6[13],ymm11[14],ymm6[14],ymm11[15],ymm6[15],ymm11[24],ymm6[24],ymm11[25],ymm6[25],ymm11[26],ymm6[26],ymm11[27],ymm6[27],ymm11[28],ymm6[28],ymm11[29],ymm6[29],ymm11[30],ymm6[30],ymm11[31],ymm6[31] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm15[8],ymm9[8],ymm15[9],ymm9[9],ymm15[10],ymm9[10],ymm15[11],ymm9[11],ymm15[12],ymm9[12],ymm15[13],ymm9[13],ymm15[14],ymm9[14],ymm15[15],ymm9[15],ymm15[24],ymm9[24],ymm15[25],ymm9[25],ymm15[26],ymm9[26],ymm15[27],ymm9[27],ymm15[28],ymm9[28],ymm15[29],ymm9[29],ymm15[30],ymm9[30],ymm15[31],ymm9[31] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, 
%ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm2 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm26 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm10, %zmm28 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512F-FAST-NEXT: 
vmovdqa (%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm0[2,2,2,3] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm0[2,2,2,3] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm0[2,2,2,3] -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm15 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm15, %xmm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm23 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm0[2,2,2,3] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm12 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = 
xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm27 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm10 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm24 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[4],ymm6[4],ymm11[5],ymm6[5],ymm11[6],ymm6[6],ymm11[7],ymm6[7],ymm11[16],ymm6[16],ymm11[17],ymm6[17],ymm11[18],ymm6[18],ymm11[19],ymm6[19],ymm11[20],ymm6[20],ymm11[21],ymm6[21],ymm11[22],ymm6[22],ymm11[23],ymm6[23] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm22 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm15[0],ymm9[0],ymm15[1],ymm9[1],ymm15[2],ymm9[2],ymm15[3],ymm9[3],ymm15[4],ymm9[4],ymm15[5],ymm9[5],ymm15[6],ymm9[6],ymm15[7],ymm9[7],ymm15[16],ymm9[16],ymm15[17],ymm9[17],ymm15[18],ymm9[18],ymm15[19],ymm9[19],ymm15[20],ymm9[20],ymm15[21],ymm9[21],ymm15[22],ymm9[22],ymm15[23],ymm9[23] ; AVX512F-FAST-NEXT: vprold $16, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[4],ymm14[4],ymm8[5],ymm14[5],ymm8[6],ymm14[6],ymm8[7],ymm14[7],ymm8[16],ymm14[16],ymm8[17],ymm14[17],ymm8[18],ymm14[18],ymm8[19],ymm14[19],ymm8[20],ymm14[20],ymm8[21],ymm14[21],ymm8[22],ymm14[22],ymm8[23],ymm14[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,0,1,10,10,10,11] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = 
[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm2 -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm30, %zmm6 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[16],ymm10[16],ymm6[17],ymm10[17],ymm6[18],ymm10[18],ymm6[19],ymm10[19],ymm6[20],ymm10[20],ymm6[21],ymm10[21],ymm6[22],ymm10[22],ymm6[23],ymm10[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm16 -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm8 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[4],ymm8[4],ymm1[5],ymm8[5],ymm1[6],ymm8[6],ymm1[7],ymm8[7],ymm1[16],ymm8[16],ymm1[17],ymm8[17],ymm1[18],ymm8[18],ymm1[19],ymm8[19],ymm1[20],ymm8[20],ymm1[21],ymm8[21],ymm1[22],ymm8[22],ymm1[23],ymm8[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,1,10,10,10,11] +; AVX512F-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm30, %zmm9 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512F-FAST-NEXT: 
vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX512F-FAST-NEXT: vprold $16, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm13 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm15 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm4 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm30, %zmm5 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = 
xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm30, %zmm11 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm8 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm13[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm12[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm14[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm15[0,0,0,1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm6, %ymm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm1[0,0,0,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm15, %ymm0 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm5, %ymm5 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-FAST-NEXT: vpermt2q %zmm10, %zmm31, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm10, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm14 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = 
ymm12[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm10[0,0,0,1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm10, %ymm8 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm10, %ymm3 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm11 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm4, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm7 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm10, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm8[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm3 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm30, %zmm3 ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm30, %zmm2 -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm5 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm30, %zmm5 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm0, %ymm15, %ymm21 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm5, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm0, %ymm15, %ymm8 -; 
AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm0, %ymm22 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm22[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm30, %zmm5 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm10, %ymm28 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm5, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm10, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm3, %ymm2, %ymm29 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm29[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm6 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm23[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm26[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm2, %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm14[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm5 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm27[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq $64, (%rsp), %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm7 = mem[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm8, %zmm5 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm24[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm28[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm8, %zmm9 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm5, %ymm1 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm1, %ymm0, %ymm3 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm1 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm1, %ymm0, 
%ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm10, %ymm7 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm10, %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm27[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm22[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm9 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm7, %ymm3 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm3, %ymm2, %ymm31 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm3 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm3, %ymm2, %ymm12 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm7, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm2 = mem[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm1 = mem[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm12[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm29[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512F-FAST-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512F-FAST-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride6_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] ; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] ; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 @@ -4715,7 +4697,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm14 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6 @@ -4744,10 +4726,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] ; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 @@ -4756,7 +4738,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24 @@ -4789,7 +4771,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm13[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7] @@ -4818,74 +4800,74 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm13, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm13, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm13[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm13, %zmm16 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = 
ymm14[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm17[8],xmm13[8],xmm17[9],xmm13[9],xmm17[10],xmm13[10],xmm17[11],xmm13[11],xmm17[12],xmm13[12],xmm17[13],xmm13[13],xmm17[14],xmm13[14],xmm17[15],xmm13[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm8, %ymm10, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, 
%xmm9, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[2,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm9, %zmm11, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm14, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm9, %zmm11, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm8 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm1, %ymm10, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: 
vpshufb %xmm13, %xmm4, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm11, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm11, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) @@ -4896,109 +4878,109 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-LABEL: store_i8_stride6_vf64: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm2 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm1, %ymm9, %ymm1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm10, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm2 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512BW-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm3 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15],ymm7[24],ymm5[24],ymm7[25],ymm5[25],ymm7[26],ymm5[26],ymm7[27],ymm5[27],ymm7[28],ymm5[28],ymm7[29],ymm5[29],ymm7[30],ymm5[30],ymm7[31],ymm5[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm13, %ymm2 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm16, %ymm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: movl $613566756, %eax # imm = 0x24924924 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = +; AVX512BW-FAST-NEXT: vpshufb %zmm17, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7] ; AVX512BW-FAST-NEXT: movl $-1840700270, %eax # imm = 0x92492492 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm1[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm11, %zmm11 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm3, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-FAST-NEXT: vpshufb %zmm18, %zmm9, %zmm9 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] ; AVX512BW-FAST-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm0 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm6 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm17[0],ymm6[1],ymm17[1],ymm6[2],ymm17[2],ymm6[3],ymm17[3],ymm6[4],ymm17[4],ymm6[5],ymm17[5],ymm6[6],ymm17[6],ymm6[7],ymm17[7],ymm6[16],ymm17[16],ymm6[17],ymm17[17],ymm6[18],ymm17[18],ymm6[19],ymm17[19],ymm6[20],ymm17[20],ymm6[21],ymm17[21],ymm6[22],ymm17[22],ymm6[23],ymm17[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm17 = ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15],ymm14[24],ymm11[24],ymm14[25],ymm11[25],ymm14[26],ymm11[26],ymm14[27],ymm11[27],ymm14[28],ymm11[28],ymm14[29],ymm11[29],ymm14[30],ymm11[30],ymm14[31],ymm11[31] -; AVX512BW-FAST-NEXT: vpermw %ymm17, %ymm9, %ymm9 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm18 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm18, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm12 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[16],ymm6[16],ymm12[17],ymm6[17],ymm12[18],ymm6[18],ymm12[19],ymm6[19],ymm12[20],ymm6[20],ymm12[21],ymm6[21],ymm12[22],ymm6[22],ymm12[23],ymm6[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm13, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm9 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm9, %zmm0 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm13 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm7 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm13[0],ymm7[1],ymm13[1],ymm7[2],ymm13[2],ymm7[3],ymm13[3],ymm7[4],ymm13[4],ymm7[5],ymm13[5],ymm7[6],ymm13[6],ymm7[7],ymm13[7],ymm7[16],ymm13[16],ymm7[17],ymm13[17],ymm7[18],ymm13[18],ymm7[19],ymm13[19],ymm7[20],ymm13[20],ymm7[21],ymm13[21],ymm7[22],ymm13[22],ymm7[23],ymm13[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15],ymm11[24],ymm9[24],ymm11[25],ymm9[25],ymm11[26],ymm9[26],ymm11[27],ymm9[27],ymm11[28],ymm9[28],ymm11[29],ymm9[29],ymm11[30],ymm9[30],ymm11[31],ymm9[31] +; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm10, %ymm10 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm15 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[4],ymm7[4],ymm15[5],ymm7[5],ymm15[6],ymm7[6],ymm15[7],ymm7[7],ymm15[16],ymm7[16],ymm15[17],ymm7[17],ymm15[18],ymm7[18],ymm15[19],ymm7[19],ymm15[20],ymm7[20],ymm15[21],ymm7[21],ymm15[22],ymm7[22],ymm15[23],ymm7[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm16, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm10, %zmm7 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vpshufb %zmm17, %zmm8, %zmm8 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm6 {%k2} 
+; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm7 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm10 -; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm10, %zmm10 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm6 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm22 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm21 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm12, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm23 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm15, %xmm13 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpshufb %zmm18, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm12, %zmm7 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm19 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm17 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm23 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm17, %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm18 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm18, %xmm15 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] -; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm24, %ymm13 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm17 +; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm24, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm20 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm25 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm17, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm20 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm20, %xmm16 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm16[8],xmm10[8],xmm16[9],xmm10[9],xmm16[10],xmm10[10],xmm16[11],xmm10[11],xmm16[12],xmm10[12],xmm16[13],xmm10[13],xmm16[14],xmm10[14],xmm16[15],xmm10[15] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3],xmm20[4],xmm17[4],xmm20[5],xmm17[5],xmm20[6],xmm17[6],xmm20[7],xmm17[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm20, %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm21 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm21, %xmm16 +; 
AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm16[8],xmm12[8],xmm16[9],xmm12[9],xmm16[10],xmm12[10],xmm16[11],xmm12[11],xmm16[12],xmm12[12],xmm16[13],xmm12[13],xmm16[14],xmm12[14],xmm16[15],xmm12[15] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] ; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm26, %ymm16 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm15, %zmm12 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm15 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = <8,u,9,u,u,u,u,u,u,u,5,u,6,u,7,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm13, %xmm16 -; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero +; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm15, %xmm16 +; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero,xmm15[4],zero,xmm15[5],zero,xmm15[6],zero,xmm15[7],zero ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm28, %zmm16 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512BW-FAST-NEXT: vpermw %zmm16, %zmm28, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vpermw %zmm16, %zmm28, %zmm12 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm16 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = ; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm16, %xmm30 @@ -5008,91 +4990,91 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermw %zmm30, %zmm28, %zmm30 ; AVX512BW-FAST-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm30, %zmm10 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm22, %xmm30 -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm23, %xmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm21[0],xmm30[0],xmm21[1],xmm30[1],xmm21[2],xmm30[2],xmm21[3],xmm30[3],xmm21[4],xmm30[4],xmm21[5],xmm30[5],xmm21[6],xmm30[6],xmm21[7],xmm30[7] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm30, %zmm12 {%k3} +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm19, %xmm30 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm23[0],xmm30[0],xmm23[1],xmm30[1],xmm23[2],xmm30[2],xmm23[3],xmm30[3],xmm23[4],xmm30[4],xmm23[5],xmm30[5],xmm23[6],xmm30[6],xmm23[7],xmm30[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7] ; AVX512BW-FAST-NEXT: vpermw %ymm30, %ymm24, %ymm24 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm30 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm31, 
%xmm21 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm24, %zmm24 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm31, %xmm23 ; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm30, %xmm25 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm25[8],xmm21[8],xmm25[9],xmm21[9],xmm25[10],xmm21[10],xmm25[11],xmm21[11],xmm25[12],xmm21[12],xmm25[13],xmm21[13],xmm25[14],xmm21[14],xmm25[15],xmm21[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15] ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm31[0],xmm30[1],xmm31[1],xmm30[2],xmm31[2],xmm30[3],xmm31[3],xmm30[4],xmm31[4],xmm30[5],xmm31[5],xmm30[6],xmm31[6],xmm30[7],xmm31[7] ; AVX512BW-FAST-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm25, %zmm21 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm23 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %xmm25 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm21 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm23 {%k2} ; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm25, %xmm24 ; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm24 -; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm28, %zmm21 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm24, %xmm26 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm24[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm27, %zmm26 -; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm28, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm21 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm18 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[16],ymm11[16],ymm14[17],ymm11[17],ymm14[18],ymm11[18],ymm14[19],ymm11[19],ymm14[20],ymm11[20],ymm14[21],ymm11[21],ymm14[22],ymm11[22],ymm14[23],ymm11[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm14, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = 
[5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] -; AVX512BW-FAST-NEXT: vpermw %zmm18, %zmm19, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512BW-FAST-NEXT: vpermw %zmm3, %zmm14, %zmm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm26 +; AVX512BW-FAST-NEXT: vpermw %zmm24, %zmm28, %zmm23 {%k1} +; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm26, %xmm24 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm26[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm27, %zmm24 +; AVX512BW-FAST-NEXT: vpermw %zmm24, %zmm28, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm24, %zmm23 {%k3} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[16],ymm9[16],ymm11[17],ymm9[17],ymm11[18],ymm9[18],ymm11[19],ymm9[19],ymm11[20],ymm9[20],ymm11[21],ymm9[21],ymm11[22],ymm9[22],ymm11[23],ymm9[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-FAST-NEXT: vpermw %zmm9, %zmm11, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512BW-FAST-NEXT: vpermw %zmm13, %zmm14, %zmm9 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm11, %zmm1 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm19, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm14, %zmm1 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] ; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm4 ; AVX512BW-FAST-NEXT: movl $1227133513, %eax # imm = 0x49249249 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm9 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm26[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 ; AVX512BW-FAST-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; 
AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm9 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k2} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -5100,24 +5082,24 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-LABEL: store_i8_stride6_vf64: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] ; AVX512DQBW-SLOW-NEXT: 
vpermw %zmm0, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] ; AVX512DQBW-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 @@ -5130,7 +5112,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm14 ; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] ; AVX512DQBW-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6 @@ -5159,10 +5141,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 ; AVX512DQBW-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] ; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 @@ -5171,7 +5153,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20 ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; 
AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQBW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30 ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24 @@ -5204,7 +5186,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm13[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7] @@ -5233,74 +5215,74 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm13, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm13, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm13[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm13, %zmm16 {%k2} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12 ; AVX512DQBW-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm17 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm17[8],xmm13[8],xmm17[9],xmm13[9],xmm17[10],xmm13[10],xmm17[11],xmm13[11],xmm17[12],xmm13[12],xmm17[13],xmm13[13],xmm17[14],xmm13[14],xmm17[15],xmm13[15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] ; 
AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm8, %ymm10, %ymm8 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm17 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[2,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] +; AVX512DQBW-SLOW-NEXT: vpermw %zmm9, %zmm11, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm14, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm9, %zmm11, %zmm9 ; AVX512DQBW-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 ; 
AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm8 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm1, %ymm10, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2} ; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] ; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm11, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; 
AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm11, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 7893a799c207ab..ba7f2a108d27c7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -101,45 +101,45 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm5 -; SSE-NEXT: movdqa (%r10), %xmm2 +; SSE-NEXT: movdqa (%r10), %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,0,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm8, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; 
SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,2,0,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] -; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm7, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm9, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,3,1,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] @@ -148,22 +148,22 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movq %xmm3, 16(%rax) ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE-NEXT: movd %xmm0, 24(%rax) @@ -288,100 +288,103 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-LABEL: store_i8_stride7_vf8: ; SSE: # %bb.0: ; 
SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movq {{.*#+}} xmm10 = mem[0],zero ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm14 = mem[0],zero -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: por %xmm7, %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm10[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,0] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: por %xmm12, %xmm9 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = 
xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,0] +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: pand %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: por %xmm13, %xmm9 +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] ; SSE-NEXT: movdqa %xmm12, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,3] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm14, %xmm6 -; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: packuswb %xmm7, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,0,2,3] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm11, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,2,1] +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm11, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; 
SSE-NEXT: pandn %xmm14, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: por %xmm11, %xmm14 ; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; SSE-NEXT: packuswb %xmm2, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; SSE-NEXT: packuswb %xmm2, %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: pand %xmm12, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] ; SSE-NEXT: pand %xmm2, %xmm12 ; SSE-NEXT: pandn %xmm14, %xmm2 @@ -390,61 +393,62 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: packuswb %xmm13, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] -; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm15[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,0,0] +; SSE-NEXT: pandn %xmm11, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] ; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4] -; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,4] +; SSE-NEXT: pandn %xmm6, 
%xmm8 ; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3] ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por 
%xmm1, %xmm0 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm2, 48(%rax) -; SSE-NEXT: movdqa %xmm1, 16(%rax) +; SSE-NEXT: movq %xmm0, 48(%rax) +; SSE-NEXT: movdqa %xmm9, 16(%rax) ; SSE-NEXT: movdqa %xmm12, 32(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf8: @@ -813,25 +817,23 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: subq $72, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm5 -; SSE-NEXT: movdqa (%r8), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movdqa (%rcx), %xmm7 +; SSE-NEXT: movdqa (%r8), %xmm15 +; SSE-NEXT: movdqa (%r9), %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm2 @@ -840,142 +842,139 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: por %xmm3, 
%xmm10 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa (%rax), %xmm7 -; SSE-NEXT: por %xmm10, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,7,7,7] -; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa (%rax), %xmm3 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,0] ; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm4, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm12, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm7, %xmm4 -; SSE-NEXT: por %xmm14, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = 
xmm9[0,1,2,3,4,5,5,7] +; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm12, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6] +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,1,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] ; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,6,6] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm12, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -983,179 +982,177 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: pshuflw $233, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] +; SSE-NEXT: 
movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: por %xmm5, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: 
por %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4] -; SSE-NEXT: pandn %xmm7, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm5, %xmm12 +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm12, %xmm5 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,2,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: pshuflw $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,4] +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: por %xmm5, %xmm11 ; SSE-NEXT: pand %xmm6, %xmm11 ; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm0, 16(%rax) +; SSE-NEXT: movdqa %xmm2, 16(%rax) ; SSE-NEXT: movdqa %xmm10, 32(%rax) -; SSE-NEXT: movdqa %xmm3, 64(%rax) -; SSE-NEXT: movdqa %xmm14, (%rax) +; SSE-NEXT: movdqa %xmm0, 64(%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: addq $72, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf16: @@ 
-1169,20 +1166,20 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm9[6,7],zero,zero,zero,zero,zero,xmm9[8,9],zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[4,5],zero,zero,zero,zero,zero,xmm10[6,7],zero,zero,zero,zero,zero,xmm10[8,9] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm10[0,1],zero,zero,zero,zero,zero,xmm10[2,3],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[0,1],zero,zero,zero,zero,zero,xmm9[2,3],zero,zero,zero,zero,zero,xmm9[4,5] -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,4,5],zero,xmm8[u,u,u,u,6,7],zero,xmm8[u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm8[6,7],zero,zero,zero,zero,zero,xmm8[8,9],zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4,5],zero,zero,zero,zero,zero,xmm9[6,7],zero,zero,zero,zero,zero,xmm9[8,9] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm9[0,1],zero,zero,zero,zero,zero,xmm9[2,3],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[0,1],zero,zero,zero,zero,zero,xmm8[2,3],zero,zero,zero,zero,zero,xmm8[4,5] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,4,5],zero,xmm10[u,u,u,u,6,7],zero,xmm10[u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u],zero,zero,xmm1[2,u,u,u,u],zero,zero,xmm1[3,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 @@ -1197,24 +1194,24 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm11, %xmm12, 
%xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm10[10,11],zero,zero,zero,zero,zero,xmm10[12,13],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[10,11],zero,zero,zero,zero,zero,xmm9[12,13],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm9[10,11],zero,zero,zero,zero,zero,xmm9[12,13],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[10,11],zero,zero,zero,zero,zero,xmm8[12,13],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,7],zero,xmm0[u,u,u,u,u,8],zero,xmm0[u,u] ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm11[u,u,u,u,5,6],zero,xmm11[u,u,u,u,12,13],zero,xmm11[u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[6,u,u,u,u],zero,zero,xmm1[7,u,u,u,u],zero,zero,xmm1[8,u] ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9],zero,xmm8[u,u,u,u,10,11],zero,xmm8[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9],zero,xmm10[u,u,u,u,10,11],zero,xmm10[u,u,u,u,12,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm1[4,u,u,u,u],zero,zero,xmm1[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm9[8,9],zero,zero,zero,zero,zero,xmm9[10,11],zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] @@ -1264,19 +1261,19 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} 
ymm11 = ymm7[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm9[5],zero,zero,zero,zero,zero,zero,ymm9[6],zero,zero,zero,zero,zero,zero,zero,ymm9[23],zero,zero,zero,zero,zero,zero,ymm9[24],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero,zero,ymm11[25] -; AVX2-SLOW-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vpor %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpor %ymm12, %ymm11, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4],zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero @@ -1286,7 +1283,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] @@ -1294,9 +1291,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,2,0,2] +; AVX2-SLOW-NEXT: 
vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero ; AVX2-SLOW-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] @@ -1308,13 +1305,13 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[3,19],zero,zero,zero,zero,zero,ymm8[28,20],zero,zero,zero,zero,zero,ymm8[29,21],zero -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,1,1,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[1],zero,zero,zero,zero,zero,ymm9[10,2],zero,zero,zero,zero,zero,ymm9[11,3],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero,zero,zero,zero,ymm9[21,29],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[2,10],zero,zero,zero,zero,zero,ymm7[3,19],zero,zero,zero,zero,zero,ymm7[28,20],zero,zero,zero,zero,zero,ymm7[29,21],zero +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[12,13],zero,zero,zero,zero,zero,xmm5[14,15],zero,zero,zero ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] @@ -1331,7 +1328,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1346,8 +1343,8 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm3 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm4 ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vinserti128 
$1, %xmm6, %ymm5, %ymm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[1,1,0,0,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] @@ -1356,9 +1353,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,2,0,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,ymm11[0,8],zero,zero,zero,zero,zero,ymm11[1,9],zero,zero,zero,zero,zero,ymm11[18,26],zero,zero,zero,zero,zero,ymm11[19,27],zero,zero,zero,zero,zero,ymm11[20,28] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,2,0,2] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[2,10],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28],zero,zero ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] @@ -1371,9 +1368,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm13 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,ymm13[1,5],zero,zero,zero,zero,zero,ymm13[2,6],zero,zero,zero,zero,zero,ymm13[19,23],zero,zero,zero,zero,zero,ymm13[24,28],zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,ymm12[1,5],zero,zero,zero,zero,zero,ymm12[2,6],zero,zero,zero,zero,zero,ymm12[19,23],zero,zero,zero,zero,zero,ymm12[24,28],zero,zero,zero,zero,zero,ymm12[25] ; AVX2-FAST-NEXT: vpor %ymm13, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] @@ -1386,11 +1383,11 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero -; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,1,1,3] 
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[1],zero,zero,zero,zero,zero,ymm9[10,2],zero,zero,zero,zero,zero,ymm9[11,3],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero,zero,zero,zero,ymm9[21,29],zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[3,19],zero,zero,zero,zero,zero,ymm8[28,20],zero,zero,zero,zero,zero,ymm8[29,21],zero +; AVX2-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] @@ -1421,54 +1418,54 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[5],zero,zero,zero,zero,zero,zero,ymm9[6],zero,zero,zero,zero,zero,ymm9[23],zero,zero,zero,zero,zero,zero,ymm9[24],zero,zero,zero,zero,zero,zero,ymm9[25] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm6[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm11[4],zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm8, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm9, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,zero,zero,zero,ymm6[10,2],zero,zero,zero,zero,zero,ymm6[11,3],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero,zero,zero,zero,ymm6[21,29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[2,10],zero,zero,zero,zero,zero,ymm7[3,19],zero,zero,zero,zero,zero,ymm7[28,20],zero,zero,zero,zero,zero,ymm7[29,21],zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,3,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[2,10],zero,zero,zero,zero,zero,ymm6[3,19],zero,zero,zero,zero,zero,ymm6[28,20],zero,zero,zero,zero,zero,ymm6[29,21],zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] @@ -1478,14 +1475,14 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm8[13,14,15,4,5],zero,zero,xmm8[14,15,14,15,12],zero,zero,xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm9[13,14,15,4,5],zero,zero,xmm9[14,15,14,15,12],zero,zero,xmm9[15] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1575,21 +1572,21 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm1 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 
-; AVX512F-FAST-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u] ; AVX512F-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] ; AVX512F-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3] @@ -1597,11 +1594,11 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u] ; AVX512F-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,5,6] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3] ; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm0[1,3,1,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] ; AVX512F-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 @@ -1619,21 +1616,21 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[1,1,0,0,4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa 
{{.*#+}} ymm7 = ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4],zero,ymm0[u,u,u,u,1,5],zero,ymm0[u,u,u,u,2,6],zero,ymm0[u,u,u,u,19,23],zero,ymm0[u,u,u,u,24,28],zero,ymm0[u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, 96(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-FAST-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1642,84 +1639,84 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, (%r9), %ymm6, %ymm6 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, (%r10), %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm4 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 +; AVX512BW-SLOW-NEXT: vinserti128 $1, (%r9), %ymm4, %ymm4 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, (%r10), %zmm4, %zmm4 +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = 
xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm8 ; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,7,7,7] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,3,2] ; AVX512BW-SLOW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[12,13],zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[12,13],zero,zero,zero,zero,zero,xmm3[14,15],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512BW-SLOW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm7, %xmm2 {%k1} -; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,1,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm7, %xmm0 {%k1} +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[1,3,1,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-SLOW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,3,1] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[3,1,1,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm5[1],zero,zero,zero,zero,zero,ymm5[10,2],zero,zero,zero,zero,zero,ymm5[11,3],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero,zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,3,3,1] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm5[3,1,1,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[1],zero,zero,zero,zero,zero,ymm3[10,2],zero,zero,zero,zero,zero,ymm3[11,3],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero,zero,zero,zero,ymm3[21,29],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512BW-SLOW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512BW-SLOW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512BW-SLOW-NEXT: vpermi2w %zmm4, %zmm8, %zmm2 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[4],zero,zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512BW-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] -; AVX512BW-SLOW-NEXT: vpor %ymm7, %ymm6, %ymm6 +; AVX512BW-SLOW-NEXT: vpor %ymm7, %ymm4, %ymm4 ; AVX512BW-SLOW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28] -; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm4 {%k1} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,2,0,2] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,2,0,2] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28] +; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, 96(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512BW-SLOW-NEXT: 
vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -1819,130 +1816,132 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $360, %rsp # imm = 0x168 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rdx), %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r8), %xmm6 -; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm4 +; SSE-NEXT: movdqa 16(%rcx), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm7 +; SSE-NEXT: movdqa 16(%r9), %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa 
%xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa 16(%rax), %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa 16(%rax), %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3] @@ -1950,473 +1949,469 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm7 
+; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm7 ; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,1,2,3] +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 ; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: por %xmm7, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: por %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,0,3] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%rsi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,2,3] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,0,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm7 ; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd 
{{.*#+}} xmm4 = xmm2[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa (%r9), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,1,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] +; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa (%r8), %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: movdqa (%rax), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa (%rax), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,5,7] -; SSE-NEXT: 
movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,6,5,7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm14 ; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14 ; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; 
SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; 
SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,2] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 
= xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,1] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,1,1,0] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshuflw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,0] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] ; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,0,0,4,5,6,7] +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw 
{{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm14 -; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshuflw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,0] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshufhw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2424,84 +2419,83 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pandn %xmm1, %xmm13 ; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = 
[255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm12 ; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pshufhw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = 
[255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: por %xmm5, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm4, 32(%rax) +; SSE-NEXT: movdqa %xmm15, 32(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm7, 112(%rax) +; SSE-NEXT: movdqa %xmm4, 112(%rax) ; SSE-NEXT: movdqa %xmm14, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) @@ -2528,267 +2522,263 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $216, %rsp +; AVX1-ONLY-NEXT: subq $184, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u],zero,zero,xmm14[9,u,u,u,u],zero,zero,xmm14[10,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u],zero,zero,xmm12[9,u,u,u,u],zero,zero,xmm12[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u],zero,xmm10[7,u,u,u,u,u],zero,xmm10[8,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> +; 
AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm2, %ymm15 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm2, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,7],zero,xmm5[u,u,u,u,u,8],zero,xmm5[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u],zero,xmm1[7,u,u,u,u,u],zero,xmm1[8,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm4[4,u,u,u,u],zero,zero,xmm4[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm11, %ymm15 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u],zero,xmm8[7,u,u,u,u,u],zero,xmm8[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,7],zero,xmm7[u,u,u,u,u,8],zero,xmm7[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = 
[0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm12[4,u,u,u,u],zero,zero,xmm12[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm15 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, 
%ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm4, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm4, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm12, %ymm13 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0],zero,xmm6[2,3,4,5,6,7],zero,xmm6[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} 
xmm13 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5,6,7],zero,xmm5[9,10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm3[13],zero,zero,zero,zero,zero,zero,xmm3[14],zero,zero,zero,zero,zero,zero,xmm3[15] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[11],zero,zero,zero,zero,zero,zero,xmm3[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,2,3],zero,xmm8[u,u,u,u,4,5],zero,xmm8[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7],zero,xmm8[u,u,u,u,8,9],zero,xmm8[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = 
zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[10],zero,xmm9[u,u,u,u,13,12],zero,xmm9[u,u,u,u,15,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm7[13,u,u,u,u],zero,zero,xmm7[14,u,u,u,u],zero,zero,xmm7[15] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u],zero,zero,xmm7[2,u,u,u,u],zero,zero,xmm7[3,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,4,5],zero,xmm9[u,u,u,u,6,7],zero,xmm9[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; 
AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: addq $216, %rsp +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 192(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 208(%rax) +; AVX1-ONLY-NEXT: addq $184, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2837,21 +2827,21 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm11 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb 
%ymm12, %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm15 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = @@ -2859,8 +2849,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u,u,9] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] @@ -2870,33 +2860,33 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[1,1,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm0, %ymm0 ; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] @@ -2913,64 +2903,62 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm11[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm11 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = 
zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18],zero -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31] +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) @@ -2992,9 +2980,9 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpunpckhbw 
{{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> @@ -3004,17 +2992,17 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm15 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u,u],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u,u,9] ; AVX2-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] @@ -3025,8 +3013,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-FAST-NEXT: 
vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero ; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> @@ -3036,18 +3024,18 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm7 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm12, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[1,1,0,0,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = @@ -3163,9 +3151,9 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] 
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> @@ -3173,17 +3161,17 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u,u],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u,u,9] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] @@ -3194,8 +3182,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero ; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> @@ -3205,17 +3193,17 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm12, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = @@ -3322,10 +3310,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r10), %ymm4 @@ -3341,135 +3329,138 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vpshufb 
{{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm5[23],zero,ymm5[23,24,25,26],zero,ymm5[24],zero,ymm5[30,31] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vporq %zmm7, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] ; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpandq %ymm16, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm11[18,19,20,21],zero,ymm11[19],zero,ymm11[25,26,27,22],zero,ymm11[20],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[18],zero,zero,zero,zero,ymm14[21],zero,ymm14[19],zero,zero,zero,zero,ymm14[22],zero,ymm14[20] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vporq %zmm9, %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vporq %zmm9, %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,1,1,4,4,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] ; AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpandq %ymm17, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm11[23],zero,ymm11[21,22,23,26],zero,ymm11[24],zero,ymm11[28,29,26,27] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20 -; 
AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vporq %zmm10, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u,u,9] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm7[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u],zero,xmm7[7],zero,xmm7[5,u,u,u],zero,xmm7[8],zero,xmm7[6,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm13[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm19 = zmm10[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r10), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: 
vpshufb {{.*#+}} xmm10 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm0[0,0,1,0,4,4,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14,u,u],zero,zero,zero,zero,ymm1[15,u,u],zero,zero,zero,zero,ymm1[16,u,u],zero,zero,zero,zero,ymm1[17,u,u],zero,zero,zero,zero,ymm1[18] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,14],zero,ymm2[u,u,u,u,u,15],zero,ymm2[u,u,u,u,u,16],zero,ymm2[u,u,u,u,u,17],zero,ymm2[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vporq %zmm12, %zmm8, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero,xmm11[u,u,u,9] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero,xmm15[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm13, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r10), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4] ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u,u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm1 +; 
AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm2[14,u,u],zero,zero,zero,zero,ymm2[15,u,u],zero,zero,zero,zero,ymm2[16,u,u],zero,zero,zero,zero,ymm2[17,u,u],zero,zero,zero,zero,ymm2[18] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,14],zero,ymm2[u,u,u,u,u,15],zero,ymm2[u,u,u,u,u,16],zero,ymm2[u,u,u,u,u,17],zero,ymm2[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u,u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm8, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = 
[255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm7, %ymm9, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm8, %ymm9, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm16, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm16, %ymm0, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -3481,9 +3472,9 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm5 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %ymm17 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 (%r10), %ymm17 +; AVX512F-FAST-NEXT: vmovdqa64 (%r10), %ymm16 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 @@ -3501,33 +3492,33 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm13, %zmm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm10[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm13 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm10 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm13 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] -; AVX512F-FAST-NEXT: vmovdqa (%r8), 
%xmm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX512F-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm15 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm15[4,u,u,u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[u,u,u,7],zero,xmm0[5],zero,xmm0[u,u,u,8],zero,xmm0[6],zero +; AVX512F-FAST-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm15, %ymm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm1[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero +; AVX512F-FAST-NEXT: vpor %ymm1, %ymm13, %ymm1 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] ; 
AVX512F-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 @@ -3535,66 +3526,69 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[u,u,u,u,u,14],zero,ymm1[u,u,u,u,u,15],zero,ymm1[u,u,u,u,u,16],zero,ymm1[u,u,u,u,u,17],zero,ymm1[u,u,u] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm12[u,u,u,u,u,14],zero,ymm12[u,u,u,u,u,15],zero,ymm12[u,u,u,u,u,16],zero,ymm12[u,u,u,u,u,17],zero,ymm12[u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-FAST-NEXT: vpandn %ymm8, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm13[13,u,u,u,u],zero,zero,ymm13[14,u,u,u,u],zero,zero,ymm13[15,u,u,u,u],zero,zero,ymm13[16,u,u,u,u],zero,zero,ymm13[17,u,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,5,5,6] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,2,3,3,2,2,3,3] +; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm8, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm14 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm14[13,u,u,u,u],zero,zero,ymm14[14,u,u,u,u],zero,zero,ymm14[15,u,u,u,u],zero,zero,ymm14[16,u,u,u,u],zero,zero,ymm14[17,u,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm8 ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 ; AVX512F-FAST-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpand %ymm0, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm9, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm2[19],zero,ymm2[21,20,21,22],zero,ymm2[20],zero,ymm2[22,23] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm10, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vporq %zmm9, 
%zmm10, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[20],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm12[18],zero,ymm12[20,21,20,21],zero,ymm12[19],zero,ymm12[19,20,21,22],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm12[23],zero,ymm12[23,24,25,26],zero,ymm12[24],zero,ymm12[30,31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm2[19],zero,ymm2[21,20,21,22],zero,ymm2[20],zero,ymm2[22,23] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vporq %zmm0, %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm12 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,5,4,0,5,5,4,0] +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[20],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm0, %zmm11, %zmm0 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,5,4,0,5,5,4,0] -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: 
vpermd %ymm11, %ymm12, %ymm11 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero @@ -3608,16 +3602,16 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm1, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm9, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, 192(%rax) +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -3626,10 +3620,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm14 ; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm5 ; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm6 ; AVX512DQ-SLOW-NEXT: vmovdqa (%r10), 
%ymm4 @@ -3645,135 +3639,138 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm5[23],zero,ymm5[23,24,25,26],zero,ymm5[24],zero,ymm5[30,31] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vporq %zmm7, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] ; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpandq %ymm16, %ymm8, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm11[18,19,20,21],zero,ymm11[19],zero,ymm11[25,26,27,22],zero,ymm11[20],zero +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[18],zero,zero,zero,zero,ymm14[21],zero,ymm14[19],zero,zero,zero,zero,ymm14[22],zero,ymm14[20] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vporq %zmm9, %zmm8, %zmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vporq %zmm9, %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,1,1,4,4,5,5] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] ; AVX512DQ-SLOW-NEXT: # ymm17 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpandq %ymm17, %ymm8, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm11[23],zero,ymm11[21,22,23,26],zero,ymm11[24],zero,ymm11[28,29,26,27] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512DQ-SLOW-NEXT: 
vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vporq %zmm10, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u,u,9] -; AVX512DQ-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm7[0,1,0,1,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u],zero,xmm7[7],zero,xmm7[5,u,u,u],zero,xmm7[8],zero,xmm7[6,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm14, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm13[0,1,0,1,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm13 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm19 = zmm10[0,1,0,1,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r10), %xmm15 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[1,1,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm0[0,1,2,0] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm0[0,0,1,0,4,4,5,4] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm10 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14,u,u],zero,zero,zero,zero,ymm1[15,u,u],zero,zero,zero,zero,ymm1[16,u,u],zero,zero,zero,zero,ymm1[17,u,u],zero,zero,zero,zero,ymm1[18] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,14],zero,ymm2[u,u,u,u,u,15],zero,ymm2[u,u,u,u,u,16],zero,ymm2[u,u,u,u,u,17],zero,ymm2[u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vporq %zmm12, %zmm8, %zmm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm11[u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero,xmm11[u,u,u,9] +; AVX512DQ-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm13, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero,xmm15[u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm13, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r10), %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[1,1,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4] ; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u,u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm2[14,u,u],zero,zero,zero,zero,ymm2[15,u,u],zero,zero,zero,zero,ymm2[16,u,u],zero,zero,zero,zero,ymm2[17,u,u],zero,zero,zero,zero,ymm2[18] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,14],zero,ymm2[u,u,u,u,u,15],zero,ymm2[u,u,u,u,u,16],zero,ymm2[u,u,u,u,u,17],zero,ymm2[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u,u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm8, %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,5,5,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpandn %ymm7, %ymm9, %ymm7 +; AVX512DQ-SLOW-NEXT: vpandn %ymm8, %ymm9, %ymm8 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm16, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm16, %ymm0, %ymm5 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-SLOW-NEXT: 
vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -3788,18 +3785,18 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] ; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm13 ; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 @@ -3809,14 +3806,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm12 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm8 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512BW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] @@ -3825,17 +3822,17 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 ; AVX512BW-SLOW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} ; AVX512BW-SLOW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm9 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm9[18,19,20,21],zero,zmm9[19],zero,zmm9[25,26,27,22],zero,zmm9[20],zero,zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm9[55],zero,zero,zero,zero,zmm9[58],zero,zmm9[56],zero,zero,zero,zero,zmm9[59],zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm8 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm8[18,19,20,21],zero,zmm8[19],zero,zmm8[25,26,27,22],zero,zmm8[20],zero,zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm8[55],zero,zero,zero,zero,zmm8[58],zero,zmm8[56],zero,zero,zero,zero,zmm8[59],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm15 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm15 = 
zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[18],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero,zmm15[59],zero,zmm15[57] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vporq %zmm9, %zmm15, %zmm9 +; AVX512BW-SLOW-NEXT: vporq %zmm8, %zmm15, %zmm8 ; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5] ; AVX512BW-SLOW-NEXT: movl $676341840, %ecx # imm = 0x28502850 @@ -3850,7 +3847,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 ; AVX512BW-SLOW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm15, %zmm15 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] @@ -3863,28 +3860,28 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} ; AVX512BW-SLOW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm9 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k2} ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u] ; AVX512BW-SLOW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm13[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] ; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm8 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm13, %zmm9 {%k2} ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512BW-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[4],zero,xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero +; AVX512BW-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] @@ -3895,7 +3892,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k2} ; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm8 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2} ; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] ; AVX512BW-SLOW-NEXT: movl $338170920, %ecx # imm = 0x14281428 @@ -3925,8 +3922,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq @@ -3951,9 +3948,9 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm13 ; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 @@ -3966,8 +3963,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 ; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm13 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm12 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm10 @@ -4014,12 +4011,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm10 {%k1} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u] ; AVX512BW-FAST-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512BW-FAST-NEXT: vpshufb 
{{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm13[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] ; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 @@ -4029,11 +4026,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm13, %zmm8 {%k1} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512BW-FAST-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[4],zero,xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero +; AVX512BW-FAST-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm11, %zmm9 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] @@ -4103,23 +4100,23 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: subq $648, %rsp # imm = 0x288 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 48(%rdi), %xmm14 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 ; SSE-NEXT: movdqa 48(%rsi), %xmm2 ; SSE-NEXT: movdqa 48(%rdx), %xmm3 -; SSE-NEXT: movdqa 48(%rcx), %xmm10 -; SSE-NEXT: movdqa 48(%r8), %xmm5 -; SSE-NEXT: movdqa 48(%r9), %xmm8 -; SSE-NEXT: movdqa 48(%rax), %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa 48(%rcx), %xmm4 +; SSE-NEXT: movdqa 48(%r8), %xmm10 +; SSE-NEXT: movdqa 48(%r9), %xmm5 +; SSE-NEXT: movdqa 48(%rax), %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = 
[255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] @@ -4129,8 +4126,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] @@ -4143,157 +4141,154 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand 
%xmm8, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm6, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pand 
%xmm15, %xmm11 +; SSE-NEXT: por %xmm11, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm9, %xmm11 ; SSE-NEXT: pandn %xmm6, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 ; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: pand %xmm15, %xmm6 ; SSE-NEXT: por %xmm6, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: por %xmm11, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = 
[255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: pand %xmm14, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm6 ; SSE-NEXT: por %xmm6, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] @@ -4301,12 +4296,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rsi), %xmm0 @@ -4314,62 +4309,61 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: movdqa (%rcx), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa (%r9), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,1,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] -; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: movdqa (%r8), %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa (%rax), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm11 ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm6, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; 
SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rsi), %xmm0 @@ -4377,13 +4371,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa 16(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4392,13 +4386,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa 16(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 @@ -4409,1060 +4403,1044 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] -; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: movdqa 16(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movdqa 16(%rax), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm11 ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm6, %xmm14 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,0,3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; 
SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa 32(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa 32(%rdx), %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa 32(%r9), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa 32(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa 32(%rdx), %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: movdqa 32(%r9), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r8), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: movdqa 32(%rax), %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 
32(%rax), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm10, 
%xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw $170, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,2,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,0] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: por %xmm15, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: pand 
%xmm4, %xmm3 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,0] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[1,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[1,1,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm15, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm10, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,3] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: por %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, 
%xmm8 -; SSE-NEXT: pand %xmm12, %xmm8 -; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: 
pandn %xmm5, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; 
SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; 
SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,0] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,5,7] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,0,0] +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm8, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,1,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} 
xmm0 = xmm11[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm14, %xmm10 ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: por 
%xmm8, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,5,7,7] +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,2] -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: por %xmm6, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] +; SSE-NEXT: pand %xmm15, %xmm7 +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: 
pandn %xmm0, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,2] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 ; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] -; SSE-NEXT: pand %xmm12, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,0] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm7, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pandn %xmm8, %xmm15 -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: por %xmm10, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm10, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; 
SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm8, %xmm15 -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: por %xmm10, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: por %xmm7, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm4, %xmm14 +; SSE-NEXT: por %xmm8, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; 
SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,5,7] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm14, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm15, %xmm9 -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm14, %xmm7 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm8, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: por %xmm15, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm10 -; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm8, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] ; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,2] -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,2] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm8, %xmm14 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; 
SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: por %xmm14, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,2,1] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] -; 
SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: pandn %xmm4, %xmm6 ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] -; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,2] +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] ; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm10, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm3, 368(%rax) -; SSE-NEXT: movdqa %xmm1, 352(%rax) +; SSE-NEXT: movdqa %xmm4, 368(%rax) +; SSE-NEXT: movdqa %xmm8, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 320(%rax) +; SSE-NEXT: movdqa %xmm11, 320(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%rax) -; SSE-NEXT: movdqa %xmm9, 256(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, 256(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rax) @@ -5511,38 +5489,39 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 +; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm4 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u],zero,zero,xmm6[11,u,u,u,u],zero,zero,xmm6[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u],zero,zero,xmm2[11,u,u,u,u],zero,zero,xmm2[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 
16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,6,7],zero,xmm2[u,u,u,u,8,9],zero,xmm2[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 @@ -5553,169 +5532,167 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,3],zero,xmm0[u,u,u,u,4,5],zero,xmm0[u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u],zero,zero,xmm6[9,u,u,u,u],zero,zero,xmm6[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,2,3],zero,xmm2[u,u,u,u,4,5],zero,xmm2[u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: 
vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; 
AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[4,u,u,u,u],zero,zero,xmm2[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[4,u,u,u,u],zero,zero,xmm1[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u],zero,zero,xmm10[11,u,u,u,u],zero,zero,xmm10[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u],zero,zero,xmm10[9,u,u,u,u],zero,zero,xmm10[10,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; 
AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u],zero,zero,xmm9[2,u,u,u,u],zero,zero,xmm9[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 @@ -5723,246 +5700,247 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,7],zero,xmm8[u,u,u,u,u,8],zero,xmm8[u,u] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,7],zero,xmm7[u,u,u,u,u,8],zero,xmm7[u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u] -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[6,u,u,u,u],zero,zero,xmm7[7,u,u,u,u],zero,zero,xmm7[8,u] +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[6,u,u,u,u],zero,zero,xmm8[7,u,u,u,u],zero,zero,xmm8[8,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw 
{{.*#+}} xmm0 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u> ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm0[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u],zero,xmm13[7,u,u,u,u,u],zero,xmm13[8,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,7],zero,xmm12[u,u,u,u,u,8],zero,xmm12[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u],zero,xmm0[7,u,u,u,u,u],zero,xmm0[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u],zero,xmm1[7,u,u,u,u,u],zero,xmm1[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, 
%ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[u,u,6,7,8,9],zero,xmm4[u,u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm6[9,u,u],zero,zero,zero,zero,xmm6[10,u,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10,11],zero,xmm4[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm12[9],zero,zero,zero,zero,zero,zero,xmm12[10],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4],zero,xmm3[u,u,8,9,10,11],zero,xmm3[u,u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u],zero,zero,zero,zero,xmm6[7,u,u],zero,zero,zero,zero,xmm6[8,u,u],zero +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[u,u,6,7,8,9],zero,xmm3[u,u,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm2[9,u,u],zero,zero,zero,zero,xmm2[10,u,u],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3],zero,xmm3[u,6,7,8,9,10],zero,xmm3[u,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm0[9,u],zero,zero,zero,zero,zero,xmm0[10,u],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4,5],zero,xmm3[u,8,9,10,11,12],zero,xmm3[u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10,11],zero,xmm3[13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm9[9],zero,zero,zero,zero,zero,zero,xmm9[10],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4,5,6],zero,xmm3[8,9,10,11,12,13],zero,xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[6],zero,zero,zero,zero,zero,zero,xmm12[7],zero,zero,zero,zero,zero,zero,xmm12[8],zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 
= xmm1[u,1,2,3,4],zero,xmm1[u,u,8,9,10,11],zero,xmm1[u,u,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u],zero,zero,zero,zero,xmm2[7,u,u],zero,zero,zero,zero,xmm2[8,u,u],zero +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,1,2,3,4,5],zero,xmm1[u,8,9,10,11,12],zero,xmm1[u,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u],zero,zero,zero,zero,zero,xmm0[7,u],zero,zero,zero,zero,zero,xmm0[8,u],zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6],zero,xmm1[8,9,10,11,12,13],zero,xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[6],zero,zero,zero,zero,zero,zero,xmm9[7],zero,zero,zero,zero,zero,zero,xmm9[8],zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0],zero,xmm1[u,u,4,5,6,7],zero,xmm1[u,u,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm5[11,u,u],zero,zero,zero,zero,xmm5[12,u,u],zero,zero,zero,zero,xmm5[13] ; AVX1-ONLY-NEXT: vpor 
%xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[u,4,5,6,7,8],zero,xmm3[u,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[11,u],zero,zero,zero,zero,zero,xmm5[12,u],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm0[11,u],zero,zero,zero,zero,zero,xmm0[12,u],zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8,9],zero,xmm3[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[13,u],zero,zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,zero,xmm5[15,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm9[11],zero,zero,zero,zero,zero,zero,xmm9[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u,4,5,6,7,0],zero,xmm1[u,11,12,13,14,1],zero,xmm1[u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[13,u],zero,zero,zero,zero,zero,xmm0[14,u],zero,zero,zero,zero,zero,xmm0[15,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6,7],zero,xmm1[9,10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm9[13],zero,zero,zero,zero,zero,zero,xmm9[14],zero,zero,zero,zero,zero,zero,xmm9[15] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = 
<0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[13,u,u,u,u],zero,zero,xmm13[14,u,u,u,u],zero,zero,xmm13[15] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm10 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 ; 
AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm13 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] @@ -5974,70 +5952,69 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 ; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = 
[255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: 
vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6054,13 +6031,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) -; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 +; AVX1-ONLY-NEXT: addq $600, %rsp # imm = 0x258 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $824, %rsp # imm = 0x338 +; AVX2-SLOW-NEXT: subq $840, %rsp # imm = 0x348 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6145,32 +6122,38 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm6 +; 
AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] @@ -6178,307 +6161,306 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm3 = xmm0[1,1,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm10 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm15, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm12 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm14 -; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpblendvb %ymm14, 
%ymm2, %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm13 +; AVX2-SLOW-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm14 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm15 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm14 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm15 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm15, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 -; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, 
%ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm14[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm15[8],xmm11[9],xmm15[9],xmm11[10],xmm15[10],xmm11[11],xmm15[11],xmm11[12],xmm15[12],xmm11[13],xmm15[13],xmm11[14],xmm15[14],xmm11[15],xmm15[15] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; 
AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} 
ymm5 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[18],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero,ymm15[27] +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = 
<0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm7 -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm15 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = 
[128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm13, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm10 -; AVX2-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm8 +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, 
%ymm9, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm14, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,3,4,6,7,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm15[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6501,8 +6483,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 416(%rax) -; AVX2-SLOW-NEXT: addq $824, %rsp # imm = 0x338 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 416(%rax) +; AVX2-SLOW-NEXT: addq $840, %rsp # imm = 0x348 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -6514,8 +6496,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 @@ -6536,13 +6518,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; 
AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -6564,11 +6546,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -6578,46 +6560,50 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm13 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,2,0,0,1] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = 
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] @@ -6626,67 +6612,68 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm9 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm14 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm13 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm13, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 +; 
AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm3 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm14 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm14 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm13 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm13, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm15, %ymm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm12 +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm11 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] @@ -6694,7 +6681,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = 
xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] @@ -6706,17 +6694,16 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,6] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,6] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 @@ -6727,7 +6714,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,5,5,6] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 @@ -6738,184 +6725,183 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm2, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm2, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[23],zero,ymm15[27,20,21,26],zero,ymm15[24],zero,ymm15[26,27,26,27],zero,ymm15[25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero +; AVX2-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-NEXT: 
vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 
-; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm7, %ymm8, %ymm14 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,4,5,5,7,4,5] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm5, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm14, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpor %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb 
%ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm8 +; AVX2-FAST-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6950,8 +6936,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 @@ -6972,13 +6958,13 @@ define 
void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -7000,11 +6986,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -7016,124 +7002,120 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm13 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm12, %ymm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm15, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, 
%ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm10, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] @@ -7143,32 +7125,32 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7176,184 +7158,182 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero,zero +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[23],zero,ymm5[27,20,21,26],zero,ymm5[24],zero,ymm5[26,27,26,27],zero,ymm5[25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[23],zero,ymm14[27,20,21,26],zero,ymm14[24],zero,ymm14[26,27,26,27],zero,ymm14[25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero,ymm15[27],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm15, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm15, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm14, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb 
%ymm9, %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm13, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm15, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm15, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm12 # 
32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm14, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm8, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm13, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7382,467 +7362,457 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512F-SLOW-LABEL: store_i8_stride7_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $1416, %rsp # imm = 0x588 +; AVX512F-SLOW-NEXT: subq $1384, %rsp # imm = 0x568 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm11 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm6 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm9 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] ; 
AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm23 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] -; AVX512F-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm15[27],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] ; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 
+; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm3 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm8[23],zero,ymm8[21,22,23,26],zero,ymm8[24],zero,ymm8[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm8[18,19,20,21],zero,ymm8[19],zero,ymm8[25,26,27,22],zero,ymm8[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm9[18],zero,zero,zero,zero,ymm9[21],zero,ymm9[19],zero,zero,zero,zero,ymm9[22],zero,ymm9[20] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm12[18,19,20,21],zero,ymm12[19],zero,ymm12[25,26,27,22],zero,ymm12[20],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm19 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm2, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u],zero,xmm2[7],zero,xmm2[5,u,u,u],zero,xmm2[8],zero,xmm2[6,u,u] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm26 +; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm22 -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 
= +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-SLOW-NEXT: vporq %xmm1, %xmm2, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u> +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm12, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm22 +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm25 +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm10, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u> -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm9 -; AVX512F-SLOW-NEXT: vporq %xmm8, %xmm9, %xmm24 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm9 -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm15, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm14, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm10, %ymm0 ; 
AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[0,1,14],zero,ymm0[12,13,0,1,14,15],zero,ymm0[3,12,13,2,3,16],zero,ymm0[30,31,28,29,16,17],zero,ymm0[31,18,19,28,29,18],zero -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm28 -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vporq %xmm5, %xmm4, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm7 -; AVX512F-SLOW-NEXT: vporq %xmm4, %xmm7, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,xmm7[7],zero,xmm7[5,u,u,u],zero,xmm7[8],zero,xmm7[6,u,u] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm6 -; AVX512F-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm27 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = 
zmm6[0,1,0,1],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm12 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandn %ymm4, %ymm11, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm13 +; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = 
xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,0,1],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm13 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandnq %ymm0, %ymm18, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = zero,ymm3[13],zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm16 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,0,1],zmm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero +; AVX512F-SLOW-NEXT: vpshufb %xmm4, 
%xmm3, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm29, %zmm23 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm24[0,1,0,1],zmm0[0,1,0,1] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm6 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm11[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm27 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm14, %ymm28, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm9, %zmm14 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm18[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm3[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm21 = ymm1[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm1 = zmm1[0,1,0,1],mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 -; AVX512F-SLOW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm19, %ymm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm6 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm6, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpand %ymm6, %ymm13, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm26[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm12 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm12 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm0 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm7[13],zero,zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm17, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, 
%zmm0 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm5 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm17 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm20[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm15[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm6[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm22 = ymm6[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-SLOW-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm6 = zmm6[0,1,0,1],mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,1,0,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm9 +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm12 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm12, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpand %ymm12, %ymm15, %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 ; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm13 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm13 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm30[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm22[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm6, %ymm10, 
%ymm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm21[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, %ymm6, %ymm4, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufhw $190, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm6, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm31[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm20[2,3,2,3] -; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vporq %zmm13, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm13 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm14 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm14 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm14 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm1 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm13 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm13, %zmm1 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm13 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm15 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm15 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm15 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm30[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[0,1,0,1,4,5,4,5] +; 
AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm12, %ymm2, %ymm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm22[2,3,2,3] +; AVX512F-SLOW-NEXT: vpternlogq $236, %ymm12, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufhw $190, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm10[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm29[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,3,2,3] +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm11 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm15 = mem[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm29[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm19 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm27[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm21 = mem[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[1,1,0,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm18 = mem[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm24[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm20 = mem[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm25[2,3,2,3] ; AVX512F-SLOW-NEXT: vpermq $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm22 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm6, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[2,3,2,3] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm31 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm31 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm12[0,0,1,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm23 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm15[0,0,1,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm5 -; AVX512F-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm6 -; AVX512F-SLOW-NEXT: vporq %ymm19, %ymm20, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm8[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm25 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm24 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm24 -; AVX512F-SLOW-NEXT: vporq %ymm21, %ymm22, %ymm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3 +; AVX512F-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm4 +; AVX512F-SLOW-NEXT: vporq %ymm20, %ymm21, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,2,3],zmm4[4,5,6,7] +; 
AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm28 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm28 +; AVX512F-SLOW-NEXT: vporq %ymm22, %ymm23, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm14[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm3[0,0,1,0,4,4,5,4] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm5[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm17[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm9[0,0,1,0,4,4,5,4] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rax) -; AVX512F-SLOW-NEXT: addq $1416, %rsp # imm = 0x588 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512F-SLOW-NEXT: addq $1384, %rsp # imm = 0x568 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i8_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX512F-ONLY-FAST-NEXT: subq $1368, %rsp # imm = 0x558 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm1 @@ -7851,450 +7821,424 @@ define void 
@store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} 
ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm2, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vporq %xmm1, %xmm2, %xmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vporq %xmm1, %xmm2, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vporq %xmm0, %xmm9, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[0,1,14],zero,ymm10[12,13,0,1,14,15],zero,ymm10[3,12,13,2,3,16],zero,ymm10[30,31,28,29,16,17],zero,ymm10[31,18,19,28,29,18],zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero,zero,ymm15[18] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, 
%xmm1 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm11, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm13, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm15[18],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,0,1],zmm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [2,2,3,3,2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,2,3,3,2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm23, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm14, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb 
{{.*#+}} ymm3 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm31, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm12, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm28, %ymm3, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm3[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm29, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm23, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm24, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm2[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm2[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm30[0,1,0,1],zmm5[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm6[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm0, %ymm25, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] ; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm9[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm7[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm14[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] +; AVX512F-ONLY-FAST-NEXT: vpermq 
{{.*#+}} ymm27 = ymm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm11[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm23[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; 
AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm30 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] ; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm5, %ymm7, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm12, %ymm25, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm15, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm9, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm5, %ymm27, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: 
vinserti64x4 $1, %ymm4, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm12, %ymm24, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm25, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm28, %ymm20, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm4, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm28, %ymm27, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm3, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm29[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpandq 
%ymm28, %ymm10, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm7, %zmm2, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm5, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm26 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,4,5,5,7,4,5] -; 
AVX512F-ONLY-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm6 = zmm3[0,1,0,1],mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,2,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm14, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm7[23],zero,ymm7[23,24,25,26],zero,ymm7[24],zero,ymm7[30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm19, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, 
%ymm2, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm11[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm14, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm12, %ymm15, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm30[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 +; 
AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm13, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm11, %ymm14, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1368, %rsp # imm = 0x558 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i8_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX512DQ-FAST-NEXT: subq $1368, %rsp # imm = 0x558 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; 
AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm1 @@ -8303,442 +8247,415 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 
(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 +; 
AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512DQ-FAST-NEXT: vporq %xmm1, %xmm2, %xmm29 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512DQ-FAST-NEXT: vporq %xmm1, %xmm2, %xmm24 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512DQ-FAST-NEXT: vporq %xmm0, %xmm9, %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm9 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[0,1,14],zero,ymm10[12,13,0,1,14,15],zero,ymm10[3,12,13,2,3,16],zero,ymm10[30,31,28,29,16,17],zero,ymm10[31,18,19,28,29,18],zero +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm13 -; AVX512DQ-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; 
AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero,zero,ymm15[18] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm7, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm9 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm10 +; AVX512DQ-FAST-NEXT: vpor %xmm11, %xmm10, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm15 +; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm3 -; 
AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm13, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm15[18],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm16 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; 
AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,0,1],zmm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm23, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm14, %ymm14 ; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 
-; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm26 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm31 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm31, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm21 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm28, %ymm3, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm3[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm29, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: 
vmovdqa 32(%rax), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm23, %ymm0 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero,zero +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm24, %zmm29 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm2[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm2[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm30[0,1,0,1],zmm5[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %xmm8 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,5,5,6] +; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 
= ymm6[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; AVX512DQ-FAST-NEXT: vpandnq %ymm0, %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm16 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm9 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm9[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm7[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm14 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm14[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm14[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm3[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm11[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm23[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm13 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, 
%zmm30 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm11 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] ; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm9 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm5, %ymm7, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm12, %ymm25, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm15, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm9, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpandq %ymm5, %ymm27, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: 
vpternlogq $226, %zmm4, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpandq %ymm12, %ymm24, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpandq %ymm31, %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpandq %ymm31, %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vpandq %ymm28, %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm4, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpandq %ymm28, %ymm27, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 ; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm3, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpandq %ymm31, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm29[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpandq %ymm28, %ymm10, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm7, %zmm2, %zmm7 ; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; 
AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm5, %zmm22 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm26 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[1,1,0,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,4,5,5,7,4,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm20 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm24 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 -; AVX512DQ-FAST-NEXT: vpermq $68, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm6 = zmm3[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[1,1,0,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,2,0,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[1,1,0,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm14, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm7[23],zero,ymm7[23,24,25,26],zero,ymm7[24],zero,ymm7[30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm19, %ymm19 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm21 
= mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm18 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm16 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm11[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm14, %ymm1 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = 
ymm7[2,3,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm26 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm26 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpor %ymm12, %ymm15, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm30[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm13, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm11, %ymm14, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQ-FAST-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512DQ-FAST-NEXT: addq $1368, %rsp # imm = 0x558 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride7_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm15, %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm14, %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm20, %ymm10, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm21, %ymm10, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm11, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm22, %ymm11, %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm1 @@ -8746,59 +8663,59 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm20 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm16, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, 
%ymm16, %ymm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm17, %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm18, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm18, %ymm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm19, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm27 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm27, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm22, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm24, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm21, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm2, %ymm21, %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r9), %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm20, %ymm21, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm22, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm20, %ymm24, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm24 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[20],zero,ymm22[18],zero,ymm22[20,21,20,21],zero,ymm22[19],zero,ymm22[19,20,21,22],zero +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm20, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm3, %ymm20, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r9), %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm21, %ymm20, %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm22, %ymm21, %ymm22 +; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm24, %ymm22, %ymm22 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm24 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm21[20],zero,ymm21[18],zero,ymm21[20,21,20,21],zero,ymm21[19],zero,ymm21[19,20,21,22],zero ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm21[20],zero,ymm21[18],zero,zero,zero,zero,ymm21[21],zero,ymm21[19],zero,zero,zero,zero,ymm21[22] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm20[20],zero,ymm20[18],zero,zero,zero,zero,ymm20[21],zero,ymm20[19],zero,zero,zero,zero,ymm20[22] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm24, %ymm27, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm22, %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm24 {%k1} @@ -8806,48 +8723,48 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm28 ; AVX512BW-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm28[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm22 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] ; AVX512BW-ONLY-SLOW-NEXT: movl $676341840, %r10d # imm = 0x28502850 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm20, %ymm27, %ymm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm22, %ymm27, %ymm9 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm28, %ymm25 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm27, %ymm26 ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm25, %ymm26, %ymm25 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm25, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm26, %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm29 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm29, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm14, %ymm23, %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm15, %ymm23, %ymm15 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm23 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm26[18],zero,ymm26[18,19,20,21],zero,ymm26[19],zero,ymm26[25,26,27,22],zero,ymm26[20],zero ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm25 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22],zero,ymm29[20] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm23, %ymm25, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm14, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm15, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm14, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm24, %zmm9 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm24 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] +; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm23 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] ; AVX512BW-ONLY-SLOW-NEXT: movl $338170920, %r10d # imm = 0x14281428 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm14 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512BW-ONLY-SLOW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm27, %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm27, %ymm15 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm25 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm28 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm25, %ymm28, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm25, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm25 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm26[24,25],zero,ymm26[23],zero,ymm26[21,22,23,26],zero,ymm26[24],zero,ymm26[28,29,26,27] ; 
AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm28 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero,zero,zero @@ -8864,44 +8781,44 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm28, %zmm14 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm28, %zmm15 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm22[23],zero,zmm22[23,24,25,26],zero,zmm22[24],zero,zmm22[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm22[59],zero,zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm29[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm21[25],zero,zmm21[23],zero,zero,zero,zero,zmm21[26],zero,zmm21[24],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero,zmm21[61] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm21[23],zero,zmm21[23,24,25,26],zero,zmm21[24],zero,zmm21[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm21 = zmm21[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vporq %zmm22, %zmm21, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,2,3],zmm29[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm20 = zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm20[25],zero,zmm20[23],zero,zero,zero,zero,zmm20[26],zero,zmm20[24],zero,zero,zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm20[59],zero,zero,zero,zero,zmm20[62],zero,zmm20[60],zero,zero,zero,zero,zmm20[63],zero,zmm20[61] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vporq %zmm21, %zmm20, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm22, %zmm14 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm21, %zmm2, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm20, %zmm15 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm21, %zmm3, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm22, %zmm14 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm22 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm30 = ymm22[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm20, %ymm19, %ymm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm19, %ymm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm20, %zmm15 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm20 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,0,1,1,4,4,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm22, %ymm19, %ymm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm19, %ymm22 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm22, %ymm23, %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm22, %ymm24, %ymm24 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r9), %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm30[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm27, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm26 = zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm26[18,19,20,21],zero,zmm26[19],zero,zmm26[25,26,27,22],zero,zmm26[20],zero,zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm26[55],zero,zmm26[53,54,55,58],zero,zmm26[56],zero,zmm26[60,61,58,59] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm26 = zmm26[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm20 = zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm20[18],zero,zero,zero,zero,zmm20[21],zero,zmm20[19],zero,zero,zero,zero,zmm20[22],zero,zmm20[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm20[57],zero,zmm20[55],zero,zero,zero,zero,zmm20[58],zero,zmm20[56],zero,zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm20 = zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm20[18,19,20,21],zero,zmm20[19],zero,zmm20[25,26,27,22],zero,zmm20[20],zero,zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm20[55],zero,zmm20[53,54,55,58],zero,zmm20[56],zero,zmm20[60,61,58,59] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vporq %zmm26, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm30, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm26 = zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm26[18],zero,zero,zero,zero,zmm26[21],zero,zmm26[19],zero,zero,zero,zero,zmm26[22],zero,zmm26[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm26[57],zero,zmm26[55],zero,zero,zero,zero,zmm26[58],zero,zmm26[56],zero,zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm26 = zmm26[2,3,2,3,6,7,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vporq %zmm20, %zmm26, %zmm20 ; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm26 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm27, %zmm20 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 @@ -8911,40 +8828,40 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm28 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm28[20],zero,zmm28[18],zero,zero,zero,zero,zmm28[21],zero,zmm28[19],zero,zero,zero,zero,zmm28[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm28[57],zero,zmm28[55],zero,zero,zero,zero,zmm28[58],zero,zmm28[56],zero,zero ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vporq %zmm29, %zmm28, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm15, %zmm28, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vporq %zmm29, %zmm28, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm21, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm14, %zmm29, %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm28, %zmm29 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm29, %zmm28 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm29 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm29, %zmm20 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm28, %zmm20 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm28 ; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm18 = ymm18[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm30 = ymm18[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm19, %ymm30 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm19, %ymm30 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm18, %xmm19, %xmm19 ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm30[2,3,2,3],zmm19[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm26[0],xmm27[0],xmm26[1],xmm27[1],xmm26[2],xmm27[2],xmm26[3],xmm27[3],xmm26[4],xmm27[4],xmm26[5],xmm27[5],xmm26[6],xmm27[6],xmm26[7],xmm27[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm26[0],xmm27[0],xmm26[1],xmm27[1],xmm26[2],xmm27[2],xmm26[3],xmm27[3],xmm26[4],xmm27[4],xmm26[5],xmm27[5],xmm26[6],xmm27[6],xmm26[7],xmm27[7] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm17, %ymm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 
= <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm17, %xmm24, %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm17, %xmm23, %xmm23 ; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm16, %ymm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm25[2,3,2,3],zmm24[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm25[2,3,2,3],zmm23[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm19, %zmm16 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3],xmm24[4],xmm22[4],xmm24[5],xmm22[5],xmm24[6],xmm22[6],xmm24[7],xmm22[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm19, %xmm19 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,1,0,1] @@ -8957,33 +8874,33 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm11, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm27, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm24 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm24, %xmm26, %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm11, %xmm25, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm27, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm23 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm23, %xmm26, %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm25, %xmm19 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm25, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm11, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm29, %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm19, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm28, %xmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm28, %xmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm27 ; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm25, %xmm27, %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = 
xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm28 = zmm11[0,1,0,1,4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm25, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,1,0,1,4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm28, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm22, %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm28[8],xmm29[8],xmm28[9],xmm29[9],xmm28[10],xmm29[10],xmm28[11],xmm29[11],xmm28[12],xmm29[12],xmm28[13],xmm29[13],xmm28[14],xmm29[14],xmm28[15],xmm29[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm25, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm13[0,1,0,1,4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm12, %zmm13 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm22, %xmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm21, %zmm2, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm21, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm21 ; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm25, %xmm21, %xmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm24[8],xmm22[8],xmm24[9],xmm22[9],xmm24[10],xmm22[10],xmm24[11],xmm22[11],xmm24[12],xmm22[12],xmm24[13],xmm22[13],xmm24[14],xmm22[14],xmm24[15],xmm22[15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm21, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] @@ -8992,14 +8909,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm27, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm4, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm24, %xmm5, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm19, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm4, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm23, %xmm5, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm11, %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb 
%xmm17, %xmm5, %xmm5 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm5 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm7, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] @@ -9010,14 +8927,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm15, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm14, %zmm1, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512BW-ONLY-SLOW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 @@ -9028,36 +8945,36 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride7_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: subq $200, %rsp +; AVX512BW-FAST-NEXT: subq $168, %rsp ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FAST-NEXT: vmovdqa (%rax), %ymm4 ; AVX512BW-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vmovdqa 32(%rax), %ymm13 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm14 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm15, %ymm7 +; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm14, %ymm7 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] ; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm1, %ymm8 @@ -9068,23 +8985,24 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm25[8],xmm10[9],xmm25[9],xmm10[10],xmm25[10],xmm10[11],xmm25[11],xmm10[12],xmm25[12],xmm10[13],xmm25[13],xmm10[14],xmm25[14],xmm10[15],xmm25[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm22 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm21 ; AVX512BW-FAST-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm22 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm21 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm1, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512BW-FAST-NEXT: vpshufb %ymm22, %ymm1, %ymm6 ; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm1, %ymm11 ; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm15 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm16 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = 
xmm16[8],xmm14[8],xmm16[9],xmm14[9],xmm16[10],xmm14[10],xmm16[11],xmm14[11],xmm16[12],xmm14[12],xmm16[13],xmm14[13],xmm16[14],xmm14[14],xmm16[15],xmm14[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm26 @@ -9106,92 +9024,91 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm22, %ymm22 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm21, %ymm21 ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm22 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %ymm27 ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm27, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm1, %ymm20 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %ymm31 +; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm31, %ymm20 ; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm31[20],zero,ymm31[18],zero,ymm31[20,21,20,21],zero,ymm31[19],zero,ymm31[19,20,21,22],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm26 = ymm27[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm27[20],zero,ymm27[18],zero,zero,zero,zero,ymm27[21],zero,ymm27[19],zero,zero,zero,zero,ymm27[22] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm26, %ymm20 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm27[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm27[20],zero,ymm27[18],zero,zero,zero,zero,ymm27[21],zero,ymm27[19],zero,zero,zero,zero,ymm27[22] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm21, %ymm20 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm26 ; AVX512BW-FAST-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm26 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm22 -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm22, %ymm17 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm26 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm22, %ymm0, %ymm17 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %ymm30 ; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm30, %ymm20 ; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 -; AVX512BW-FAST-NEXT: 
vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22],zero,ymm30[20] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm21, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm31, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm0, %ymm20 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22],zero,ymm30[20] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm22, %ymm20 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm1, %ymm17 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %ymm21 +; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm21, %ymm20 ; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm31[18,19,20,21],zero,ymm31[19],zero,ymm31[21,20,21,22],zero,ymm31[20],zero,ymm31[22,23] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,20,21],zero,ymm1[19],zero,ymm1[21,20,21,22],zero,ymm1[20],zero,ymm1[22,23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm21[21],zero,ymm21[19],zero,zero,zero,zero,ymm21[22],zero,ymm21[20],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm23, %ymm20 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm23 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm20 ; AVX512BW-FAST-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 ; AVX512BW-FAST-NEXT: kmovq %r10, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm17 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm17 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm22 ; AVX512BW-FAST-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm17 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm23[4,5,6,7] 
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm22[23],zero,zmm22[21,22,23,26],zero,zmm22[24],zero,zmm22[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm30[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm23[4,5,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm0[23],zero,zmm0[21,22,23,26],zero,zmm0[24],zero,zmm0[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero,zmm0[61],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm30[0,1,2,3],zmm3[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm28 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm28[25],zero,zmm28[23],zero,zero,zero,zero,zmm28[26],zero,zmm28[24],zero,zero,zero,zero,zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm28[62],zero,zmm28[60],zero,zero,zero,zero,zmm28[63],zero,zmm28[61],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm22, %zmm28, %zmm29 +; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm28, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zero,zero,zmm0[27],zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm0[60],zero,zmm0[62,63,62,63],zero,zmm0[61],zero,zmm0[63,60,61] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm31[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm22[23],zero,zero,zero,zero,zmm22[26],zero,zmm22[24],zero,zero,zero,zero,zmm22[27],zero,zmm22[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm22, %zmm22 +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm21[23],zero,zero,zero,zero,zmm21[26],zero,zmm21[24],zero,zero,zero,zero,zmm21[27],zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm21[60],zero,zmm21[62,63,62,63],zero,zmm21[61],zero,zmm21[63,60,61] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm21 = zmm21[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zero,zero,zmm1[27],zero,zmm1[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61],zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm21, %zmm1, %zmm21 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm30 ; AVX512BW-FAST-NEXT: movabsq $1742999440035548184, 
%rax # imm = 0x183060C183060C18 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm29, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm31[0,1,2,3],zmm22[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm27[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm27[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm21 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] ; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm13, %zmm0 ; AVX512BW-FAST-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k3} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm21 {%k3} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm11[28],zero,ymm11[30,31,30,31],zero,ymm11[29],zero,ymm11[31,28,29] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero,zero,zero @@ -9200,11 +9117,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm28[0],xmm30[0],xmm28[1],xmm30[1],xmm28[2],xmm30[2],xmm28[3],xmm30[3],xmm28[4],xmm30[4],xmm28[5],xmm30[5],xmm28[6],xmm30[6],xmm28[7],xmm30[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm27 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm0, %ymm27, %ymm27 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm31 @@ -9214,38 +9130,38 
@@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm15[27],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm27 {%k2} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm24[27],zero,zero,zero,zero,ymm24[30],zero,ymm24[28],zero,zero,zero,zero,ymm24[31],zero,ymm24[29] -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm24[27],zero,zero,zero,zero,ymm24[30],zero,ymm24[28],zero,zero,zero,zero,ymm24[31],zero,ymm24[29] +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm24, %ymm8 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm2, %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm4, %zmm4 ; AVX512BW-FAST-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm0 {%k2} ; AVX512BW-FAST-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm27 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm4 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm31, %xmm24 -; AVX512BW-FAST-NEXT: vporq %xmm2, %xmm24, %xmm2 +; AVX512BW-FAST-NEXT: vporq %xmm4, %xmm24, %xmm4 ; AVX512BW-FAST-NEXT: 
vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm31[8],xmm1[9],xmm31[9],xmm1[10],xmm31[10],xmm1[11],xmm31[11],xmm1[12],xmm31[12],xmm1[13],xmm31[13],xmm1[14],xmm31[14],xmm1[15],xmm31[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm30, %xmm24 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm30, %xmm24 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = ; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm28, %xmm29 ; AVX512BW-FAST-NEXT: vporq %xmm24, %xmm29, %xmm24 @@ -9256,118 +9172,117 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm24 = zmm24[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm24 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm28 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm28 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm3, %xmm30 +; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm2, %xmm30 ; AVX512BW-FAST-NEXT: vporq %xmm28, %xmm30, %xmm28 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm28, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm13, %zmm4 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm28, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] +; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm13, %zmm3 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm3 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm2 {%k2} ; AVX512BW-FAST-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm24 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm16, %xmm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3],xmm14[4],xmm16[4],xmm14[5],xmm16[5],xmm14[6],xmm16[6],xmm14[7],xmm16[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 -; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm19, %xmm2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm24 {%k2} +; 
AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm16, %xmm2 +; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX512BW-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm19, %xmm2 ; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm18, %xmm3 ; AVX512BW-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm18[0],xmm19[0],xmm18[1],xmm19[1],xmm18[2],xmm19[2],xmm18[3],xmm19[3],xmm18[4],xmm19[4],xmm18[5],xmm19[5],xmm18[6],xmm19[6],xmm18[7],xmm19[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm2[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm1 -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm10, %xmm2 -; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm25[0],xmm10[1],xmm25[1],xmm10[2],xmm25[2],xmm10[3],xmm25[3],xmm10[4],xmm25[4],xmm10[5],xmm25[5],xmm10[6],xmm25[6],xmm10[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm10, %xmm1 +; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm25[0],xmm10[1],xmm25[1],xmm10[2],xmm25[2],xmm10[3],xmm25[3],xmm10[4],xmm25[4],xmm10[5],xmm25[5],xmm10[6],xmm25[6],xmm10[7],xmm25[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm3, %zmm3 ; AVX512BW-FAST-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm0 {%k2} ; AVX512BW-FAST-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm0[19],zero,zmm0[21,20,21,22],zero,zmm0[20],zero,zmm0[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm0[55],zero,zero,zero,zero,zmm0[58],zero,zmm0[56],zero,zero,zero,zero,zmm0[59],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[21],zero,zmm3[19],zero,zero,zero,zero,zmm3[22],zero,zmm3[20],zero,zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero,zmm3[57] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm3[18,19,20,21],zero,zmm3[19],zero,zmm3[25,26,27,22],zero,zmm3[20],zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm3[55],zero,zmm3[53,54,55,58],zero,zmm3[56],zero,zmm3[60,61,58,59] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm20, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm22, %zmm4 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm0[18],zero,zmm0[20,21,20,21],zero,zmm0[19],zero,zmm0[19,20,21,22],zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm0[55],zero,zmm0[55,56,57,58],zero,zmm0[56],zero,zmm0[62,63] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm4, %zmm1 +; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm4, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm4, %zmm2 +; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm4, %zmm1 ; AVX512BW-FAST-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-FAST-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512BW-FAST-NEXT: addq $200, %rsp +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512BW-FAST-NEXT: addq $168, %rsp ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i8_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rax), %ymm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rax), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rax), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm0 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm0 ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm15, %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermw %ymm14, %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm20, %ymm10, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm21, %ymm10, %ymm0 ; 
AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm11, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm22, %ymm11, %ymm1 ; AVX512DQBW-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %xmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm1 @@ -9375,59 +9290,59 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm21 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm20 ; AVX512DQBW-SLOW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 ; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm20 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm14, %ymm16, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm16, %ymm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm17, %ymm5 -; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm22 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm24 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm18, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm18, %ymm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm19, %ymm7 -; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = 
xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm27 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm27, %zmm2 ; AVX512DQBW-SLOW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm22, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm24, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm21, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm2, %ymm21, %ymm21 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm21 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm20, %ymm21, %ymm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm22 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm22, %ymm24 -; AVX512DQBW-SLOW-NEXT: vporq %ymm20, %ymm24, %ymm20 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm24 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[20],zero,ymm22[18],zero,ymm22[20,21,20,21],zero,ymm22[19],zero,ymm22[19,20,21,22],zero +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm20, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm3, %ymm20, %ymm20 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm20 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm21, %ymm20, %ymm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm21 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm22, %ymm21, %ymm22 +; AVX512DQBW-SLOW-NEXT: vporq %ymm24, %ymm22, %ymm22 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm24 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm21[20],zero,ymm21[18],zero,ymm21[20,21,20,21],zero,ymm21[19],zero,ymm21[19,20,21,22],zero ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm21[20],zero,ymm21[18],zero,zero,zero,zero,ymm21[21],zero,ymm21[19],zero,zero,zero,zero,ymm21[22] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm20[20],zero,ymm20[18],zero,zero,zero,zero,ymm20[21],zero,ymm20[19],zero,zero,zero,zero,ymm20[22] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vporq %ymm24, %ymm27, %ymm24 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm24 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm22, %zmm24 ; AVX512DQBW-SLOW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 ; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm24 {%k1} @@ -9435,48 +9350,48 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm28 ; AVX512DQBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = 
ymm28[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX512DQBW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm22 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] ; AVX512DQBW-SLOW-NEXT: movl $676341840, %r10d # imm = 0x28502850 ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm20, %ymm27, %ymm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm22, %ymm27, %ymm9 {%k1} ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm28, %ymm25 ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm27, %ymm26 ; AVX512DQBW-SLOW-NEXT: vporq %ymm25, %ymm26, %ymm25 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm25, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm26 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm14, %ymm26, %ymm14 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm15 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm29 ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm29, %ymm23 -; AVX512DQBW-SLOW-NEXT: vporq %ymm14, %ymm23, %ymm14 +; AVX512DQBW-SLOW-NEXT: vporq %ymm15, %ymm23, %ymm15 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm23 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm26[18],zero,ymm26[18,19,20,21],zero,ymm26[19],zero,ymm26[25,26,27,22],zero,ymm26[20],zero ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm25 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22],zero,ymm29[20] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vporq %ymm23, %ymm25, %ymm23 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm14, %zmm14 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm15, %zmm15 ; AVX512DQBW-SLOW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 ; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k2} ; AVX512DQBW-SLOW-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 ; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k3 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm24, %zmm9 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm24 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] +; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm23 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] ; AVX512DQBW-SLOW-NEXT: movl $338170920, %r10d # imm = 0x14281428 ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k4 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm14 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512DQBW-SLOW-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm27, %ymm25 +; 
AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm27, %ymm15 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm25 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm28 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vporq %ymm25, %ymm28, %ymm25 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm25, %zmm14 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm15 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm25 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm26[24,25],zero,ymm26[23],zero,ymm26[21,22,23,26],zero,ymm26[24],zero,ymm26[28,29,26,27] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm28 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero,zero,zero @@ -9493,44 +9408,44 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm29 ; AVX512DQBW-SLOW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 ; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm28, %zmm14 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm28, %zmm15 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm22[23],zero,zmm22[23,24,25,26],zero,zmm22[24],zero,zmm22[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm22[59],zero,zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm21[25],zero,zmm21[23],zero,zero,zero,zero,zmm21[26],zero,zmm21[24],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero,zmm21[61] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm28[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm21[23],zero,zmm21[23,24,25,26],zero,zmm21[24],zero,zmm21[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm21 = zmm21[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vporq %zmm22, %zmm21, %zmm22 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,2,3],zmm29[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm20 = 
zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm20[25],zero,zmm20[23],zero,zero,zero,zero,zmm20[26],zero,zmm20[24],zero,zero,zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm20[59],zero,zero,zero,zero,zmm20[62],zero,zmm20[60],zero,zero,zero,zero,zmm20[63],zero,zmm20[61] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7] +; AVX512DQBW-SLOW-NEXT: vporq %zmm21, %zmm20, %zmm20 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 ; AVX512DQBW-SLOW-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm22, %zmm14 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm21, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm20, %zmm15 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm21, %zmm3, %zmm20 ; AVX512DQBW-SLOW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm22, %zmm14 {%k5} -; AVX512DQBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm22 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm30 = ymm22[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm20, %ymm19, %ymm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm19, %ymm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm20, %zmm15 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm20 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,0,1,1,4,4,5,5] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm22, %ymm19, %ymm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm30 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm19, %ymm22 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm23 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm22, %ymm23, %ymm23 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm24 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vporq %ymm22, %ymm24, %ymm24 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm22 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm30[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm27, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm23 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm26 = zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm26[18,19,20,21],zero,zmm26[19],zero,zmm26[25,26,27,22],zero,zmm26[20],zero,zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm26[55],zero,zmm26[53,54,55,58],zero,zmm26[56],zero,zmm26[60,61,58,59] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm26 = zmm26[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm20 = zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm20[18],zero,zero,zero,zero,zmm20[21],zero,zmm20[19],zero,zero,zero,zero,zmm20[22],zero,zmm20[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm20[57],zero,zmm20[55],zero,zero,zero,zero,zmm20[58],zero,zmm20[56],zero,zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] +; 
AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm24 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm20 = zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm20[18,19,20,21],zero,zmm20[19],zero,zmm20[25,26,27,22],zero,zmm20[20],zero,zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm20[55],zero,zmm20[53,54,55,58],zero,zmm20[56],zero,zmm20[60,61,58,59] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vporq %zmm26, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm30, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm26 = zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm26[18],zero,zero,zero,zero,zmm26[21],zero,zmm26[19],zero,zero,zero,zero,zmm26[22],zero,zmm26[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm26[57],zero,zmm26[55],zero,zero,zero,zero,zmm26[58],zero,zmm26[56],zero,zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm26 = zmm26[2,3,2,3,6,7,6,7] +; AVX512DQBW-SLOW-NEXT: vporq %zmm20, %zmm26, %zmm20 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm26 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm27, %zmm20 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 @@ -9540,40 +9455,40 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm28 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm28 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm28[20],zero,zmm28[18],zero,zero,zero,zero,zmm28[21],zero,zmm28[19],zero,zero,zero,zero,zmm28[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm28[57],zero,zmm28[55],zero,zero,zero,zero,zmm28[58],zero,zmm28[56],zero,zero ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vporq %zmm29, %zmm28, %zmm29 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm15, %zmm28, %zmm28 +; AVX512DQBW-SLOW-NEXT: vporq %zmm29, %zmm28, %zmm28 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm21, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512DQBW-SLOW-NEXT: vpermw %zmm14, %zmm29, %zmm29 ; AVX512DQBW-SLOW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm28, %zmm29 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm29, %zmm28 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm29 ; AVX512DQBW-SLOW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm29, %zmm20 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm28, %zmm20 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm28 ; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm18 = ymm18[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm30 = ymm18[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm19, %ymm30 {%k4} -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = 
xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm19, %ymm30 {%k4} +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm18, %xmm19, %xmm19 ; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm30[2,3,2,3],zmm19[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm26[0],xmm27[0],xmm26[1],xmm27[1],xmm26[2],xmm27[2],xmm26[3],xmm27[3],xmm26[4],xmm27[4],xmm26[5],xmm27[5],xmm26[6],xmm27[6],xmm26[7],xmm27[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm26[0],xmm27[0],xmm26[1],xmm27[1],xmm26[2],xmm27[2],xmm26[3],xmm27[3],xmm26[4],xmm27[4],xmm26[5],xmm27[5],xmm26[6],xmm27[6],xmm26[7],xmm27[7] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm17, %ymm25 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm17, %xmm24, %xmm24 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm17, %xmm23, %xmm23 ; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm16, %ymm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm25[2,3,2,3],zmm24[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm25[2,3,2,3],zmm23[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm16 {%k2} ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm11 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3],xmm24[4],xmm22[4],xmm24[5],xmm22[5],xmm24[6],xmm22[6],xmm24[7],xmm22[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm19, %xmm19 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,1,0,1] @@ -9586,33 +9501,33 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm27, %xmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm24 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm24, %xmm26, %xmm25 -; AVX512DQBW-SLOW-NEXT: vporq %xmm11, %xmm25, %xmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm27, %xmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm23 = +; 
AVX512DQBW-SLOW-NEXT: vpshufb %xmm23, %xmm26, %xmm25 +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm25, %xmm19 ; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm25, %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm11, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm29, %xmm25 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm19, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm28, %xmm25 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm28, %xmm27 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm27 ; AVX512DQBW-SLOW-NEXT: vporq %xmm25, %xmm27, %xmm25 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm28 = zmm11[0,1,0,1,4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm11 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm25, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,1,0,1,4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm28, %zmm11 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm22, %xmm25 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm28[8],xmm29[8],xmm28[9],xmm29[9],xmm28[10],xmm29[10],xmm28[11],xmm29[11],xmm28[12],xmm29[12],xmm28[13],xmm29[13],xmm28[14],xmm29[14],xmm28[15],xmm29[15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm13 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm25, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm13[0,1,0,1,4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm13 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm22, %xmm25 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm21, %zmm2, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm21 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm21, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm21 ; AVX512DQBW-SLOW-NEXT: vporq %xmm25, %xmm21, %xmm21 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm24[8],xmm22[8],xmm24[9],xmm22[9],xmm24[10],xmm22[10],xmm24[11],xmm22[11],xmm24[12],xmm22[12],xmm24[13],xmm22[13],xmm24[14],xmm22[14],xmm24[15],xmm22[15] ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm4 ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm21, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] @@ -9621,14 +9536,14 @@ define void @store_i8_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm27, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm24, %xmm5, %xmm19 -; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm19, %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm23, %xmm5, %xmm11 +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm11, %xmm4 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm17, %xmm5, %xmm5 ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm5 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm5 ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm7, %xmm6 ; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] @@ -9639,14 +9554,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm15, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm14, %zmm1, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512DQBW-SLOW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 @@ -9657,11 +9572,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 86843b6c204f42..9854e5d0c48108 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -259,31 +259,31 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,1] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 @@ -294,15 +294,15 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] -; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: pandn %xmm7, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 ; SSE-NEXT: por %xmm9, %xmm7 ; SSE-NEXT: 
pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] ; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 @@ -313,30 +313,30 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,3,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm10 ; SSE-NEXT: pandn %xmm9, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm10, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movdqa %xmm0, 48(%rax) ; SSE-NEXT: movdqa %xmm9, 32(%rax) @@ -681,46 +681,46 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rcx), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r8), %xmm3 ; SSE-NEXT: movdqa (%r9), %xmm11 ; SSE-NEXT: movdqa (%r10), %xmm4 -; SSE-NEXT: movdqa (%rax), %xmm13 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa (%rax), %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: por %xmm9, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] ; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 @@ -732,14 +732,14 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] ; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm8 ; SSE-NEXT: por %xmm9, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; 
SSE-NEXT: pandn %xmm8, %xmm9 @@ -749,35 +749,35 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm9, %xmm8 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] ; SSE-NEXT: movdqa %xmm2, %xmm6 @@ -792,16 +792,16 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] 
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] ; SSE-NEXT: movdqa %xmm2, %xmm6 @@ -831,12 +831,12 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm11, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: por %xmm13, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] @@ -861,9 +861,9 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, 96(%rax) ; SSE-NEXT: movdqa %xmm11, 112(%rax) ; SSE-NEXT: movdqa %xmm5, 80(%rax) -; SSE-NEXT: movdqa %xmm9, 64(%rax) -; SSE-NEXT: movdqa %xmm12, 32(%rax) -; SSE-NEXT: movdqa %xmm8, 48(%rax) +; SSE-NEXT: movdqa %xmm8, 64(%rax) +; SSE-NEXT: movdqa %xmm13, 32(%rax) +; SSE-NEXT: movdqa %xmm9, 48(%rax) ; SSE-NEXT: movdqa %xmm7, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) @@ -1140,10 +1140,10 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: subq $232, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rcx), %xmm8 @@ -1175,9 +1175,9 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: por %xmm10, %xmm6 @@ -1197,7 +1197,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] ; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: por %xmm10, %xmm7 @@ -1563,9 +1563,9 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: subq $72, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1580,181 +1580,180 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm14 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = 
xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm5 ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3],ymm5[4],ymm13[5],ymm5[6],ymm13[7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm5 +; 
AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = 
ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = 
xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm15, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm5[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm7[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm13 +; 
AVX1-ONLY-NEXT: vorps %ymm8, %ymm13, %ymm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 @@ -1812,7 +1811,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -1837,9 +1836,9 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] @@ -1851,8 +1850,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,2,2,3,4,5,6,7] @@ -1864,69 +1863,69 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = 
xmm12[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm15, %ymm15 ; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3],ymm15[4,5,6],ymm1[7],ymm15[8,9,10],ymm1[11],ymm15[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 ; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7,8],ymm13[9],ymm1[10,11,12],ymm13[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4],ymm1[5],ymm13[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7],ymm5[8,9,10],ymm2[11],ymm5[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = 
xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7,8],ymm10[9],ymm7[10,11,12],ymm10[13],ymm7[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm2[1],ymm7[2],ymm2[3],ymm7[4],ymm2[5],ymm7[6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] @@ -1942,19 +1941,19 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,6,5,7,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] @@ -1975,8 +1974,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] @@ -1995,8 +1994,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] @@ -2058,124 +2057,124 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: subq $72, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm5 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm6 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3,4],ymm4[5],ymm11[6,7,8],ymm4[9],ymm11[10,11,12],ymm4[13],ymm11[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2],ymm13[3],ymm4[4],ymm13[5],ymm4[6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6],ymm3[7],ymm8[8,9,10],ymm3[11],ymm8[12,13,14],ymm3[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2],ymm3[3],ymm8[4],ymm3[5],ymm8[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm6[8],xmm15[9],xmm6[9],xmm15[10],xmm6[10],xmm15[11],xmm6[11],xmm15[12],xmm6[12],xmm15[13],xmm6[13],xmm15[14],xmm6[14],xmm15[15],xmm6[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm7 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm6 ; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-FAST-NEXT: vinserti128 
$1, %xmm1, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] ; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm4 ; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3],ymm12[4,5,6],ymm7[7],ymm12[8,9,10],ymm7[11],ymm12[12,13,14],ymm7[15] ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7,8],ymm4[9],ymm1[10,11,12],ymm4[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4],ymm7[5],ymm1[6],ymm7[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm4 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -2185,7 +2184,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm11, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm5 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] @@ -2193,8 +2192,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm15, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) @@ -2211,124 +2210,124 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3,4],ymm4[5],ymm11[6,7,8],ymm4[9],ymm11[10,11,12],ymm4[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2],ymm13[3],ymm4[4],ymm13[5],ymm4[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6],ymm3[7],ymm8[8,9,10],ymm3[11],ymm8[12,13,14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2],ymm3[3],ymm8[4],ymm3[5],ymm8[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] -; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm6[8],xmm15[9],xmm6[9],xmm15[10],xmm6[10],xmm15[11],xmm6[11],xmm15[12],xmm6[12],xmm15[13],xmm6[13],xmm15[14],xmm6[14],xmm15[15],xmm6[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm12[0,1,2],ymm7[3],ymm12[4,5,6],ymm7[7],ymm12[8,9,10],ymm7[11],ymm12[12,13,14],ymm7[15] ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7,8],ymm4[9],ymm1[10,11,12],ymm4[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4],ymm7[5],ymm1[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -2338,7 +2337,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] @@ -2346,8 +2345,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) @@ -2370,130 +2369,130 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, 
%ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3],ymm7[4,5,6],ymm2[7],ymm7[8,9,10],ymm2[11],ymm7[12,13,14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7],ymm10[8,9,10],ymm2[11],ymm10[12,13,14],ymm2[15] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm0[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; 
AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] @@ -2569,12 +2568,12 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm19 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -2586,144 +2585,149 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r10), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), 
%xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6],ymm3[7],ymm8[8,9,10],ymm3[11],ymm8[12,13,14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm29 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = 
<4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm6, %ymm17, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7,8],ymm0[9],ymm6[10,11,12],ymm0[13],ymm6[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6],ymm3[7],ymm8[8,9,10],ymm3[11],ymm8[12,13,14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4],ymm14[5],ymm9[6,7,8],ymm14[9],ymm9[10,11,12],ymm14[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7,8],ymm14[9],ymm8[10,11,12],ymm14[13],ymm8[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7],ymm2[8,9,10],ymm5[11],ymm2[12,13,14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] @@ -2731,14 +2735,14 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA ; 
AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm20, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm21, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -2969,117 +2973,117 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm10 -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7,8],ymm1[9],ymm6[10,11,12],ymm1[13],ymm6[14,15] +; 
AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm7 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm13 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3],ymm6[4,5,6],ymm15[7],ymm6[8,9,10],ymm15[11],ymm6[12,13,14],ymm15[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] ; AVX512DQ-FAST-NEXT: movw $-21846, %r11w # imm = 0xAAAA ; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm4 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm0, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, 
%ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5,6],ymm1[7],ymm14[8,9,10],ymm1[11],ymm14[12,13,14],ymm1[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6],ymm0[7],ymm14[8,9,10],ymm0[11],ymm14[12,13,14],ymm0[15] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7,8],ymm11[9],ymm13[10,11,12],ymm11[13],ymm13[14,15] +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm29 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4],ymm13[5],ymm6[6,7,8],ymm13[9],ymm6[10,11,12],ymm13[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm11 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm8[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} 
ymm7 = ymm13[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm7, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7,8],ymm7[9],ymm6[10,11,12],ymm7[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm15 -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm4 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7],ymm1[8,9,10],ymm2[11],ymm1[12,13,14],ymm2[15] ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 @@ -3088,26 +3092,26 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm4 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm4 ; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -3123,110 +3127,110 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r11), %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r11), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm25[8],xmm1[9],xmm25[9],xmm1[10],xmm25[10],xmm1[11],xmm25[11],xmm1[12],xmm25[12],xmm1[13],xmm25[13],xmm1[14],xmm25[14],xmm1[15],xmm25[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r11), %xmm23 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r11), %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r10), %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 ; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm12 ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r8), %xmm13 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512BW-SLOW-NEXT: vpermt2w %zmm5, %zmm23, %zmm19 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] +; AVX512BW-SLOW-NEXT: vpermt2w %zmm5, %zmm17, %zmm14 ; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa 16(%rsi), %xmm15 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm6 ; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm16 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm10 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm17 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm7, %ymm4 +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm18 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm18 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm19 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero ; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm22 = xmm20[0,2,2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm22 = xmm22[0],zero,xmm22[1],zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm21, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7,8],ymm4[9],ymm10[10,11,12],ymm4[13],ymm10[14,15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm21, %ymm0 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> -; AVX512BW-SLOW-NEXT: vpermt2w %ymm20, %ymm22, %ymm7 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 +; AVX512BW-SLOW-NEXT: vpermt2w %ymm20, %ymm22, %ymm8 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 ; AVX512BW-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm19, %zmm7 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2w %zmm4, %zmm23, %zmm20 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm4[1,1,1,1] +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm14, %zmm8 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm0, %zmm17, %zmm14 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm19[0,0,2,1,4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm4 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm24 = xmm19[0,2,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm24 = xmm20[0,2,2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm24 = xmm24[0],zero,xmm24[1],zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX512BW-SLOW-NEXT: vpermt2w %ymm19, %ymm22, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7,8],ymm1[9],ymm4[10,11,12],ymm1[13],ymm4[14,15] +; AVX512BW-SLOW-NEXT: vpermt2w %ymm20, %ymm22, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm14, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpermt2w %zmm0, %zmm23, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpermt2w %zmm1, %zmm17, %zmm4 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] ; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero ; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] -; AVX512BW-SLOW-NEXT: vpermt2w %ymm11, %ymm22, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm25[0],xmm1[1],xmm25[1],xmm1[2],xmm25[2],xmm1[3],xmm25[3],xmm1[4],xmm25[4],xmm1[5],xmm25[5],xmm1[6],xmm25[6],xmm1[7],xmm25[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2w %ymm11, %ymm22, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm25[0],xmm23[0],xmm25[1],xmm23[1],xmm25[2],xmm23[2],xmm25[3],xmm23[3],xmm25[4],xmm23[4],xmm25[5],xmm23[5],xmm25[6],xmm23[6],xmm25[7],xmm23[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpermt2w %zmm1, %zmm23, %zmm2 -; AVX512BW-SLOW-NEXT: vpunpcklbw 
{{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpermt2w %zmm4, %zmm17, %zmm2 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] -; AVX512BW-SLOW-NEXT: vpermt2w %ymm4, %ymm22, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX512BW-SLOW-NEXT: vpermt2w %ymm5, %ymm22, %ymm3 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -3236,73 +3240,73 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FAST-NEXT: vmovdqa (%r11), %xmm0 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r11), %xmm7 -; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r9), %xmm9 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512BW-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512BW-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512BW-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm17 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm19 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r11), %xmm1 +; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm5 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] +; AVX512BW-FAST-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm18 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = 
xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm19 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> -; AVX512BW-FAST-NEXT: vpermt2w %zmm15, %zmm20, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512BW-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm4, %zmm15 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3],xmm19[4],xmm17[4],xmm19[5],xmm17[5],xmm19[6],xmm17[6],xmm19[7],xmm17[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm20, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm21 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2w %zmm7, %zmm20, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = 
xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2w %zmm9, %zmm10, %zmm14 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2w %zmm1, %zmm10, %zmm3 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2w %zmm1, %zmm20, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2w %zmm0, %zmm4, %zmm1 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] +; AVX512BW-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 ; AVX512BW-FAST-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -3331,11 +3335,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: subq $312, %rsp # imm = 0x138 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa (%rcx), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa (%rdx), %xmm8 +; SSE-NEXT: movdqa (%rcx), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r8), %xmm6 ; SSE-NEXT: movdqa (%r9), %xmm9 ; SSE-NEXT: movdqa (%r10), %xmm7 @@ -3343,358 +3348,364 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm12 ; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm14 ; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,1,1,3] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa 
%xmm1, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm5, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm14, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm5, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm14, %xmm2 ; 
SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: por %xmm5, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa 16(%r8), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm8 ; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm8 ; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, 
%xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,2,2,3] +; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa 16(%r10), %xmm5 +; SSE-NEXT: movdqa 16(%r10), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 ; SSE-NEXT: movdqa 16(%rax), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = 
xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 ; SSE-NEXT: movdqa 16(%r9), %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm13, %xmm11 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,2,1] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm4 ; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 ; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: por %xmm15, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: pandn %xmm7, %xmm14 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm7 ; SSE-NEXT: por %xmm14, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: 
pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: pandn %xmm7, %xmm14 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm7 ; SSE-NEXT: por %xmm14, %xmm7 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm7, %xmm10 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: por %xmm10, %xmm7 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: por %xmm11, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # 
xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; 
SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r10), %xmm0 @@ -3702,17 +3713,17 @@ define void 
@store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa 32(%r8), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa 32(%r8), %xmm2 ; SSE-NEXT: movdqa 32(%r9), %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: movdqa 32(%rdx), %xmm4 ; SSE-NEXT: movdqa 32(%rcx), %xmm8 @@ -3724,9 +3735,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm5, %xmm15 ; SSE-NEXT: movdqa 32(%rdi), %xmm5 ; SSE-NEXT: movdqa 32(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: por %xmm15, %xmm6 @@ -3736,17 +3747,17 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm6, %xmm14 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm14, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 @@ -3756,17 +3767,17 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm6, %xmm14 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = 
xmm11[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm14, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 @@ -3776,34 +3787,34 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: pandn %xmm6, %xmm10 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm10, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: por %xmm11, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm10, %xmm6 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,0,2,1,4,5,6,7] @@ -3816,16 +3827,16 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} 
xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] @@ -3836,16 +3847,16 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] @@ -3856,29 +3867,30 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] -; SSE-NEXT: 
pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 48(%r10), %xmm9 ; SSE-NEXT: movdqa 48(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3886,8 +3898,8 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa 48(%r8), %xmm8 ; SSE-NEXT: movdqa 48(%r9), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3895,180 +3907,182 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: movdqa 48(%rdx), %xmm7 -; SSE-NEXT: movdqa 48(%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa 48(%rcx), %xmm14 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 ; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 48(%rsi), %xmm11 +; SSE-NEXT: movdqa 48(%rsi), %xmm12 ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: 
pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm13, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm10, %xmm13 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm15, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] +; 
SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm12[8],xmm5[9],xmm12[9],xmm5[10],xmm12[10],xmm5[11],xmm12[11],xmm5[12],xmm12[12],xmm5[13],xmm12[13],xmm5[14],xmm12[14],xmm5[15],xmm12[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,1,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm4, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm2, 
%xmm13 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm1, 496(%rax) -; SSE-NEXT: movdqa %xmm3, 480(%rax) +; SSE-NEXT: movdqa %xmm4, 480(%rax) ; SSE-NEXT: movdqa %xmm0, 464(%rax) -; SSE-NEXT: movdqa %xmm4, 448(%rax) +; SSE-NEXT: movdqa %xmm2, 448(%rax) ; SSE-NEXT: movdqa %xmm6, 432(%rax) ; SSE-NEXT: movdqa %xmm10, 416(%rax) -; SSE-NEXT: movdqa %xmm15, 400(%rax) +; SSE-NEXT: movdqa %xmm13, 400(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) @@ -4120,70 +4134,70 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = 
ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] @@ -4191,430 +4205,430 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = 
xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm10, %ymm10 ; 
AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpmovzxwq 
{{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 +; 
AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm8, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} 
xmm5 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = 
xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4],ymm15[5],ymm3[6],ymm15[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, 
%ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps 
%ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm8, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm5, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2],ymm2[3],ymm7[4],ymm2[5],ymm7[6],ymm2[7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} 
xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4],ymm2[5],ymm6[6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 
288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4662,15 +4676,15 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,5,7,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 @@ -4685,22 +4699,22 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7,8],ymm7[9],ymm1[10,11,12],ymm7[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vmovdqa 48(%rax), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4],ymm4[5],ymm7[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 48(%r9), %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6],ymm5[7],ymm3[8,9,10],ymm5[11],ymm3[12,13,14],ymm5[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, 
%xmm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7],ymm3[8,9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] @@ -4720,41 +4734,41 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm14 ; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = 
xmm15[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7,8],ymm12[9],ymm14[10,11,12],ymm12[13],ymm14[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] @@ -4820,13 +4834,13 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq 
{{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm12 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 @@ -4837,20 +4851,20 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5] @@ -4906,101 +4920,101 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm10 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm9 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm6 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7,8],ymm14[9],ymm2[10,11,12],ymm14[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm11[1],ymm2[2],ymm11[3],ymm2[4],ymm11[5],ymm2[6],ymm11[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4],ymm9[5],ymm1[6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7],ymm8[8,9,10],ymm1[11],ymm8[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7,8],ymm0[9],ymm8[10,11,12],ymm0[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -5027,12 +5041,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] -; AVX2-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm9 = xmm7[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3,4],ymm9[5],ymm6[6,7,8],ymm9[9],ymm6[10,11,12],ymm9[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7,8],ymm10[9],ymm6[10,11,12],ymm10[13],ymm6[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] @@ -5060,7 +5074,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 192(%rax) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5087,7 +5101,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-FAST-LABEL: store_i8_stride8_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $392, %rsp # imm = 0x188 +; AVX2-FAST-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm1 @@ -5096,294 +5110,289 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; 
AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; 
AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm13 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%r10), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; AVX2-FAST-NEXT: vmovdqa 48(%r10), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm10 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6],ymm12[7],ymm13[8,9,10],ymm12[11],ymm13[12,13,14],ymm12[15] +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7,8],ymm7[9],ymm3[10,11,12],ymm7[13],ymm3[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; 
AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm5 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 
= xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, 
%ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm10 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4],ymm6[5],ymm10[6],ymm6[7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7],ymm5[8,9,10],ymm2[11],ymm5[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm15 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; 
AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm2 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm0 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, 
%ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm10 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm3 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3],ymm7[4,5,6],ymm2[7],ymm7[8,9,10],ymm2[11],ymm7[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm15 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm0 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7,8],ymm10[9],ymm12[10,11,12],ymm10[13],ymm12[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4],ymm2[5],ymm10[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm5 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; 
AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7,8],ymm6[9],ymm10[10,11,12],ymm6[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2],ymm1[3],ymm6[4],ymm1[5],ymm6[6],ymm1[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm7 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 
-; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4],ymm6[5],ymm9[6],ymm6[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rax) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) @@ -5405,13 +5414,13 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $392, %rsp # imm = 0x188 +; AVX2-FAST-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride8_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $392, %rsp # imm = 0x188 +; AVX2-FAST-PERLANE-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm1 @@ -5420,294 +5429,289 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm9, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm11, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6],ymm12[7],ymm13[8,9,10],ymm12[11],ymm13[12,13,14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), 
%xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7,8],ymm7[9],ymm3[10,11,12],ymm7[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4],ymm6[5],ymm10[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = 
[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7],ymm5[8,9,10],ymm2[11],ymm5[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} 
ymm2 = ymm7[0,1,2],ymm2[3],ymm7[4,5,6],ymm2[7],ymm7[8,9,10],ymm2[11],ymm7[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7,8],ymm10[9],ymm12[10,11,12],ymm10[13],ymm12[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4],ymm2[5],ymm10[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = 
<8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7,8],ymm6[9],ymm10[10,11,12],ymm6[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2],ymm1[3],ymm6[4],ymm1[5],ymm6[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm5, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4],ymm6[5],ymm9[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) @@ -5729,152 +5733,151 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $392, %rsp # imm = 0x188 +; AVX2-FAST-PERLANE-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512F-SLOW-NEXT: subq $632, %rsp # imm = 0x278 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm8 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm14 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm14, %ymm28 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm2, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm14, %ymm31 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm24 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm22 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm20 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 48(%r10), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 48(%rax), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} 
xmm5 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm8, %ymm5, %ymm28 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm30 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm12 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm18 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm30 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm31 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm23 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm5, %ymm27 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = 
xmm12[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm12[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm13, %ymm19 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm13, %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm10, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm24 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm5, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm13, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm13, %ymm26 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm4, %ymm25 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm4, %ymm23 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm4, %ymm22 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm4, %ymm19 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm10, %ymm29 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} 
xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 @@ -5883,12 +5886,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] @@ -5897,32 +5900,32 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa 16(%r10), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 16(%rax), %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = 
xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX512F-SLOW-NEXT: vmovdqa 48(%rsi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] @@ -5931,58 +5934,51 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,3,3,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX512F-SLOW-NEXT: 
vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm6 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm6 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm6, %zmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[0,0,2,1,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpandnq %zmm4, %zmm8, %zmm4 +; AVX512F-SLOW-NEXT: vpandnq %zmm3, %zmm8, %zmm3 ; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm5, %zmm5 ; AVX512F-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vpord %zmm4, %zmm5, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpord %zmm3, %zmm5, %zmm4 {%k1} ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd 
$246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm9 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm12 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $96, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm2, %zmm12 {%k1} ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] @@ -5993,512 +5989,510 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm21 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm1, %zmm21 {%k1} ; 
AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm20 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm22 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm23[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm24 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm6, %zmm24 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; 
AVX512F-SLOW-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpandnq %zmm5, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpord %zmm5, %zmm7, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpandnq %zmm3, %zmm8, %zmm3 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpord %zmm3, %zmm5, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm5, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm21 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm3, %ymm23 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm5 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm5, %ymm22 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm19 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm14 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm3, %ymm16 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm10, %ymm14 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm10, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm10, %ymm17 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm10, %ymm20 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3],xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm10, %ymm13 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm15 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm13 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm3, %zmm5 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpandnq %zmm13, %zmm8, %zmm13 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm23, %zmm23 -; AVX512F-SLOW-NEXT: vpord %zmm13, %zmm23, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm12[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 -; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm24, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm10[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm7, %ymm17 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm7, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm1[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm19, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm19 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm7, %zmm19 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm19 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd $96, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm25 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm7, %zmm19 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm13 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512F-SLOW-NEXT: vpandnq %zmm25, %zmm8, %zmm25 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm16, %zmm16 -; AVX512F-SLOW-NEXT: vpord %zmm25, %zmm16, %zmm13 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm25, %ymm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm25 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, 
%ymm1, %zmm7, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm11[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,1,1] +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm29 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm30 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm29, %zmm29 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm30 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm30, %zmm20 +; AVX512F-SLOW-NEXT: vpandnq %zmm29, %zmm8, %zmm29 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm20, %zmm20 +; AVX512F-SLOW-NEXT: vpord %zmm29, %zmm20, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm29, %ymm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm10, %zmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm21[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm10, %zmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 -; AVX512F-SLOW-NEXT: vpandnq %zmm10, %zmm8, %zmm10 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpord %zmm10, %zmm7, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm7 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm10, %ymm10 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 
$1, %xmm0, %ymm26, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm23[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm16[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm5, %zmm10 {%k1} +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm28, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm8, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm0, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm2, %zmm1 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-SLOW-NEXT: addq $632, %rsp # imm = 0x278 ; AVX512F-SLOW-NEXT: vzeroupper ; 
AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $392, %rsp # imm = 0x188 +; AVX512F-FAST-NEXT: subq $360, %rsp # imm = 0x168 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm11 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm14 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm5 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-FAST-NEXT: 
vpshufb %ymm13, %ymm3, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; 
AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm29 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm23 +; AVX512F-FAST-NEXT: 
vpshufb %ymm10, %ymm7, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm18 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm21 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 ; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm26 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa 16(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm23 +; AVX512F-FAST-NEXT: vpunpcklbw 
{{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 16(%r9), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 16(%r8), %xmm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm29 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm28 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm31 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm17 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; 
AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm2 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm3 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm3 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm4 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpandnq %zmm19, %zmm0, %zmm19 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm21, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm0 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm24 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512F-FAST-NEXT: vpandnq %zmm26, %zmm24, %zmm4 +; AVX512F-FAST-NEXT: vpandq %zmm24, %zmm29, %zmm15 ; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vpord %zmm19, %zmm21, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = 
xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm1 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %ymm21, %ymm1 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm6 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm15 -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm1 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm14 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm21, %ymm14 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm2 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 +; AVX512F-FAST-NEXT: vpord %zmm4, %zmm15, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm12 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm15, %ymm12 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandnq (%rsp), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm20, %zmm3, %zmm11 -; AVX512F-FAST-NEXT: vpandnq %zmm22, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm23, %zmm8 -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm3, %zmm14 -; AVX512F-FAST-NEXT: vpandnq 
%zmm25, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm26, %zmm8 -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm27, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpandnq %zmm28, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm29, %zmm8 -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm30, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vpandnq %zmm31, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm16, %zmm8 -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm19, %zmm3, %zmm5 -; AVX512F-FAST-NEXT: vpandnq %zmm13, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm21, %zmm0 -; AVX512F-FAST-NEXT: vpord %zmm3, %zmm0, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm11 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm15 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm26, %ymm15 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm12 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm14 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm14 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm14 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] 
+; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm8 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm8, %ymm26, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm14 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpord %zmm5, %zmm6, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpord %zmm5, %zmm6, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq %zmm17, %zmm24, %zmm5 +; AVX512F-FAST-NEXT: vpandq %zmm24, %zmm18, %zmm6 +; AVX512F-FAST-NEXT: vpord %zmm5, %zmm6, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm21, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vpandnq %zmm22, %zmm24, %zmm5 +; AVX512F-FAST-NEXT: vpandq %zmm24, %zmm23, %zmm6 +; AVX512F-FAST-NEXT: vpord %zmm5, %zmm6, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm25, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vpandnq %zmm27, %zmm24, %zmm5 +; AVX512F-FAST-NEXT: vpandq %zmm24, %zmm28, %zmm6 +; AVX512F-FAST-NEXT: vpord %zmm5, %zmm6, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm30, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vpandnq %zmm31, %zmm24, %zmm5 +; AVX512F-FAST-NEXT: 
vpandq %zmm24, %zmm16, %zmm6 +; AVX512F-FAST-NEXT: vpord %zmm5, %zmm6, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm29, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpandnq %zmm11, %zmm24, %zmm0 +; AVX512F-FAST-NEXT: vpandq %zmm24, %zmm9, %zmm5 +; AVX512F-FAST-NEXT: vpord %zmm0, %zmm5, %zmm1 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512F-FAST-NEXT: addq $392, %rsp # imm = 0x188 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-FAST-NEXT: addq $360, %rsp # imm = 0x168 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -6508,228 +6502,226 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r10), %xmm22 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r10), %xmm21 ; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r10), %xmm19 ; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rax), %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rax), %xmm23 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rax), %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rax), %xmm22 ; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rax), %xmm20 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r9), %xmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm21 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm10 ; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r8), %xmm18 ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r8), %xmm24 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm4 -; 
AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r8), %xmm25 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] ; AVX512BW-SLOW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 -; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm30 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm15, %zmm3 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rsi), %xmm27 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm28 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm8, %ymm8 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm11, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm2 ; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa 48(%rcx), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rcx), %xmm29 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm11 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm14 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm16 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] ; AVX512BW-SLOW-NEXT: movl $572662306, %eax # imm = 0x22222222 -; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-SLOW-NEXT: vpermw %zmm14, %zmm12, %zmm3 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm15 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 -; AVX512BW-SLOW-NEXT: vpermw %zmm14, %zmm10, %zmm14 -; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm9, %zmm14 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm15[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm29 = xmm15[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm27, %ymm27 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm29, %ymm15 -; AVX512BW-SLOW-NEXT: vmovdqa 48(%rdx), %xmm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm15, %zmm15 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm27, %zmm27 -; AVX512BW-SLOW-NEXT: vpermw %zmm27, %zmm12, %zmm15 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm29 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-SLOW-NEXT: vpermw %zmm16, %zmm12, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm16, %zmm16, %zmm16 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm17, %zmm17, %zmm17 +; AVX512BW-SLOW-NEXT: vpermw %zmm17, %zmm14, %zmm17 +; AVX512BW-SLOW-NEXT: vpermw %zmm16, %zmm15, %zmm17 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm16[2,3,2,3] +; 
AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm31 = xmm16[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm31[0],zero,zero,zero,xmm31[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm31, %ymm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdx), %xmm31 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm16, %zmm16 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm31[0],xmm29[0],xmm31[1],xmm29[1],xmm31[2],xmm29[2],xmm31[3],xmm29[3],xmm31[4],xmm29[4],xmm31[5],xmm29[5],xmm31[6],xmm29[6],xmm31[7],xmm29[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm30, %zmm30, %zmm30 +; AVX512BW-SLOW-NEXT: vpermw %zmm30, %zmm12, %zmm16 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm30 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm31 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm21 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm24 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm20, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm10, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm21, %zmm9, %zmm19 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm20[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm20[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm14, %zmm19 +; AVX512BW-SLOW-NEXT: vpermw %zmm24, %zmm15, %zmm19 {%k2} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm20[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm25 = xmm20[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm24, %ymm24 +; AVX512BW-SLOW-NEXT: 
vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm20[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm24, %ymm20 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm20 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm20 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm21 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm21 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm29[0],xmm31[1],xmm29[1],xmm31[2],xmm29[2],xmm31[3],xmm29[3],xmm31[4],xmm29[4],xmm31[5],xmm29[5],xmm31[6],xmm29[6],xmm31[7],xmm29[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm27[0],xmm2[1],xmm27[1],xmm2[2],xmm27[2],xmm2[3],xmm27[3],xmm2[4],xmm27[4],xmm2[5],xmm27[5],xmm2[6],xmm27[6],xmm2[7],xmm27[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm24 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm30 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm25 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm22 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm22 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm31[8],xmm29[8],xmm31[9],xmm29[9],xmm31[10],xmm29[10],xmm31[11],xmm29[11],xmm31[12],xmm29[12],xmm31[13],xmm29[13],xmm31[14],xmm29[14],xmm31[15],xmm29[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm25, %ymm20 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm31[8],xmm29[8],xmm31[9],xmm29[9],xmm31[10],xmm29[10],xmm31[11],xmm29[11],xmm31[12],xmm29[12],xmm31[13],xmm29[13],xmm31[14],xmm29[14],xmm31[15],xmm29[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm24, %zmm24, %zmm24 +; AVX512BW-SLOW-NEXT: vpermw %zmm24, %zmm12, %zmm20 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm24, %zmm24, %zmm25 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm26[0],xmm23[0],xmm26[1],xmm23[1],xmm26[2],xmm23[2],xmm26[3],xmm23[3],xmm26[4],xmm23[4],xmm26[5],xmm23[5],xmm26[6],xmm23[6],xmm26[7],xmm23[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm24, %zmm24, %zmm24 +; AVX512BW-SLOW-NEXT: vpermw %zmm24, %zmm14, %zmm24 +; AVX512BW-SLOW-NEXT: vpermw %zmm25, %zmm15, %zmm24 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm0[0],xmm30[0],xmm0[1],xmm30[1],xmm0[2],xmm30[2],xmm0[3],xmm30[3],xmm0[4],xmm30[4],xmm0[5],xmm30[5],xmm0[6],xmm30[6],xmm0[7],xmm30[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm25[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm25[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm28[0],zero,zero,zero,xmm28[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm27, %ymm27 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm25 = xmm25[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm28 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm28[0],xmm1[0],xmm28[1],xmm1[1],xmm28[2],xmm1[2],xmm28[3],xmm1[3],xmm28[4],xmm1[4],xmm28[5],xmm1[5],xmm28[6],xmm1[6],xmm28[7],xmm1[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm27, %zmm27 +; AVX512BW-SLOW-NEXT: vpermw %zmm27, %zmm12, %zmm25 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rsi), %xmm27 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm29 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm26[8],xmm23[8],xmm26[9],xmm23[9],xmm26[10],xmm23[10],xmm26[11],xmm23[11],xmm26[12],xmm23[12],xmm26[13],xmm23[13],xmm26[14],xmm23[14],xmm26[15],xmm23[15] +; 
AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm23 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm26 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm22, %zmm22, %zmm21 +; AVX512BW-SLOW-NEXT: vpermw %zmm21, %zmm14, %zmm21 +; AVX512BW-SLOW-NEXT: vpermw %zmm26, %zmm15, %zmm21 {%k2} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm30[8],xmm0[9],xmm30[9],xmm0[10],xmm30[10],xmm0[11],xmm30[11],xmm0[12],xmm30[12],xmm0[13],xmm30[13],xmm0[14],xmm30[14],xmm0[15],xmm30[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm22 = xmm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm26[0],zero,zero,zero,xmm26[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm22, %ymm22 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm23, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm27[8],xmm2[9],xmm27[9],xmm2[10],xmm27[10],xmm2[11],xmm27[11],xmm2[12],xmm27[12],xmm2[13],xmm27[13],xmm2[14],xmm27[14],xmm2[15],xmm27[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm26, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm22 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm28[8],xmm1[8],xmm28[9],xmm1[9],xmm28[10],xmm1[10],xmm28[11],xmm1[11],xmm28[12],xmm1[12],xmm28[13],xmm1[13],xmm28[14],xmm1[14],xmm28[15],xmm1[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm23 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm16[0],xmm13[0],xmm16[1],xmm13[1],xmm16[2],xmm13[2],xmm16[3],xmm13[3],xmm16[4],xmm13[4],xmm16[5],xmm13[5],xmm16[6],xmm13[6],xmm16[7],xmm13[7] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm22 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm13[0],xmm18[1],xmm13[1],xmm18[2],xmm13[2],xmm18[3],xmm13[3],xmm18[4],xmm13[4],xmm18[5],xmm13[5],xmm18[6],xmm13[6],xmm18[7],xmm13[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm26 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm26 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm14, %zmm1 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm15, %zmm1 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm26[0],zero,zero,zero,xmm26[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm28[0],zero,zero,zero,xmm28[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm26, %ymm26 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm25[0],xmm2[1],xmm25[1],xmm2[2],xmm25[2],xmm2[3],xmm25[3],xmm2[4],xmm25[4],xmm2[5],xmm25[5],xmm2[6],xmm25[6],xmm2[7],xmm25[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm27 {%k2} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm13[8],xmm16[9],xmm13[9],xmm16[10],xmm13[10],xmm16[11],xmm13[11],xmm16[12],xmm13[12],xmm16[13],xmm13[13],xmm16[14],xmm13[14],xmm16[15],xmm13[15] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm13 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm10, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm13, %zmm9, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm28, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm28 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm28[0],xmm23[0],xmm28[1],xmm23[1],xmm28[2],xmm23[2],xmm28[3],xmm23[3],xmm28[4],xmm23[4],xmm28[5],xmm23[5],xmm28[6],xmm23[6],xmm28[7],xmm23[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm26, %zmm26, %zmm26 +; AVX512BW-SLOW-NEXT: vpermw %zmm26, %zmm12, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm18[8],xmm13[8],xmm18[9],xmm13[9],xmm18[10],xmm13[10],xmm18[11],xmm13[11],xmm18[12],xmm13[12],xmm18[13],xmm13[13],xmm18[14],xmm13[14],xmm18[15],xmm13[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: vpermw %zmm9, %zmm14, %zmm9 +; AVX512BW-SLOW-NEXT: vpermw %zmm5, %zmm15, %zmm9 {%k2} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm29[8],xmm27[8],xmm29[9],xmm27[9],xmm29[10],xmm27[10],xmm29[11],xmm27[11],xmm29[12],xmm27[12],xmm29[13],xmm27[13],xmm29[14],xmm27[14],xmm29[15],xmm27[15] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX512BW-SLOW-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 -; AVX512BW-SLOW-NEXT: vpermw %zmm5, %zmm10, %zmm5 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm7 -; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm9, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm7, %ymm7 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm25[8],xmm2[9],xmm25[9],xmm2[10],xmm25[10],xmm2[11],xmm25[11],xmm2[12],xmm25[12],xmm2[13],xmm25[13],xmm2[14],xmm25[14],xmm2[15],xmm25[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpermw %zmm2, %zmm12, %zmm1 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX512BW-SLOW-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vpermw %zmm6, %zmm12, %zmm2 {%k2} +; AVX512BW-SLOW-NEXT: vpermw %zmm6, %zmm14, %zmm6 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm10 +; 
AVX512BW-SLOW-NEXT: vpermw %zmm10, %zmm15, %zmm6 {%k2} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm10, %ymm10 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm28[8],xmm23[8],xmm28[9],xmm23[9],xmm28[10],xmm23[10],xmm28[11],xmm23[11],xmm28[12],xmm23[12],xmm28[13],xmm23[13],xmm28[14],xmm23[14],xmm28[15],xmm23[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512BW-SLOW-NEXT: vpermw %zmm10, %zmm12, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 +; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm12, %zmm4 {%k1} ; AVX512BW-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm14, %zmm15 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm17, %zmm16 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm19, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm21, %zmm24 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm22, %zmm23 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm26, %zmm27 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm21, %zmm22 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm9, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) 
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 256(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -6738,7 +6730,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm12 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%r10), %xmm18 ; AVX512BW-FAST-NEXT: vmovdqa64 48(%r10), %xmm17 ; AVX512BW-FAST-NEXT: vmovdqa (%rax), %xmm1 @@ -6762,33 +6754,33 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm6, %zmm16 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm7 ; AVX512BW-FAST-NEXT: vmovdqa64 48(%rcx), %xmm23 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15,8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm8, %zmm24 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm6, %zmm24 ; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512BW-FAST-NEXT: vmovdqa64 48(%rsi), %xmm25 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm28 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm26 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm27 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm26 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm14 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm27 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm28 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm28, %ymm6, %ymm6 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm6, %zmm6 ; AVX512BW-FAST-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm8 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm6 {%k2} ; AVX512BW-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-FAST-NEXT: kmovd %eax, %k3 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm16, %zmm8 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm16, %zmm6 {%k3} ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3],xmm20[4],xmm17[4],xmm20[5],xmm17[5],xmm20[6],xmm17[6],xmm20[7],xmm17[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm16, %zmm16 @@ -6798,24 +6790,24 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm25[0],xmm28[1],xmm25[1],xmm28[2],xmm25[2],xmm28[3],xmm25[3],xmm28[4],xmm25[4],xmm28[5],xmm25[5],xmm28[6],xmm25[6],xmm28[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm16, %xmm26 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm27, %ymm26 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdx), %xmm30 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm16, %xmm27 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm28, %ymm27 +; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdx), %xmm28 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm16 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm26, %ymm26 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm26, %zmm26 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm26, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm26, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm26 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm16, %ymm16 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = 
xmm28[0],xmm23[0],xmm28[1],xmm23[1],xmm28[2],xmm23[2],xmm28[3],xmm23[3],xmm28[4],xmm23[4],xmm28[5],xmm23[5],xmm28[6],xmm23[6],xmm28[7],xmm23[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm27, %ymm27 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm27, %zmm27 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm27, %zmm27 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm27, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm27 ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm16 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm27 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm24 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm29 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm21 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 @@ -6825,114 +6817,114 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm20, %zmm20 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm20 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm29 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm23[8],xmm30[9],xmm23[9],xmm30[10],xmm23[10],xmm30[11],xmm23[11],xmm30[12],xmm23[12],xmm30[13],xmm23[13],xmm30[14],xmm23[14],xmm30[15],xmm23[15] +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm22 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm28[8],xmm23[8],xmm28[9],xmm23[9],xmm28[10],xmm23[10],xmm28[11],xmm23[11],xmm28[12],xmm23[12],xmm28[13],xmm23[13],xmm28[14],xmm23[14],xmm28[15],xmm23[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm17, %zmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm28[8],xmm25[8],xmm28[9],xmm25[9],xmm28[10],xmm25[10],xmm28[11],xmm25[11],xmm28[12],xmm25[12],xmm28[13],xmm25[13],xmm28[14],xmm25[14],xmm28[15],xmm25[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm23 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm23, %ymm23 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm17, %xmm25 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm17, %zmm23 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm25 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm25, %ymm25 +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm17, %xmm26 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm17 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm22, %zmm17 {%k2} +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm23, %zmm17 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm20, %zmm17 {%k3} ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = 
xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm22, %zmm22 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm22, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm20, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm21[0],xmm29[1],xmm21[1],xmm29[2],xmm21[2],xmm29[3],xmm21[3],xmm29[4],xmm21[4],xmm29[5],xmm21[5],xmm29[6],xmm21[6],xmm29[7],xmm21[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm20, %xmm23 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm28 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm23, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm24[0],xmm27[0],xmm24[1],xmm27[1],xmm24[2],xmm27[2],xmm24[3],xmm27[3],xmm24[4],xmm27[4],xmm24[5],xmm27[5],xmm24[6],xmm27[6],xmm24[7],xmm27[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm23 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm23, %zmm23 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm23, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm23, %zmm20 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%r9), %xmm23 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm22, %zmm20 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm25 +; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm23, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm20, %zmm23 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm20, %xmm25 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm26, %ymm25 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm26 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm20, %ymm20 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm25, %zmm20 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm26[0],xmm29[0],xmm26[1],xmm29[1],xmm26[2],xmm29[2],xmm26[3],xmm29[3],xmm26[4],xmm29[4],xmm26[5],xmm29[5],xmm26[6],xmm29[6],xmm26[7],xmm29[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm25, %ymm25 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm25, %zmm25 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm25, %zmm25 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 16(%r9), %xmm25 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm23, %zmm20 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm23 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = 
xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] ; AVX512BW-FAST-NEXT: vmovdqa64 16(%rcx), %xmm19 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm22 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm24[8],xmm27[8],xmm24[9],xmm27[9],xmm24[10],xmm27[10],xmm24[11],xmm27[11],xmm24[12],xmm27[12],xmm24[13],xmm27[13],xmm24[14],xmm27[14],xmm24[15],xmm27[15] +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm27 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm26, %ymm26 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm26, %zmm26 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm26, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm26 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 +; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm18, %zmm24 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm28 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm26[8],xmm29[8],xmm26[9],xmm29[9],xmm26[10],xmm29[10],xmm26[11],xmm29[11],xmm26[12],xmm29[12],xmm26[13],xmm29[13],xmm26[14],xmm29[14],xmm26[15],xmm29[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm18, %zmm24 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm29[8],xmm21[8],xmm29[9],xmm21[9],xmm29[10],xmm21[10],xmm29[11],xmm21[11],xmm29[12],xmm21[12],xmm29[13],xmm21[13],xmm29[14],xmm21[14],xmm29[15],xmm21[15] +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm18, %zmm26 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm18, %xmm28 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm21 +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm18, %xmm22 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm28, %ymm18, %ymm18 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm18 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm26, %zmm18 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm18 {%k3} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = 
xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm21, %zmm21 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm25[0],xmm23[0],xmm25[1],xmm23[1],xmm25[2],xmm23[2],xmm25[3],xmm23[3],xmm25[4],xmm23[4],xmm25[5],xmm23[5],xmm25[6],xmm23[6],xmm25[7],xmm23[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm22, %zmm22 +; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm22, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm21, %xmm24 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm26, %ymm24 +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdx), %xmm26 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm21 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm24, %zmm21 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm21, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm21, %xmm27 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm28, %ymm27 -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdx), %xmm28 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm27, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm28[0],xmm19[0],xmm28[1],xmm19[1],xmm28[2],xmm19[2],xmm28[3],xmm19[3],xmm28[4],xmm19[4],xmm28[5],xmm19[5],xmm28[6],xmm19[6],xmm28[7],xmm19[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm27, %ymm27 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm27, %zmm21 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm21 {%k3} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15] -; 
AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm24, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm21 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm22, %zmm21 {%k3} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm12, %zmm12 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm15 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm28[8],xmm19[8],xmm28[9],xmm19[9],xmm28[10],xmm19[10],xmm28[11],xmm19[11],xmm28[12],xmm19[12],xmm28[13],xmm19[13],xmm28[14],xmm19[14],xmm28[15],xmm19[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm19, %xmm22 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm19, %xmm22 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm23, %ymm22 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm19 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm19 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm19, %ymm19 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm22, %zmm19 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm14, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm19 {%k2} +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm19 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm19 {%k3} ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -6943,14 +6935,14 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} 
-; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm4 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 @@ -6964,7 +6956,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll index 122b478577fbfe..07800f9a7f6a7e 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -96,66 +96,66 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; SSSE3-LABEL: testv2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 -; SSSE3-NEXT: psrld $16, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: paddw %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpeqw %xmm4, %xmm1 ; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: psrld $16, %xmm3 +; SSSE3-NEXT: paddd %xmm1, 
%xmm3 ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrlq $32, %xmm1 -; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrlq $32, %xmm3 +; SSSE3-NEXT: paddq %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqw %xmm4, %xmm2 -; SSE41-NEXT: psrld $16, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqw %xmm4, %xmm1 ; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: paddd %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrlq $32, %xmm3 +; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: testv2i64: @@ -374,66 +374,66 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; SSSE3-LABEL: testv2i64u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 -; SSSE3-NEXT: psrld $16, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm3, 
%xmm1 +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: paddw %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpeqw %xmm4, %xmm1 ; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: psrld $16, %xmm3 +; SSSE3-NEXT: paddd %xmm1, %xmm3 ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrlq $32, %xmm1 -; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrlq $32, %xmm3 +; SSSE3-NEXT: paddq %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqw %xmm4, %xmm2 -; SSE41-NEXT: psrld $16, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqw %xmm4, %xmm1 ; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: paddd %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrlq $32, %xmm3 +; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: testv2i64u: @@ -790,24 +790,24 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; X86-SSE-LABEL: testv4i32: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pshufb %xmm0, %xmm4 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pshufb %xmm0, %xmm3 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE-NEXT: psrlw $4, %xmm1 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pshufb %xmm1, %xmm3 -; X86-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X86-SSE-NEXT: pand %xmm4, %xmm1 -; X86-SSE-NEXT: paddb %xmm3, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: pcmpeqb %xmm2, %xmm3 -; X86-SSE-NEXT: psrlw $8, %xmm3 -; X86-SSE-NEXT: pand %xmm1, %xmm3 +; X86-SSE-NEXT: pxor %xmm4, %xmm4 +; X86-SSE-NEXT: pshufb %xmm1, %xmm2 +; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm1 +; X86-SSE-NEXT: pand %xmm3, %xmm1 +; X86-SSE-NEXT: paddb %xmm2, %xmm1 +; 
X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm2 +; X86-SSE-NEXT: psrlw $8, %xmm2 +; X86-SSE-NEXT: pand %xmm1, %xmm2 ; X86-SSE-NEXT: psrlw $8, %xmm1 -; X86-SSE-NEXT: paddw %xmm3, %xmm1 -; X86-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X86-SSE-NEXT: paddw %xmm2, %xmm1 +; X86-SSE-NEXT: pcmpeqw %xmm4, %xmm0 ; X86-SSE-NEXT: psrld $16, %xmm0 ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: psrld $16, %xmm1 @@ -1039,24 +1039,24 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; X86-SSE-LABEL: testv4i32u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pshufb %xmm0, %xmm4 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pshufb %xmm0, %xmm3 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE-NEXT: psrlw $4, %xmm1 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pshufb %xmm1, %xmm3 -; X86-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X86-SSE-NEXT: pand %xmm4, %xmm1 -; X86-SSE-NEXT: paddb %xmm3, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: pcmpeqb %xmm2, %xmm3 -; X86-SSE-NEXT: psrlw $8, %xmm3 -; X86-SSE-NEXT: pand %xmm1, %xmm3 +; X86-SSE-NEXT: pxor %xmm4, %xmm4 +; X86-SSE-NEXT: pshufb %xmm1, %xmm2 +; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm1 +; X86-SSE-NEXT: pand %xmm3, %xmm1 +; X86-SSE-NEXT: paddb %xmm2, %xmm1 +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm2 +; X86-SSE-NEXT: psrlw $8, %xmm2 +; X86-SSE-NEXT: pand %xmm1, %xmm2 ; X86-SSE-NEXT: psrlw $8, %xmm1 -; X86-SSE-NEXT: paddw %xmm3, %xmm1 -; X86-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X86-SSE-NEXT: paddw %xmm2, %xmm1 +; X86-SSE-NEXT: pcmpeqw %xmm4, %xmm0 ; X86-SSE-NEXT: psrld $16, %xmm0 ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: psrld $16, %xmm1 @@ -1254,22 +1254,22 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; X86-SSE-LABEL: testv8i16: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-SSE-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE-NEXT: pshufb %xmm0, %xmm3 -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrlw $4, %xmm1 -; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: psrlw $4, %xmm3 +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE-NEXT: pxor %xmm4, %xmm4 -; X86-SSE-NEXT: pshufb %xmm1, %xmm2 -; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm1 -; X86-SSE-NEXT: pand %xmm3, %xmm1 -; X86-SSE-NEXT: paddb %xmm2, %xmm1 +; X86-SSE-NEXT: pshufb %xmm3, %xmm1 +; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm3 +; X86-SSE-NEXT: pand %xmm2, %xmm3 +; X86-SSE-NEXT: paddb %xmm1, %xmm3 ; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm0 ; X86-SSE-NEXT: psrlw $8, %xmm0 -; X86-SSE-NEXT: pand %xmm1, %xmm0 -; X86-SSE-NEXT: psrlw $8, %xmm1 -; X86-SSE-NEXT: paddw %xmm1, %xmm0 +; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: psrlw $8, %xmm3 +; X86-SSE-NEXT: paddw %xmm3, %xmm0 ; X86-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0) ret <8 x i16> %out @@ -1462,22 +1462,22 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; X86-SSE-LABEL: testv8i16u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-SSE-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE-NEXT: pshufb %xmm0, %xmm3 -; 
X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrlw $4, %xmm1 -; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: psrlw $4, %xmm3 +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE-NEXT: pxor %xmm4, %xmm4 -; X86-SSE-NEXT: pshufb %xmm1, %xmm2 -; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm1 -; X86-SSE-NEXT: pand %xmm3, %xmm1 -; X86-SSE-NEXT: paddb %xmm2, %xmm1 +; X86-SSE-NEXT: pshufb %xmm3, %xmm1 +; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm3 +; X86-SSE-NEXT: pand %xmm2, %xmm3 +; X86-SSE-NEXT: paddb %xmm1, %xmm3 ; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm0 ; X86-SSE-NEXT: psrlw $8, %xmm0 -; X86-SSE-NEXT: pand %xmm1, %xmm0 -; X86-SSE-NEXT: psrlw $8, %xmm1 -; X86-SSE-NEXT: paddw %xmm1, %xmm0 +; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: psrlw $8, %xmm3 +; X86-SSE-NEXT: paddw %xmm3, %xmm0 ; X86-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1) ret <8 x i16> %out diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll index 3c5e3adf038faa..a724babe469c54 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -214,17 +214,17 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -234,8 +234,8 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] @@ -301,17 +301,17 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; 
AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -321,8 +321,8 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll index 008188b52c2005..c62147030aa0b0 100644 --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -58,28 +58,28 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) { ; CHECK-NEXT: cmovll %ecx, %edx ; CHECK-NEXT: pextrw $1, %xmm0, %esi ; CHECK-NEXT: leal (%rsi,%rsi), %edi -; CHECK-NEXT: movswl %si, %r8d -; CHECK-NEXT: movl %r8d, %esi -; CHECK-NEXT: shrl $16, %esi -; CHECK-NEXT: shldw $1, %di, %si -; CHECK-NEXT: sarl $16, %r8d -; CHECK-NEXT: cmpl $16384, %r8d # imm = 0x4000 -; CHECK-NEXT: cmovgel %eax, %esi -; CHECK-NEXT: cmpl $-16384, %r8d # imm = 0xC000 -; CHECK-NEXT: cmovll %ecx, %esi -; CHECK-NEXT: movd %xmm0, %edi -; CHECK-NEXT: movswl %di, %edi -; CHECK-NEXT: movl %edi, %r8d +; CHECK-NEXT: movswl %si, %esi +; CHECK-NEXT: movl %esi, %r8d ; CHECK-NEXT: shrl $16, %r8d ; CHECK-NEXT: shldw $1, %di, %r8w -; CHECK-NEXT: sarl $16, %edi -; CHECK-NEXT: cmpl $16384, %edi # imm = 0x4000 +; CHECK-NEXT: sarl $16, %esi +; CHECK-NEXT: cmpl $16384, %esi # imm = 0x4000 ; CHECK-NEXT: cmovgel %eax, %r8d -; CHECK-NEXT: cmpl $-16384, %edi # imm = 0xC000 +; CHECK-NEXT: cmpl $-16384, %esi # imm = 0xC000 ; CHECK-NEXT: cmovll %ecx, %r8d -; CHECK-NEXT: movzwl %r8w, %edi -; CHECK-NEXT: movd %edi, %xmm1 -; CHECK-NEXT: pinsrw $1, %esi, %xmm1 +; CHECK-NEXT: movd %xmm0, %esi +; CHECK-NEXT: movswl %si, %esi +; CHECK-NEXT: movl %esi, %edi +; CHECK-NEXT: shrl $16, %edi +; CHECK-NEXT: shldw $1, %si, %di +; CHECK-NEXT: sarl $16, %esi +; CHECK-NEXT: cmpl $16384, %esi # imm = 0x4000 +; CHECK-NEXT: cmovgel %eax, %edi +; CHECK-NEXT: cmpl $-16384, %esi # imm = 0xC000 +; CHECK-NEXT: cmovll %ecx, %edi 
+; CHECK-NEXT: movzwl %di, %esi +; CHECK-NEXT: movd %esi, %xmm1 +; CHECK-NEXT: pinsrw $1, %r8d, %xmm1 ; CHECK-NEXT: pinsrw $2, %edx, %xmm1 ; CHECK-NEXT: pextrw $3, %xmm0, %edx ; CHECK-NEXT: movswl %dx, %edx @@ -105,31 +105,31 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) { ; CHECK: # %bb.0: ; CHECK-NEXT: pextrw $2, %xmm0, %eax ; CHECK-NEXT: leal (%rax,%rax,2), %eax -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shrl $16, %edx -; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: shldw $1, %ax, %cx -; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: shldw $1, %ax, %dx +; CHECK-NEXT: cmpl $32768, %ecx # imm = 0x8000 ; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF -; CHECK-NEXT: cmovael %eax, %ecx -; CHECK-NEXT: pextrw $1, %xmm0, %edx -; CHECK-NEXT: addl %edx, %edx -; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: cmovael %eax, %edx +; CHECK-NEXT: pextrw $1, %xmm0, %ecx +; CHECK-NEXT: addl %ecx, %ecx +; CHECK-NEXT: movl %ecx, %esi ; CHECK-NEXT: shrl $16, %esi ; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: shldw $1, %dx, %di +; CHECK-NEXT: shldw $1, %cx, %di ; CHECK-NEXT: cmpl $32768, %esi # imm = 0x8000 ; CHECK-NEXT: cmovael %eax, %edi -; CHECK-NEXT: movd %xmm0, %edx +; CHECK-NEXT: movd %xmm0, %ecx ; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: shldw $1, %dx, %si -; CHECK-NEXT: movl $32768, %edx # imm = 0x8000 -; CHECK-NEXT: negl %edx +; CHECK-NEXT: shldw $1, %cx, %si +; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000 +; CHECK-NEXT: negl %ecx ; CHECK-NEXT: cmovael %eax, %esi -; CHECK-NEXT: movzwl %si, %edx -; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: movzwl %si, %ecx +; CHECK-NEXT: movd %ecx, %xmm1 ; CHECK-NEXT: pinsrw $1, %edi, %xmm1 -; CHECK-NEXT: pinsrw $2, %ecx, %xmm1 +; CHECK-NEXT: pinsrw $2, %edx, %xmm1 ; CHECK-NEXT: pextrw $3, %xmm0, %ecx ; CHECK-NEXT: shll $2, %ecx ; CHECK-NEXT: movl %ecx, %edx diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll index dc0ebe6b2e2eff..bd0fa25a2a381c 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll @@ -201,46 +201,46 @@ define i64 @test_v8i64_v8i8(<8 x i8> %a0) { define i64 @test_v16i64_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i64_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: psrad $24, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = 
xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm6 ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm8 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE2-NEXT: paddq %xmm5, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm9 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; SSE2-NEXT: paddq %xmm9, %xmm10 ; SSE2-NEXT: paddq %xmm8, %xmm10 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE2-NEXT: paddq %xmm4, %xmm6 -; SSE2-NEXT: paddq %xmm0, %xmm6 -; SSE2-NEXT: paddq %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE2-NEXT: paddq %xmm2, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: paddq %xmm6, %xmm0 -; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: paddq %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16i64_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll index 401118af26259f..caf8d7703811a0 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll @@ -402,36 +402,36 @@ define i64 @reduce_ctpop_v4i64(<4 x i64> %a0) { define i32 @reduce_ctpop_v8i32(<8 x i32> %a0) { ; SSE42-LABEL: reduce_ctpop_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm2, %xmm5 -; SSE42-NEXT: pshufb %xmm4, %xmm5 +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE42-NEXT: movdqa %xmm1, %xmm3 +; SSE42-NEXT: pand %xmm2, %xmm3 +; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE42-NEXT: movdqa %xmm4, 
%xmm5 +; SSE42-NEXT: pshufb %xmm3, %xmm5 ; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm3, %xmm1 -; SSE42-NEXT: movdqa %xmm2, %xmm4 -; SSE42-NEXT: pshufb %xmm1, %xmm4 -; SSE42-NEXT: paddb %xmm5, %xmm4 +; SSE42-NEXT: pand %xmm2, %xmm1 +; SSE42-NEXT: movdqa %xmm4, %xmm3 +; SSE42-NEXT: pshufb %xmm1, %xmm3 +; SSE42-NEXT: paddb %xmm5, %xmm3 ; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE42-NEXT: psadbw %xmm1, %xmm4 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE42-NEXT: psadbw %xmm1, %xmm3 ; SSE42-NEXT: psadbw %xmm1, %xmm5 -; SSE42-NEXT: packuswb %xmm4, %xmm5 -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pshufb %xmm4, %xmm6 +; SSE42-NEXT: packuswb %xmm3, %xmm5 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pand %xmm2, %xmm3 +; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pshufb %xmm3, %xmm6 ; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm3, %xmm0 -; SSE42-NEXT: pshufb %xmm0, %xmm2 -; SSE42-NEXT: paddb %xmm6, %xmm2 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE42-NEXT: psadbw %xmm1, %xmm2 +; SSE42-NEXT: pand %xmm2, %xmm0 +; SSE42-NEXT: pshufb %xmm0, %xmm4 +; SSE42-NEXT: paddb %xmm6, %xmm4 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE42-NEXT: psadbw %xmm1, %xmm4 ; SSE42-NEXT: psadbw %xmm1, %xmm0 -; SSE42-NEXT: packuswb %xmm2, %xmm0 +; SSE42-NEXT: packuswb %xmm4, %xmm0 ; SSE42-NEXT: paddd %xmm5, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE42-NEXT: paddd %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll index fe2c41f57cfab1..39532b591db7ca 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -136,61 +136,63 @@ define float @test_v3f32(<3 x float> %a0) { define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE2-NEXT: cmpunordss %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 ; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: maxss %xmm1, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE2-NEXT: maxss %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: andnps %xmm3, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: andnps %xmm3, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: maxss %xmm0, %xmm3 -; SSE2-NEXT: cmpunordss %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 ; SSE2-NEXT: andnps %xmm3, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: maxss %xmm0, %xmm2 -; SSE2-NEXT: cmpunordss %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm1 +; 
SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 ; SSE2-NEXT: andnps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm0, %xmm2 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 ; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: maxss %xmm1, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE41-NEXT: andnps %xmm3, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: andnps %xmm3, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 ; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: maxss %xmm0, %xmm3 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: maxss %xmm1, %xmm3 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 ; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: maxss %xmm0, %xmm2 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 ; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: @@ -330,71 +332,38 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v8f32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] -; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm8 -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm8, %xmm8, %k1 -; AVX512BW-NEXT: vmaxss %xmm8, %xmm6, %xmm0 -; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxss %xmm0, %xmm5, %xmm0 -; AVX512BW-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxss %xmm0, %xmm3, %xmm0 -; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxss %xmm0, %xmm4, %xmm0 -; AVX512BW-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxss 
%xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm8 -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm8, %xmm8, %k1 -; AVX512VL-NEXT: vmaxss %xmm8, %xmm6, %xmm0 -; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 -; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm4, %xmm0 -; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm7, %xmm8 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} +; AVX512-NEXT: vcmpunordss %xmm8, %xmm8, %k1 +; AVX512-NEXT: vmaxss %xmm8, %xmm6, %xmm0 +; AVX512-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX512-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0) ret float %1 } @@ -593,14 +562,14 @@ define float @test_v16f32(<16 x float> %a0) { ; ; AVX512VL-LABEL: test_v16f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = 
xmm6[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm6[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm5 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3,3,3] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm5[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm5[1,1,3,3] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0] @@ -630,8 +599,8 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512VL-NEXT: vmaxss %xmm0, %xmm10, %xmm0 ; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0 -; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0 ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} @@ -639,20 +608,20 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm0 ; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 -; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm4, %xmm0 ; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 ; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0) @@ -851,71 +820,38 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v8f64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] -; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0] -; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm8 -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 -; AVX512BW-NEXT: vmaxsd %xmm8, %xmm5, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: 
vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxsd %xmm0, %xmm3, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxsd %xmm0, %xmm4, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0] -; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm8 -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 -; AVX512VL-NEXT: vmaxsd %xmm8, %xmm5, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxsd %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxsd %xmm0, %xmm4, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm7, %xmm8 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 +; AVX512-NEXT: vmaxsd %xmm8, %xmm5, %xmm0 +; AVX512-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 +; AVX512-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxsd %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxsd %xmm0, %xmm4, %xmm0 +; AVX512-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0) ret 
double %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll index ec41657d2f248f..bb5396c6f17a74 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll @@ -249,18 +249,18 @@ define float @test_v4f32(<4 x float> %a0) { ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX-NEXT: vcmpunordss %xmm2, %xmm2, %xmm4 -; AVX-NEXT: vblendvps %xmm4, %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vmovd %xmm3, %eax +; AVX-NEXT: vblendvps %xmm4, %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovd %xmm2, %eax ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: js .LBB2_4 ; AVX-NEXT: # %bb.5: -; AVX-NEXT: vmovaps %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm3 ; AVX-NEXT: jmp .LBB2_6 ; AVX-NEXT: .LBB2_4: -; AVX-NEXT: vmovapd %xmm1, %xmm2 -; AVX-NEXT: vmovaps %xmm3, %xmm1 +; AVX-NEXT: vmovapd %xmm1, %xmm3 +; AVX-NEXT: vmovaps %xmm2, %xmm1 ; AVX-NEXT: .LBB2_6: -; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vmaxss %xmm3, %xmm1, %xmm2 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3 ; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovd %xmm1, %eax @@ -627,17 +627,17 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: por %xmm7, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: maxps %xmm2, %xmm6 ; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: cmpunordps %xmm5, %xmm0 -; SSE2-NEXT: andps %xmm0, %xmm5 -; SSE2-NEXT: andnps %xmm6, %xmm0 -; SSE2-NEXT: orps %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: maxps %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: cmpunordps %xmm5, %xmm2 +; SSE2-NEXT: andps %xmm2, %xmm5 +; SSE2-NEXT: andnps %xmm0, %xmm2 +; SSE2-NEXT: orps %xmm5, %xmm2 +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm6 ; SSE2-NEXT: pandn %xmm1, %xmm6 @@ -654,19 +654,19 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-NEXT: andps %xmm3, %xmm4 ; SSE2-NEXT: andnps %xmm1, %xmm3 ; SSE2-NEXT: orps %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: maxps %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpunordps %xmm2, %xmm0 -; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: cmpunordps %xmm5, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm5 ; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax @@ -729,15 +729,15 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-LABEL: test_v16f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: movaps %xmm1, %xmm6 +; SSE41-NEXT: movaps %xmm1, %xmm5 ; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5 ; SSE41-NEXT: blendvps %xmm0, 
%xmm1, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm5 -; SSE41-NEXT: maxps %xmm6, %xmm5 +; SSE41-NEXT: movaps %xmm3, %xmm6 +; SSE41-NEXT: maxps %xmm5, %xmm6 ; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: cmpunordps %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: movaps %xmm4, %xmm3 ; SSE41-NEXT: movaps %xmm4, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 @@ -749,13 +749,13 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movaps %xmm5, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: movaps %xmm6, %xmm1 ; SSE41-NEXT: maxps %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm5, %xmm0 -; SSE41-NEXT: cmpunordps %xmm5, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movaps %xmm6, %xmm0 +; SSE41-NEXT: cmpunordps %xmm6, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: testl %eax, %eax @@ -1279,15 +1279,15 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE41-LABEL: test_v8f64: ; SSE41: # %bb.0: ; SSE41-NEXT: movapd %xmm0, %xmm4 -; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: maxpd %xmm6, %xmm5 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: maxpd %xmm5, %xmm6 ; SSE41-NEXT: movapd %xmm3, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: movapd %xmm4, %xmm3 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 @@ -1299,13 +1299,13 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm1 ; SSE41-NEXT: maxpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm5, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: cmpunordpd %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: movq %xmm1, %rax @@ -1579,15 +1579,15 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movapd %xmm1, %xmm8 ; SSE41-NEXT: movapd %xmm0, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm10 +; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: maxpd %xmm10, %xmm9 +; SSE41-NEXT: movapd %xmm7, %xmm10 +; SSE41-NEXT: maxpd %xmm9, %xmm10 ; SSE41-NEXT: movapd %xmm7, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 ; SSE41-NEXT: 
movapd %xmm8, %xmm7 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 @@ -1599,13 +1599,13 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm5 ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 -; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 +; SSE41-NEXT: movapd %xmm10, %xmm3 ; SSE41-NEXT: maxpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm9, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm10, %xmm0 +; SSE41-NEXT: cmpunordpd %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm5 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll index 5ae9e552d0dcda..42714fccdc62f2 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -69,61 +69,63 @@ define float @test_v2f32(<2 x float> %a0) { define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE2-NEXT: cmpunordss %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 ; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: minss %xmm1, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: andnps %xmm3, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: andnps %xmm3, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: minss %xmm0, %xmm3 -; SSE2-NEXT: cmpunordss %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: minss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 ; SSE2-NEXT: andnps %xmm3, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: minss %xmm0, %xmm2 -; SSE2-NEXT: cmpunordss %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 ; SSE2-NEXT: andnps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm0, %xmm2 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 ; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: minss %xmm1, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: minss %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE41-NEXT: andnps 
%xmm3, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: andnps %xmm3, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 ; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: minss %xmm0, %xmm3 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: minss %xmm1, %xmm3 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 ; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: minss %xmm0, %xmm2 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: minss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 ; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: @@ -263,71 +265,38 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v8f32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] -; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm8 -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm8, %xmm8, %k1 -; AVX512BW-NEXT: vminss %xmm8, %xmm6, %xmm0 -; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminss %xmm0, %xmm5, %xmm0 -; AVX512BW-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminss %xmm0, %xmm3, %xmm0 -; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminss %xmm0, %xmm4, %xmm0 -; AVX512BW-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminss %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm8 -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm8, %xmm8, %k1 -; AVX512VL-NEXT: vminss %xmm8, %xmm6, %xmm0 -; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0 -; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: 
vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm4, %xmm0 -; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm0, %xmm7, %xmm8 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} +; AVX512-NEXT: vcmpunordss %xmm8, %xmm8, %k1 +; AVX512-NEXT: vminss %xmm8, %xmm6, %xmm0 +; AVX512-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminss %xmm0, %xmm5, %xmm0 +; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminss %xmm0, %xmm4, %xmm0 +; AVX512-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0) ret float %1 } @@ -526,14 +495,14 @@ define float @test_v16f32(<16 x float> %a0) { ; ; AVX512VL-LABEL: test_v16f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm6[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm5 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3,3,3] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm5[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm5[1,1,3,3] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0] @@ -563,8 +532,8 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512VL-NEXT: vminss %xmm0, %xmm10, %xmm0 ; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm6, %xmm0 -; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 
{%k1} +; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0 ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} @@ -572,20 +541,20 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm0 ; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0 -; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vminss %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm4, %xmm0 ; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0 ; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a0) @@ -854,71 +823,38 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v8f64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] -; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512BW-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0] -; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm8 -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 -; AVX512BW-NEXT: vminsd %xmm8, %xmm5, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminsd %xmm0, %xmm6, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminsd %xmm0, %xmm3, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminsd %xmm0, %xmm4, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminsd %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminsd %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = 
xmm0[1,0] -; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm8 -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 -; AVX512VL-NEXT: vminsd %xmm8, %xmm5, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminsd %xmm0, %xmm6, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminsd %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminsd %xmm0, %xmm4, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminsd %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminsd %xmm0, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm0, %xmm7, %xmm8 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 +; AVX512-NEXT: vminsd %xmm8, %xmm5, %xmm0 +; AVX512-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminsd %xmm0, %xmm6, %xmm0 +; AVX512-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminsd %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminsd %xmm0, %xmm4, %xmm0 +; AVX512-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminsd %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0) ret double %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index 4799b8e7e5857b..c201b5e6a899b2 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -1033,14 +1033,14 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-LABEL: test_v16i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; 
SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -1051,12 +1051,12 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm6 ; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 @@ -1154,64 +1154,64 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm7, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm6, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm9, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm9 ; SSE2-NEXT: pandn %xmm1, %xmm3 ; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand 
%xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; SSE4-LABEL: test_v32i32: diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll index 75eeec456c9ac3..217e92584d3d5f 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -369,32 +369,32 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; ; SSE42-LABEL: test_v8i64: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] ; SSE42-NEXT: movdqa %xmm1, %xmm6 -; SSE42-NEXT: pxor %xmm4, %xmm6 +; SSE42-NEXT: pxor %xmm5, %xmm6 ; SSE42-NEXT: movdqa %xmm3, %xmm0 -; SSE42-NEXT: pxor %xmm4, %xmm0 +; SSE42-NEXT: pxor %xmm5, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE42-NEXT: movdqa %xmm5, %xmm1 -; SSE42-NEXT: pxor %xmm4, %xmm1 +; SSE42-NEXT: movdqa %xmm4, %xmm1 +; SSE42-NEXT: pxor %xmm5, %xmm1 ; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: pxor %xmm4, %xmm0 +; SSE42-NEXT: pxor %xmm5, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE42-NEXT: movapd %xmm2, %xmm1 -; SSE42-NEXT: xorpd %xmm4, %xmm1 +; SSE42-NEXT: xorpd %xmm5, %xmm1 ; SSE42-NEXT: movapd %xmm3, %xmm0 -; SSE42-NEXT: xorpd %xmm4, %xmm0 +; SSE42-NEXT: xorpd %xmm5, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE42-NEXT: movdqa %xmm3, %xmm0 -; SSE42-NEXT: pxor %xmm4, %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm4 -; SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: pxor %xmm5, %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm5 +; SSE42-NEXT: pcmpgtq %xmm0, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE42-NEXT: movq %xmm1, %rax ; SSE42-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index 7f828fc293caac..883b5aa239b4ef 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -796,35 +796,35 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; ; AVX512DQ-LABEL: mask_replication_factor3_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k1 -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: movw $1, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k3 ; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 +; AVX512DQ-NEXT: vpmovm2d %k2, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k4} {z} ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx) @@ -844,13 +844,14 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw (%rdi), %k2 ; AVX512BW-NEXT: kandw %k4, %k2, %k3 ; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: movw $-9, %ax @@ -861,9 +862,9 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax @@ -916,8 +917,8 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k2 +; AVX512BW-NEXT: kshiftrd $4, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF @@ -930,8 +931,8 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, 
%k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $5, %k0, %k2 @@ -941,102 +942,101 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k5 ; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 +; AVX512BW-NEXT: kshiftrw $14, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $28, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $11, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $29, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $29, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; 
AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $30, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k7 +; AVX512BW-NEXT: kshiftrd $30, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 +; AVX512BW-NEXT: kshiftrw $4, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $31, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 +; AVX512BW-NEXT: kshiftrd $31, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k7 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $22, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $23, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -1044,7 +1044,6 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr 
%in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 @@ -1088,19 +1087,19 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 @@ -1123,12 +1122,12 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -1141,8 +1140,8 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -1152,128 +1151,128 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $10, %k0, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $10, %k0, %k5 +; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, 
%k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $12, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 +; AVX512BW-NEXT: kshiftrd $12, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $13, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 +; AVX512BW-NEXT: kshiftrd $13, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 +; AVX512BW-NEXT: kshiftrd $14, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k6 +; AVX512BW-NEXT: kshiftrd $15, 
%k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $14, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $6, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $7, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $7, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $8, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $8, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $8, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 
; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $9, %k0, %k0 ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 @@ -1648,8 +1647,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k5, %k1, %k1 @@ -1679,8 +1678,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -1713,13 +1712,12 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $50, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -1749,6 +1747,7 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 @@ -1759,10 +1758,10 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr 
%in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrq $42, %k0, %k1 -; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k3 +; AVX512BW-NEXT: kshiftrq $42, %k0, %k3 +; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -1819,8 +1818,7 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 @@ -1828,7 +1826,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $37, %k0, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 @@ -3637,37 +3636,37 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k3 +; AVX512BW-NEXT: kandw %k2, %k1, %k2 ; AVX512BW-NEXT: kshiftrd $2, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 +; AVX512BW-NEXT: kshiftrw $5, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovd %eax, %k7 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $4, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF ; AVX512BW-NEXT: kmovd %eax, %k4 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k4 -; AVX512BW-NEXT: 
korw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF ; AVX512BW-NEXT: kmovd %eax, %k4 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill @@ -3720,15 +3719,15 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $31, %k5, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 @@ -3791,10 +3790,10 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 @@ -3860,12 +3859,12 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -3927,11 +3926,11 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kandw %k3, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kandw %k3, 
%k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4125,12 +4124,12 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kandw %k3, %k4, %k4 @@ -4194,10 +4193,10 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 @@ -4307,41 +4306,41 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: movw $1, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, 
%zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm8, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm3, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm7, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm8, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm9, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm8, %zmm8 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} @@ -4349,22 +4348,22 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} ; 
AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 @@ -4373,33 +4372,33 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1152(%rdx) +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1152(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1088(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1024(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 832(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 640(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 192(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) @@ -4410,41 +4409,41 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: movw $1, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; 
AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm8, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm7, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm9, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm8, %zmm8 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} @@ -4452,22 +4451,22 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, 
%k1 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 @@ -4476,33 +4475,33 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1152(%rdx) +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1152(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1088(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1024(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 640(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) @@ -4541,15 +4540,17 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: movw $-33, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovq %k1, %k6 +; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-65, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovq %k1, %k4 +; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: 
kshiftrw $9, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-129, %ax @@ -4573,151 +4574,149 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k3 -; AVX512BW-NEXT: kshiftrq $2, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k0, %k2 +; AVX512BW-NEXT: kshiftrq $2, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k7 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k7, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k7, %k7 +; AVX512BW-NEXT: kmovd %eax, %k7 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k0 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k7, %k7 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k7, %k0 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $3, %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k7, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k2 +; AVX512BW-NEXT: kshiftrq $3, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $5, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; 
AVX512BW-NEXT: kshiftrw $3, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $6, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k7 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512BW-NEXT: kandw %k3, %k1, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $7, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $8, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw 
$6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 @@ -4725,8 +4724,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k0, %k0 @@ -4734,228 +4733,229 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 +; AVX512BW-NEXT: korw %k2, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k7} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $10, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq 
$11, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $12, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z} +; AVX512BW-NEXT: korw %k2, %k0, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $13, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: 
korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $14, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k1 +; AVX512BW-NEXT: korw %k2, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 ; 
AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $19, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k7} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $20, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $20, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 @@ -4964,409 +4964,408 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $21, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; 
AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $22, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $23, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $24, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 +; AVX512BW-NEXT: korw %k2, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $28, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: 
kshiftrw $3, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k6} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 +; AVX512BW-NEXT: korw %k2, %k0, %k2 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $29, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $30, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw 
%k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k1 +; AVX512BW-NEXT: korw %k2, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: kshiftrq $34, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 +; AVX512BW-NEXT: korw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $36, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, 
%k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $39, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $40, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; 
AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k0, %k0 @@ -5374,190 +5373,190 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 +; AVX512BW-NEXT: korw %k2, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kandw %k3, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $42, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw 
%k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $44, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: korw %k2, %k0, %k2 +; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k2} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k1, 
%k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 +; AVX512BW-NEXT: korw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k1 +; AVX512BW-NEXT: korw %k2, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 
{%k1} {z} ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $50, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, 
%k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 +; AVX512BW-NEXT: korw %k7, %k0, %k0 +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 @@ -5565,259 +5564,258 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $52, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: 
kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $54, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $55, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k7 +; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, 
%k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $55, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $56, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 +; AVX512BW-NEXT: korw %k2, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: 
kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: 
kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k6} {z} +; AVX512BW-NEXT: korw %k2, %k0, %k2 +; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k2} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $61, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $62, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 +; AVX512BW-NEXT: kshiftrq $63, %k5, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; 
AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k0, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k4 +; AVX512BW-NEXT: korw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 @@ -6422,27 +6420,27 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF ; AVX512BW-NEXT: kmovd %eax, %k6 ; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF +; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k1 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $2, %k5, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 ; AVX512BW-NEXT: kmovq %k2, %k4 @@ -6499,25 +6497,25 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte 
Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k4 ; AVX512BW-NEXT: kshiftrd $31, %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -6532,11 +6530,11 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 ; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $26, %k5, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k1 +; AVX512BW-NEXT: kshiftrd $26, %k5, %k1 +; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -6567,32 +6565,32 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $28, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; 
AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k5, %k3 @@ -6619,8 +6617,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -6645,165 +6643,166 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $3, %k4, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k4, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: korw %k4, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $21, %k1, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 +; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $21, %k2, %k4 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k5 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k5, %k4 ; AVX512BW-NEXT: 
korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k5, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k5, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $22, %k1, %k4 -; AVX512BW-NEXT: kmovq %k1, %k7 +; AVX512BW-NEXT: kshiftrd $22, %k2, %k4 +; AVX512BW-NEXT: kmovq %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k4 -; AVX512BW-NEXT: kshiftrd $23, %k7, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k4 +; AVX512BW-NEXT: kshiftrd $23, %k6, %k5 +; AVX512BW-NEXT: kmovq %k6, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 ; AVX512BW-NEXT: korw %k3, %k4, %k3 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z} -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftrd $18, %k7, %k6 -; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k6 +; AVX512BW-NEXT: kshiftrd $18, %k7, %k3 +; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k0 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $19, %k7, %k6 +; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $20, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kshiftrd $20, %k3, %k6 +; AVX512BW-NEXT: kmovq %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, 
%k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 ; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: korw %k0, %k2, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovq %k4, %k0 -; AVX512BW-NEXT: kshiftrd $16, %k4, %k1 +; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 @@ -6821,33 +6820,33 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 +; AVX512BW-NEXT: kmovq %k0, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 -; AVX512BW-NEXT: 
korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload @@ -6868,23 +6867,23 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $13, %k0, %k3 +; AVX512BW-NEXT: kmovq %k4, %k0 +; AVX512BW-NEXT: kshiftrd $13, %k4, %k3 ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k5, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k4, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k3 ; AVX512BW-NEXT: kmovq %k0, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 @@ -6894,15 +6893,16 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload @@ -6953,12 +6953,11 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, 
%k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -6969,7 +6968,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $12, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 @@ -6986,16 +6986,16 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k5, %k4 @@ -7018,10 +7018,10 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k4, %k1 @@ -7047,26 +7047,26 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $3, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k4 -; AVX512BW-NEXT: kmovq %k2, %k5 +; AVX512BW-NEXT: kmovq %k2, %k3 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $5, %k1, %k2 @@ -7224,64 +7224,64 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64: ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: movw $1, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm6 -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm4, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm11, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm4, %zmm8 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm10, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm11, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm12, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm4, %zmm24 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm14, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm10, %zmm10 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm12, %zmm12 -; 
AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm14, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm8 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm8, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm3, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm7, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm8, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm9, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm3, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm13, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm8, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm3 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 @@ -7290,8 +7290,8 @@ define void 
@mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 @@ -7302,106 +7302,106 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1472(%rdx) +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1472(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1152(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1088(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 704(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 640(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 576(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor6_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: movw $1, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm6 -; AVX512DQ-NEXT: vpermd %zmm7, %zmm4, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm11, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm10, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm11, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm12, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm4, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm14, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm14, %zmm4 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd 
%zmm4, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm8, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm7, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm9, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm13, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm8, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm3 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 @@ -7410,8 +7410,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 @@ -7422,42 +7422,42 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; 
AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1472(%rdx) +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1472(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1152(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 704(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 640(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -7564,7 +7564,6 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k5, %k3 ; AVX512BW-NEXT: kshiftrq $3, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 @@ -7573,27 +7572,27 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte 
Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $4, %k3, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -9695,7 +9694,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k3 +; AVX512BW-NEXT: kmovq %k2, %k5 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 @@ -9703,7 +9702,8 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovq %k2, %k4 +; AVX512BW-NEXT: kmovq %k2, %k3 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-9, %ax @@ -9747,15 +9747,16 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovq %k2, %k4 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF @@ -9790,19 +9791,15 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k1, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kmovq %k6, %k2 ; AVX512BW-NEXT: kshiftrd $29, %k6, %k1 ; AVX512BW-NEXT: kmovd %k1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k0 +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k4, %k6 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $30, %k2, %k1 -; AVX512BW-NEXT: kmovq %k2, %k4 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $30, %k6, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k0, %k0 @@ -9810,8 +9807,8 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -9829,18 +9826,17 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k3 -; AVX512BW-NEXT: kshiftrd $31, %k4, %k0 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k3 +; AVX512BW-NEXT: kshiftrd $31, %k6, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -9859,24 +9855,25 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k1, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $27, %k4, %k1 +; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $27, %k6, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw %k2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $13, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k7 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k7 -; AVX512BW-NEXT: kshiftrd $28, %k4, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k7 +; AVX512BW-NEXT: kshiftrd $28, %k6, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k7, %k6 @@ -9900,16 +9897,16 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k2, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -9941,32 +9938,32 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k5 ; AVX512BW-NEXT: kshiftrd $26, %k6, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: 
kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -10008,8 +10005,8 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 @@ -10019,12 +10016,12 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $24, %k6, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 @@ -10069,7 +10066,8 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k5 ; AVX512BW-NEXT: kshiftrd $21, %k3, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 @@ -10089,8 +10087,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload @@ -10122,30 +10119,30 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $18, %k2, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $18, %k4, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 +; AVX512BW-NEXT: kandw %k1, %k2, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 +; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k7, %k6 ; 
AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $19, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $19, %k4, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -10157,16 +10154,16 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kandw %k3, %k6, %k6 @@ -10197,24 +10194,23 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 ; AVX512BW-NEXT: 
korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k3, %k0 ; AVX512BW-NEXT: kandw %k4, %k0, %k3 @@ -10230,7 +10226,8 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -10263,30 +10260,30 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k3 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 -; AVX512BW-NEXT: kmovq %k0, %k6 +; AVX512BW-NEXT: kmovq %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kandw %k4, %k3, %k3 @@ -10294,53 +10291,52 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k5 -; AVX512BW-NEXT: kshiftrd $15, %k6, %k3 -; AVX512BW-NEXT: kmovq %k6, %k0 +; AVX512BW-NEXT: kshiftrd $15, %k2, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k5, %k3 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 ; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovq %k0, %k3 -; AVX512BW-NEXT: kshiftrd $11, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $11, %k3, %k6 +; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k6 +; AVX512BW-NEXT: kandw %k7, %k5, %k6 ; AVX512BW-NEXT: kshiftrd $12, %k3, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k7 @@ -10349,16 +10345,16 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k6, %k6 +; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kandw %k4, %k6, %k6 @@ -10368,27 +10364,26 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $4, %k7, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: kmovq %k3, %k0 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $9, %k6, %k0 @@ -10397,16 +10392,16 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kandw %k1, %k2, %k2 @@ -10418,8 +10413,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; 
AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -10443,16 +10437,17 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $2, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $7, %k4, %k1 @@ -10463,15 +10458,15 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -10677,135 +10672,137 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: movw $1, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 
2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm11, %zmm11, %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm0 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm17, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm19, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm5, %zmm8 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm10 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm15, %zmm12 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm16, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm17, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm18, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm19, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm5, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm13, %zmm24 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm15, %zmm25 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm16, %zmm26 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm17, %zmm27 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm18, %zmm28 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm5, %zmm29 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm19, %zmm30 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm31 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm18, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm19, %zmm5 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm12, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm15, %zmm9 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm16, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm3, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm15, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm16, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm3, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm26 +; 
AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm27 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm28 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm29 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm3, %zmm30 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm31 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm10, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm12, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm14, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm16, %zmm3 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm13 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm15 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm26 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm25 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm24 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm22 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, 
%k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm4 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1728(%rdx) +; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 1728(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1664(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1600(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1152(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1088(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1024(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 896(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 
%zmm11, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1600(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1536(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1088(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 1024(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 960(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -10814,135 +10811,137 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: movw $1, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm11 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm5, %zmm8 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm10 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm15, %zmm12 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm16, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm21 -; 
AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm5, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm13, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm15, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm16, %zmm26 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm17, %zmm27 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm18, %zmm28 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm5, %zmm29 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm19, %zmm30 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm31 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm19, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm12, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm14, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm15, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm16, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm3, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm15, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm16, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm3, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm26 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm27 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm28 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm29 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm3, %zmm30 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm31 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm10, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm12, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm14, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm3 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; 
AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm13 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm15 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm26 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm25 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm24 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm22 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 -; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 -; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm4 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1728(%rdx) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload 
+; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 +; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 1728(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1664(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1600(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1152(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1088(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 896(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1600(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1536(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1088(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 1024(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 960(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -10959,9 +10958,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovq %k2, %k3 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-9, %ax @@ -10971,9 +10969,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovd %eax, %k5 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 
2-byte Spill ; AVX512BW-NEXT: kshiftrw $11, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-33, %ax @@ -10992,15 +10990,15 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq (%rdi), %k4 -; AVX512BW-NEXT: kshiftrq $1, %k4, %k0 +; AVX512BW-NEXT: kmovq (%rdi), %k3 +; AVX512BW-NEXT: kshiftrq $1, %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovd %eax, %k4 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF @@ -11010,9 +11008,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF @@ -11037,7 +11035,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $2, %k4, %k1 +; AVX512BW-NEXT: kshiftrq $2, %k3, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -11049,34 +11047,33 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k7, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftrq $3, %k4, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k1 +; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
AVX512BW-NEXT: kshiftrq $3, %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11084,11 +11081,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11116,46 +11114,46 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $5, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; 
AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; AVX512BW-NEXT: kandw %k4, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $6, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11172,8 +11170,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $7, %k4, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $7, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11184,41 +11182,39 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $8, %k4, %k0 -; AVX512BW-NEXT: kmovq %k4, %k5 +; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -11226,22 +11222,22 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $9, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 @@ -11257,52 +11253,52 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $10, %k5, %k0 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kshiftrq $10, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; 
AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $11, %k5, %k6 -; AVX512BW-NEXT: kmovq %k5, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $11, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kandw %k5, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11311,8 +11307,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $12, %k4, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $12, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11328,56 +11324,56 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $13, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k6 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $14, %k5, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $14, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k2, %k1, %k1 @@ -11391,20 +11387,20 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $15, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, 
%k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11415,15 +11411,16 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $16, %k4, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -11437,12 +11434,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k2, %k1, %k1 @@ -11450,23 +11446,24 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $17, %k5, %k0 -; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kshiftrq $17, %k4, %k0 +; AVX512BW-NEXT: kmovq %k4, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: 
kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11474,11 +11471,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -11487,20 +11484,20 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11539,31 +11536,31 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kandw %k5, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $21, %k7, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $21, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11571,8 +11568,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11588,54 +11585,54 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $22, %k7, %k0 +; AVX512BW-NEXT: kshiftrq $22, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kandw %k5, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $23, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; 
AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 @@ -11662,12 +11659,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -11677,41 +11673,42 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} -; AVX512BW-NEXT: kandw %k5, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 ; AVX512BW-NEXT: kmovq %k2, %k7 ; AVX512BW-NEXT: kshiftrq $26, %k2, %k0 
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -11722,25 +11719,25 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $27, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kandw %k4, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11751,34 +11748,34 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k1 +; AVX512BW-NEXT: kandw %k5, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $28, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k6 @@ -11786,7 +11783,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -11807,8 +11804,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $30, %k5, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $30, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11816,41 +11813,41 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; 
AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $31, %k7, %k1 +; AVX512BW-NEXT: kmovq %k7, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kandw %k3, %k6, %k6 @@ -11864,7 +11861,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 +; AVX512BW-NEXT: kshiftrq $32, %k4, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 @@ -11873,25 +11870,26 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $33, %k5, %k0 -; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kshiftrq $33, %k4, %k0 +; 
AVX512BW-NEXT: kmovq %k4, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11899,8 +11897,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11920,8 +11918,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 +; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -11929,8 +11927,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 @@ -11940,14 +11938,13 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $35, %k3, %k0 ; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kshiftrq $35, %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11955,19 +11952,20 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -11992,12 +11990,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kandw %k4, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 @@ -12010,15 +12007,15 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k3, %k1, %k1 @@ -12028,18 +12025,18 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $38, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -12056,8 +12053,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftrq $39, %k5, %k0 +; AVX512BW-NEXT: kmovq 
{{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $39, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12080,8 +12077,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -12098,11 +12095,10 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -12146,6 +12142,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12161,12 +12158,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 @@ -12182,8 +12179,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -12209,8 +12206,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 
2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -12221,51 +12217,52 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $45, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k6 ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z} -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $46, %k5, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $46, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, 
%k1, %k1 ; AVX512BW-NEXT: kandw %k3, %k1, %k1 @@ -12280,7 +12277,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $47, %k7, %k1 +; AVX512BW-NEXT: kmovq %k7, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -12292,7 +12290,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -12307,14 +12305,14 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 +; AVX512BW-NEXT: kshiftrq $48, %k4, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -12335,17 +12333,17 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $49, %k5, %k0 -; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kshiftrq $49, %k4, %k0 +; AVX512BW-NEXT: kmovq %k4, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k3, %k1, %k1 @@ -12355,12 +12353,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 @@ -12376,15 +12374,15 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -12394,16 +12392,16 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -12449,11 +12447,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -12464,7 +12462,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ 
-12481,8 +12479,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -12515,11 +12513,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k2, %k1, %k1 @@ -12543,8 +12542,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 @@ -12632,14 +12630,14 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $60, %k5, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $60, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12660,18 +12658,19 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $61, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k6 +; 
AVX512BW-NEXT: kshiftrq $61, %k7, %k0 +; AVX512BW-NEXT: kmovq %k7, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -12690,58 +12689,58 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $62, %k5, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $62, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 +; AVX512BW-NEXT: kshiftrq $63, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k4, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 @@ -13133,26 +13132,26 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32: ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm0 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z} -; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm2, %zmm2 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7 @@ -13174,10 +13173,10 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 
448(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 @@ -13190,17 +13189,17 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 960(%rdx) +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 832(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 640(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) @@ -13214,26 +13213,26 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-LABEL: mask_replication_factor8_vf32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm16, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5 ; 
AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7 @@ -13255,10 +13254,10 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 @@ -13271,17 +13270,17 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 960(%rdx) +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 832(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 640(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) @@ -13367,78 +13366,78 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: subq $136, %rsp ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm10, %zmm10, %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm12, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm8, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm14, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm10 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm10, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm16, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm22, %zmm5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm24, %zmm7 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm26, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm25 -; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm12, %zmm27 -; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm14, %zmm28 -; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm16, %zmm29 -; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm18, %zmm30 -; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm20, %zmm31 -; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm22, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm24, %zmm6 -; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm26, %zmm2 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm0 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm12 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm10 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm8 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm15, %zmm7 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm17, %zmm9 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm19, %zmm12 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm21, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm10, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm15, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm17, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm19, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm21, %zmm26 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm8, %zmm27 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm10, %zmm28 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm29 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm13, %zmm30 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm15, %zmm31 +; 
AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm17, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm19, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm21, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm10, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm15, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm17, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm19, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm21, %zmm4 +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm21 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm13 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm15 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm17 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} @@ -13447,33 +13446,33 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm26 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm25 {%k1} {z} +; 
AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm24 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm23 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm20 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z} @@ -13490,34 +13489,34 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm30, 1920(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm29, 1856(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm28, 1792(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1728(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1664(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1600(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 1536(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1152(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1088(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 1024(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1728(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 1664(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1600(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1536(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1408(%rdx) +; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 1088(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 1024(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 704(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512F-ONLY-NEXT: addq $136, %rsp ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -13526,78 +13525,78 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $136, %rsp ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, 
%zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm22, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm24, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm12, %zmm27 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm14, %zmm28 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm16, %zmm29 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm18, %zmm30 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm20, %zmm31 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm22, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm24, %zmm6 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm26, %zmm2 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm12 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm10 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm15, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm17, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm19, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm21, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm10, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm15, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm17, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm19, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm21, %zmm26 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm27 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm10, %zmm28 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm29 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm13, %zmm30 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm15, %zmm31 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm17, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm19, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm21, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm10, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm15, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm17, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm19, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm21, %zmm4 +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm21 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 +; AVX512DQ-NEXT: 
vmovdqa32 64(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm13 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm15 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm17 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} @@ -13606,33 +13605,33 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 +; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm26 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 -; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm25 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm24 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 -; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm23 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm20 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 
+; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 +; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 +; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 ; AVX512DQ-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z} @@ -13649,34 +13648,34 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1920(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1856(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1792(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1728(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1664(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1600(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1536(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1152(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1088(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1024(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1728(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1664(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1600(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1536(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1088(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 1024(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 704(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512DQ-NEXT: addq $136, %rsp ; AVX512DQ-NEXT: vzeroupper ; 
AVX512DQ-NEXT: retq @@ -13687,79 +13686,79 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm7 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm11 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm16 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm15 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm12 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm13 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm9 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 -; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm5, %k2 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm10, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm5, %k1 +; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm15, %k2 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm9, %k2 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: 
vmovdqa32 896(%rsi), %zmm17 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm14 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm15 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm16, %k1 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm16 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm13, %k1 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k2} {z} ; AVX512BW-NEXT: vpmovb2m %zmm12, %k2 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm7, %k1 -; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm11, %k1 +; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm11 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm10, %k2 +; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm10 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} +; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} @@ -13770,24 +13769,24 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1536(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1536(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1472(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1408(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1344(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 1280(%rdx) +; AVX512BW-NEXT: 
vmovdqa64 %zmm11, 1280(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1024(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 832(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 768(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 704(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 640(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 33f7a4e42b7f70..8cda7dd6eb8cee 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -457,37 +457,36 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; SSE41-LABEL: var_rotate_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlw $4, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psllw $4, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: psllw $5, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psrlw $6, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psllw $2, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psrlw $7, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm2, %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_rotate_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index eae1b1b23bcea6..356fcd3e3ad4c7 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -3617,25 +3617,25 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; 
X86-SSE41-NEXT: pushl %esi ; X86-SSE41-NEXT: .cfi_def_cfa_offset 8 ; X86-SSE41-NEXT: .cfi_offset %esi, -8 -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE41-NEXT: movl (%edx), %eax -; X86-SSE41-NEXT: movl 4(%edx), %ecx -; X86-SSE41-NEXT: movl %ecx, %esi -; X86-SSE41-NEXT: movl 8(%edx), %edx -; X86-SSE41-NEXT: shldl $13, %ecx, %edx -; X86-SSE41-NEXT: shldl $15, %eax, %ecx +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl (%eax), %ecx +; X86-SSE41-NEXT: movl 4(%eax), %edx +; X86-SSE41-NEXT: movl %edx, %esi +; X86-SSE41-NEXT: movl 8(%eax), %eax +; X86-SSE41-NEXT: shldl $13, %edx, %eax +; X86-SSE41-NEXT: shldl $15, %ecx, %edx +; X86-SSE41-NEXT: shll $15, %edx +; X86-SSE41-NEXT: sarl $15, %edx ; X86-SSE41-NEXT: shll $15, %ecx ; X86-SSE41-NEXT: sarl $15, %ecx -; X86-SSE41-NEXT: shll $15, %eax -; X86-SSE41-NEXT: sarl $15, %eax -; X86-SSE41-NEXT: movd %eax, %xmm0 -; X86-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; X86-SSE41-NEXT: movd %ecx, %xmm0 +; X86-SSE41-NEXT: pinsrd $1, %edx, %xmm0 ; X86-SSE41-NEXT: shll $13, %esi ; X86-SSE41-NEXT: sarl $15, %esi ; X86-SSE41-NEXT: pinsrd $2, %esi, %xmm0 -; X86-SSE41-NEXT: shll $15, %edx -; X86-SSE41-NEXT: sarl $15, %edx -; X86-SSE41-NEXT: pinsrd $3, %edx, %xmm0 +; X86-SSE41-NEXT: shll $15, %eax +; X86-SSE41-NEXT: sarl $15, %eax +; X86-SSE41-NEXT: pinsrd $3, %eax, %xmm0 ; X86-SSE41-NEXT: popl %esi ; X86-SSE41-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE41-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll index 9294ae48a76956..7b3b7ca1be15f7 100644 --- a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll +++ b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll @@ -38,24 +38,24 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm11 = mem[0],zero ; SSE-NEXT: pcmpeqb %xmm8, %xmm0 -; SSE-NEXT: pmovsxbd %xmm0, %xmm7 +; SSE-NEXT: pmovsxbd %xmm0, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pmovsxbd %xmm0, %xmm0 ; SSE-NEXT: pcmpeqb %xmm8, %xmm1 -; SSE-NEXT: pmovsxbd %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: pmovsxbd %xmm1, %xmm6 -; SSE-NEXT: pcmpeqb %xmm8, %xmm2 -; SSE-NEXT: pmovsxbd %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: pmovsxbd %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: pmovsxbd %xmm1, %xmm7 +; SSE-NEXT: pcmpeqb %xmm8, %xmm3 +; SSE-NEXT: pmovsxbd %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pmovsxbd %xmm1, %xmm5 ; SSE-NEXT: pcmpeqb %xmm8, %xmm11 ; SSE-NEXT: pmovsxbd %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] -; SSE-NEXT: pmovsxbd %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] +; SSE-NEXT: pmovsxbd %xmm3, %xmm3 ; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm11 ; SSE-NEXT: movdqa %xmm11, %xmm12 ; SSE-NEXT: pslld %xmm9, %xmm12 @@ -65,52 +65,52 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r ; SSE-NEXT: movdqa %xmm12, %xmm13 ; SSE-NEXT: pslld %xmm9, %xmm13 ; SSE-NEXT: pslld %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm12 -; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: pslld 
%xmm9, %xmm13 -; SSE-NEXT: pslld %xmm10, %xmm7 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm7 -; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm6 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm12 +; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm6 ; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: pslld %xmm9, %xmm13 ; SSE-NEXT: pslld %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm6 -; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm13 ; SSE-NEXT: pslld %xmm9, %xmm13 -; SSE-NEXT: pslld %xmm10, %xmm5 +; SSE-NEXT: pslld %xmm10, %xmm7 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm5 -; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm4 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm7 +; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: pslld %xmm9, %xmm13 ; SSE-NEXT: pslld %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm4 -; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm13 ; SSE-NEXT: pslld %xmm9, %xmm13 -; SSE-NEXT: pslld %xmm10, %xmm3 +; SSE-NEXT: pslld %xmm10, %xmm5 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm3 -; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm2 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm5 +; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm13 ; SSE-NEXT: pslld %xmm9, %xmm13 ; SSE-NEXT: pslld %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm2 +; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pslld %xmm9, %xmm13 +; SSE-NEXT: pslld %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm3 ; SSE-NEXT: movups %xmm12, (%rdi,%rcx,4) ; SSE-NEXT: movups %xmm11, 16(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm6, 32(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm7, 48(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm4, 64(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm5, 80(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm2, 96(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm3, 112(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm7, 32(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm6, 48(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm5, 64(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm4, 80(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm3, 96(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm2, 112(%rdi,%rcx,4) ; SSE-NEXT: addq $32, %rcx ; SSE-NEXT: cmpq %rcx, %rdx ; SSE-NEXT: jne .LBB0_4 @@ -169,18 +169,18 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovq {{.*#+}} xmm9 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm10 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm13 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm14 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm11 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm12 = mem[0],zero ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm9, %xmm9 -; AVX1-NEXT: vpmovsxbd %xmm9, %xmm15 +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm13 ; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm9, %xmm12 +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm14 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm10, %xmm9 -; AVX1-NEXT: vpmovsxbd %xmm9, %xmm11 -; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] ; AVX1-NEXT: 
vpmovsxbd %xmm9, %xmm10 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm13, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm15 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm11, %xmm11 ; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[0],zero,mem[1],zero ; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm9 @@ -188,51 +188,51 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r ; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = mem[0],zero,mem[1],zero ; AVX1-NEXT: vpslld %xmm2, %xmm9, %xmm9 -; AVX1-NEXT: vblendvps %xmm15, %xmm1, %xmm9, %xmm9 -; AVX1-NEXT: vpmovsxbd %xmm13, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm13, %xmm13 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm14, %xmm14 -; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm15 -; AVX1-NEXT: vpslld %xmm0, %xmm15, %xmm0 -; AVX1-NEXT: vpslld %xmm2, %xmm15, %xmm2 -; AVX1-NEXT: vpmovsxbd %xmm14, %xmm15 -; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm12, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vblendvps %xmm13, %xmm1, %xmm9, %xmm9 +; AVX1-NEXT: vpmovsxbd %xmm11, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm11, %xmm11 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm12, %xmm12 +; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm13 +; AVX1-NEXT: vpslld %xmm0, %xmm13, %xmm0 +; AVX1-NEXT: vpslld %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vpmovsxbd %xmm12, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm12, %xmm12 +; AVX1-NEXT: vblendvps %xmm14, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm2 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm12 +; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm14 ; AVX1-NEXT: vpslld %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm11, %xmm12, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm11 -; AVX1-NEXT: vpslld %xmm3, %xmm11, %xmm12 -; AVX1-NEXT: vpslld %xmm4, %xmm11, %xmm11 -; AVX1-NEXT: vblendvps %xmm10, %xmm12, %xmm11, %xmm10 -; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm11 -; AVX1-NEXT: vpslld %xmm5, %xmm11, %xmm12 -; AVX1-NEXT: vpslld %xmm6, %xmm11, %xmm11 -; AVX1-NEXT: vblendvps %xmm1, %xmm12, %xmm11, %xmm1 -; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm11 -; AVX1-NEXT: vpslld %xmm5, %xmm11, %xmm12 -; AVX1-NEXT: vpslld %xmm6, %xmm11, %xmm11 -; AVX1-NEXT: vblendvps %xmm13, %xmm12, %xmm11, %xmm11 -; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm12 -; AVX1-NEXT: vpslld %xmm7, %xmm12, %xmm13 -; AVX1-NEXT: vpslld %xmm8, %xmm12, %xmm12 -; AVX1-NEXT: vblendvps %xmm15, %xmm13, %xmm12, %xmm12 -; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm13 -; AVX1-NEXT: vpslld %xmm7, %xmm13, %xmm15 -; AVX1-NEXT: vpslld %xmm8, %xmm13, %xmm13 -; AVX1-NEXT: vblendvps %xmm14, %xmm15, %xmm13, %xmm13 +; AVX1-NEXT: vblendvps %xmm10, %xmm14, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm10 +; AVX1-NEXT: vpslld %xmm3, %xmm10, %xmm14 +; AVX1-NEXT: vpslld %xmm4, %xmm10, %xmm10 +; AVX1-NEXT: vblendvps %xmm15, %xmm14, %xmm10, %xmm10 +; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm14 +; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15 +; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm1, %xmm15, %xmm14, %xmm1 +; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm14 +; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15 +; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm11, %xmm15, %xmm14, 
%xmm11 +; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm14 +; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 +; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm13, %xmm15, %xmm14, %xmm13 +; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm14 +; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 +; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm12, %xmm15, %xmm14, %xmm12 ; AVX1-NEXT: vmovups %xmm9, (%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm0, 16(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm2, 32(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm10, 48(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm1, 64(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm11, 80(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm12, 96(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm13, 112(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm13, 96(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm12, 112(%rdi,%rcx,4) ; AVX1-NEXT: addq $32, %rcx ; AVX1-NEXT: cmpq %rcx, %rdx ; AVX1-NEXT: jne .LBB0_4 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index 77f5f2660af7ed..46faf40c5a31b1 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -468,36 +468,36 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; ; X86-AVX1-LABEL: var_shift_v32i8: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X86-AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4 -; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X86-AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; X86-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 -; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 ; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; X86-AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 -; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 ; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; X86-AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 -; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 -; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 -; X86-AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 -; X86-AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpblendvb 
%xmm1, %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: var_shift_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index c355eeaa42b669..320fed5266fb80 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -246,30 +246,30 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; ; X86-AVX1-LABEL: var_shift_v16i16: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4,4,5,5,6,6,7,7] -; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm4 -; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4 -; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] ; X86-AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3 +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X86-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; X86-AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7] +; X86-AVX1-NEXT: vpslld $23, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; X86-AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 -; X86-AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X86-AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4,4,5,5,6,6,7,7] -; X86-AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4 -; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; X86-AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; X86-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 ; X86-AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; X86-AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: var_shift_v16i16: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index b34af730565e4d..3df01362158003 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -491,16 +491,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X86-AVX512-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [1,0,2,0,8,0,9,0] -; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3 +; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [1,0,2,0,8,0,9,0] +; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4 ; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,0,10,0,2,0,9,0] ; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] -; X86-AVX512-NEXT: vpermt2pd %zmm4, %zmm5, %zmm6 +; X86-AVX512-NEXT: vpermt2pd %zmm3, %zmm5, %zmm6 ; X86-AVX512-NEXT: vmovapd %ymm6, (%edx) -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,3,0,10,0,1,0] -; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4 -; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx) +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,0,3,0,10,0,1,0] +; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3 +; X86-AVX512-NEXT: vmovapd %ymm3, (%ecx) ; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,0,11,0,3,0,11,0] ; X86-AVX512-NEXT: # ymm3 = mem[0,1,0,1] ; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index 6f128a00343975..dd1bfa6d9de867 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -25,36 +25,36 @@ define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) { define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-LABEL: PR50049: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa 32(%rsi), %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = <128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u> ; SSE-NEXT: pshufb %xmm6, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = <0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u> -; SSE-NEXT: pshufb %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufb %xmm6, %xmm5 -; SSE-NEXT: pshufb %xmm7, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE-NEXT: pmullw %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = <8,u,9,u,10,u,128,u,128,u,128,u,128,u,128,u> +; SSE-NEXT: pshufb %xmm7, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufb %xmm6, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = <128,u,128,u,128,u,1,u,4,u,7,u,10,u,13,u> ; SSE-NEXT: pshufb %xmm7, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufb %xmm6, %xmm2 -; SSE-NEXT: pshufb %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pmullw %xmm3, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE-NEXT: pmullw %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = <8,u,9,u,10,u,128,u,128,u,128,u,128,u,128,u> +; SSE-NEXT: pshufb %xmm6, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = <128,u,128,u,128,u,1,u,4,u,7,u,10,u,13,u> +; SSE-NEXT: pshufb %xmm7, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshufb %xmm6, %xmm1 +; SSE-NEXT: pshufb %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pmullw %xmm5, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq %x1 = load <48 x i8>, ptr %p1, align 16 %x2 = load <48 x i8>, ptr %p2, align 16 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll index 3ca0e2121e0d12..6b42767f41b520 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -431,7 +431,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movd %eax, %xmm5 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax @@ -443,7 +443,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movd %eax, %xmm6 ; SSE2-NEXT: andl $15, %ecx ; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSE2-NEXT: movd %eax, %xmm9 @@ -473,10 +473,10 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] @@ -484,7 +484,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -515,7 +515,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -527,7 +527,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: andl $15, %ecx ; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSSE3-NEXT: movd %eax, %xmm9 @@ -557,10 +557,10 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] @@ -568,7 +568,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -868,10 +868,10 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: movd %eax, %xmm10 ; SSE2-NEXT: andl $15, %r9d ; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax -; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: movd %eax, %xmm11 ; SSE2-NEXT: andl $15, %r8d ; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax -; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: movd %eax, %xmm12 ; SSE2-NEXT: andl $15, %esi ; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm13 @@ -893,12 +893,12 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 @@ -966,10 +966,10 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: andl $15, %r9d ; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax 
-; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm11 ; SSSE3-NEXT: andl $15, %r8d ; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax -; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movd %eax, %xmm12 ; SSSE3-NEXT: andl $15, %esi ; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm13 @@ -991,12 +991,12 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 @@ -1014,58 +1014,60 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movzbl (%rdi), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: movzbl 1(%rdi), %eax -; SSE41-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE41-NEXT: movzbl 2(%rdi), %edx -; SSE41-NEXT: movzbl 3(%rdi), %esi -; SSE41-NEXT: movzbl 4(%rdi), %r8d -; SSE41-NEXT: movzbl 5(%rdi), %r9d -; SSE41-NEXT: movzbl 6(%rdi), %r10d -; SSE41-NEXT: movzbl 7(%rdi), %r11d -; SSE41-NEXT: movzbl 8(%rdi), %ebx -; SSE41-NEXT: movzbl 9(%rdi), %r14d -; SSE41-NEXT: movzbl 10(%rdi), %r15d -; SSE41-NEXT: movzbl 11(%rdi), %r12d -; SSE41-NEXT: movzbl 12(%rdi), %r13d -; SSE41-NEXT: movzbl 13(%rdi), %ebp -; SSE41-NEXT: movzbl 14(%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %esi +; SSE41-NEXT: andl $15, %esi +; SSE41-NEXT: movzbl 1(%rdi), %r8d +; SSE41-NEXT: movzbl 2(%rdi), %r9d +; SSE41-NEXT: movzbl 3(%rdi), %r10d +; SSE41-NEXT: movzbl 4(%rdi), %r11d +; SSE41-NEXT: movzbl 5(%rdi), %ebx +; SSE41-NEXT: movzbl 6(%rdi), %r14d +; SSE41-NEXT: movzbl 7(%rdi), %r15d +; SSE41-NEXT: movzbl 8(%rdi), %r12d +; SSE41-NEXT: movzbl 9(%rdi), %r13d +; SSE41-NEXT: movzbl 10(%rdi), %ebp +; SSE41-NEXT: movzbl 11(%rdi), %eax +; SSE41-NEXT: movzbl 12(%rdi), %ecx +; SSE41-NEXT: movzbl 13(%rdi), %edx +; SSE41-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE41-NEXT: movzbl 14(%rdi), %edx 
+; SSE41-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE41-NEXT: movzbl 15(%rdi), %edi ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movzbl -24(%rsp,%rcx), %ecx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $1, -24(%rsp,%rcx), %xmm0 -; SSE41-NEXT: andl $15, %edx -; SSE41-NEXT: pinsrb $2, -24(%rsp,%rdx), %xmm0 -; SSE41-NEXT: andl $15, %esi -; SSE41-NEXT: pinsrb $3, -24(%rsp,%rsi), %xmm0 +; SSE41-NEXT: movzbl -40(%rsp,%rsi), %esi +; SSE41-NEXT: movd %esi, %xmm0 ; SSE41-NEXT: andl $15, %r8d -; SSE41-NEXT: pinsrb $4, -24(%rsp,%r8), %xmm0 +; SSE41-NEXT: pinsrb $1, -40(%rsp,%r8), %xmm0 ; SSE41-NEXT: andl $15, %r9d -; SSE41-NEXT: pinsrb $5, -24(%rsp,%r9), %xmm0 +; SSE41-NEXT: pinsrb $2, -40(%rsp,%r9), %xmm0 ; SSE41-NEXT: andl $15, %r10d -; SSE41-NEXT: pinsrb $6, -24(%rsp,%r10), %xmm0 +; SSE41-NEXT: pinsrb $3, -40(%rsp,%r10), %xmm0 ; SSE41-NEXT: andl $15, %r11d -; SSE41-NEXT: pinsrb $7, -24(%rsp,%r11), %xmm0 +; SSE41-NEXT: pinsrb $4, -40(%rsp,%r11), %xmm0 ; SSE41-NEXT: andl $15, %ebx -; SSE41-NEXT: pinsrb $8, -24(%rsp,%rbx), %xmm0 +; SSE41-NEXT: pinsrb $5, -40(%rsp,%rbx), %xmm0 ; SSE41-NEXT: andl $15, %r14d -; SSE41-NEXT: pinsrb $9, -24(%rsp,%r14), %xmm0 +; SSE41-NEXT: pinsrb $6, -40(%rsp,%r14), %xmm0 ; SSE41-NEXT: andl $15, %r15d -; SSE41-NEXT: pinsrb $10, -24(%rsp,%r15), %xmm0 +; SSE41-NEXT: pinsrb $7, -40(%rsp,%r15), %xmm0 ; SSE41-NEXT: andl $15, %r12d -; SSE41-NEXT: pinsrb $11, -24(%rsp,%r12), %xmm0 +; SSE41-NEXT: pinsrb $8, -40(%rsp,%r12), %xmm0 ; SSE41-NEXT: andl $15, %r13d -; SSE41-NEXT: pinsrb $12, -24(%rsp,%r13), %xmm0 +; SSE41-NEXT: pinsrb $9, -40(%rsp,%r13), %xmm0 ; SSE41-NEXT: andl $15, %ebp -; SSE41-NEXT: pinsrb $13, -24(%rsp,%rbp), %xmm0 +; SSE41-NEXT: pinsrb $10, -40(%rsp,%rbp), %xmm0 ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $14, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $11, -40(%rsp,%rax), %xmm0 +; SSE41-NEXT: andl $15, %ecx +; SSE41-NEXT: pinsrb $12, -40(%rsp,%rcx), %xmm0 +; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $13, -40(%rsp,%rax), %xmm0 +; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $14, -40(%rsp,%rax), %xmm0 ; SSE41-NEXT: andl $15, %edi -; SSE41-NEXT: pinsrb $15, -24(%rsp,%rdi), %xmm0 +; SSE41-NEXT: pinsrb $15, -40(%rsp,%rdi), %xmm0 ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -1082,58 +1084,60 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movzbl (%rdi), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: movzbl 1(%rdi), %eax -; AVX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: movzbl 2(%rdi), %edx -; AVX-NEXT: movzbl 3(%rdi), %esi -; AVX-NEXT: movzbl 4(%rdi), %r8d -; AVX-NEXT: movzbl 5(%rdi), %r9d -; AVX-NEXT: movzbl 6(%rdi), %r10d -; AVX-NEXT: movzbl 7(%rdi), %r11d -; AVX-NEXT: movzbl 8(%rdi), %ebx -; AVX-NEXT: movzbl 9(%rdi), %r14d -; AVX-NEXT: movzbl 10(%rdi), %r15d -; AVX-NEXT: movzbl 11(%rdi), %r12d -; AVX-NEXT: movzbl 12(%rdi), %r13d -; AVX-NEXT: movzbl 13(%rdi), %ebp -; AVX-NEXT: movzbl 14(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %esi +; AVX-NEXT: andl $15, %esi +; AVX-NEXT: movzbl 1(%rdi), %r8d +; AVX-NEXT: movzbl 2(%rdi), %r9d +; AVX-NEXT: movzbl 3(%rdi), %r10d +; AVX-NEXT: movzbl 4(%rdi), %r11d +; AVX-NEXT: 
movzbl 5(%rdi), %ebx +; AVX-NEXT: movzbl 6(%rdi), %r14d +; AVX-NEXT: movzbl 7(%rdi), %r15d +; AVX-NEXT: movzbl 8(%rdi), %r12d +; AVX-NEXT: movzbl 9(%rdi), %r13d +; AVX-NEXT: movzbl 10(%rdi), %ebp +; AVX-NEXT: movzbl 11(%rdi), %eax +; AVX-NEXT: movzbl 12(%rdi), %ecx +; AVX-NEXT: movzbl 13(%rdi), %edx +; AVX-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX-NEXT: movzbl 14(%rdi), %edx +; AVX-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX-NEXT: movzbl 15(%rdi), %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movzbl -24(%rsp,%rcx), %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $1, -24(%rsp,%rcx), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: vpinsrb $3, -24(%rsp,%rsi), %xmm0, %xmm0 +; AVX-NEXT: movzbl -40(%rsp,%rsi), %esi +; AVX-NEXT: vmovd %esi, %xmm0 ; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $1, -40(%rsp,%r8), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r9d -; AVX-NEXT: vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $2, -40(%rsp,%r9), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r10d -; AVX-NEXT: vpinsrb $6, -24(%rsp,%r10), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $3, -40(%rsp,%r10), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r11d -; AVX-NEXT: vpinsrb $7, -24(%rsp,%r11), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $4, -40(%rsp,%r11), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: vpinsrb $8, -24(%rsp,%rbx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $5, -40(%rsp,%rbx), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r14d -; AVX-NEXT: vpinsrb $9, -24(%rsp,%r14), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $6, -40(%rsp,%r14), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r15d -; AVX-NEXT: vpinsrb $10, -24(%rsp,%r15), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, -40(%rsp,%r15), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r12d -; AVX-NEXT: vpinsrb $11, -24(%rsp,%r12), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $8, -40(%rsp,%r12), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r13d -; AVX-NEXT: vpinsrb $12, -24(%rsp,%r13), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, -40(%rsp,%r13), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %ebp -; AVX-NEXT: vpinsrb $13, -24(%rsp,%rbp), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $10, -40(%rsp,%rbp), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $11, -40(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: vpinsrb $12, -40(%rsp,%rcx), %xmm0, %xmm0 +; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $13, -40(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $14, -40(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vpinsrb $15, -24(%rsp,%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, -40(%rsp,%rdi), %xmm0, %xmm0 ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index a94104a002d5ce..13933a37836053 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -502,7 +502,7 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm3 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm8 -; SSE2-SSSE3-NEXT: 
movdqa 32(%rdi), %xmm6 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] @@ -511,8 +511,8 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 @@ -525,7 +525,7 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 @@ -534,32 +534,32 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm8 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm11, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 @@ -568,7 +568,7 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 ; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 ; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 @@ -1609,7 +1609,7 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; ; SSE41-LABEL: trunc_packus_v8i64_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 @@ -1617,10 +1617,10 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 @@ -1629,34 +1629,34 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 ; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: xorpd %xmm5, %xmm5 ; SSE41-NEXT: movapd %xmm1, %xmm6 @@ -2051,103 +2051,103 @@ define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) { define <16 x i16> @trunc_packus_v16i32_v16i16(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-LABEL: trunc_packus_v16i32_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm3 -; SSE2-NEXT: movdqa 32(%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: 
pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm5, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pandn %xmm5, %xmm6 -; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm4, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v16i32_v16i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm1 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm3 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm0 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm5, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm1 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm2 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; 
SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm5, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pandn %xmm5, %xmm6 -; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm6, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm6, %xmm0 ; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm5, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm5, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-NEXT: pshufb %xmm2, %xmm3 ; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v16i32_v16i16: @@ -3041,7 +3041,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; ; SSE41-LABEL: trunc_packus_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 @@ -3049,10 +3049,10 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 @@ -3061,34 +3061,34 @@ define 
<8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 ; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: xorpd %xmm5, %xmm5 ; SSE41-NEXT: movapd %xmm1, %xmm6 @@ -3296,93 +3296,93 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm7, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm2, 
%xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm2 -; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm3 -; SSE2-SSSE3-NEXT: movq %xmm3, (%rsi) +; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm5 +; SSE2-SSSE3-NEXT: movq %xmm5, (%rsi) ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; 
SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 ; SSE41-NEXT: xorpd %xmm5, %xmm5 -; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: movapd %xmm3, %xmm6 ; SSE41-NEXT: xorpd %xmm1, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 @@ -3391,19 +3391,19 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; SSE41-NEXT: movapd %xmm8, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: movapd %xmm8, %xmm3 +; SSE41-NEXT: xorpd %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: packusdw %xmm6, %xmm4 -; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: packusdw %xmm6, %xmm3 +; SSE41-NEXT: movapd %xmm4, %xmm6 ; SSE41-NEXT: xorpd %xmm1, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 @@ -3412,20 +3412,20 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: xorpd %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: xorpd %xmm1, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: packusdw %xmm6, %xmm5 -; SSE41-NEXT: packusdw %xmm5, %xmm4 -; SSE41-NEXT: packuswb %xmm4, %xmm4 -; SSE41-NEXT: movq %xmm4, (%rsi) +; SSE41-NEXT: packusdw %xmm5, %xmm3 +; SSE41-NEXT: packuswb %xmm3, %xmm3 +; SSE41-NEXT: movq %xmm3, (%rsi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v8i64_v8i8_store: @@ -3694,23 +3694,23 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; 
SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm6 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 @@ -3722,9 +3722,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm3 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v16i64_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index 2f3fdeb74dc473..20175f696f7dad 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -484,8 +484,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE2-SSSE3-LABEL: trunc_ssat_v8i64_v8i32: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm3 -; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm5 -; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm7 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm6 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] @@ -494,8 +494,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm8 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 @@ -504,41 +504,41 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, 
%xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm11, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 ; SSE2-SSSE3-NEXT: por %xmm1, %xmm8 @@ -546,8 +546,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] @@ -557,43 +557,43 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm9 ; SSE2-SSSE3-NEXT: por %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm10, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 
-; SSE2-SSSE3-NEXT: por %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm1 ; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i32: @@ -1465,10 +1465,10 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-SSSE3-LABEL: trunc_ssat_v8i64_v8i16: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm6 +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm6 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 @@ -1486,7 +1486,7 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 @@ -1496,31 +1496,31 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd 
{{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm11, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm8 @@ -1528,8 +1528,8 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] @@ -1539,49 +1539,49 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm9 ; SSE2-SSSE3-NEXT: por %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm10, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pxor 
%xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm6, %xmm1 ; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 @@ -1589,10 +1589,10 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 @@ -1601,34 +1601,34 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 ; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; 
SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709518848,18446744073709518848] ; SSE41-NEXT: movapd %xmm1, %xmm7 @@ -2644,10 +2644,10 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-SSSE3-LABEL: trunc_ssat_v8i64_v8i8: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm6 +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm6 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 @@ -2665,7 +2665,7 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 @@ -2675,31 +2675,31 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm11, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: 
pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm8 @@ -2707,8 +2707,8 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] @@ -2718,50 +2718,50 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm9 ; SSE2-SSSE3-NEXT: por %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm10, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: packssdw %xmm5, 
%xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm6, %xmm1 ; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 @@ -2769,10 +2769,10 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 @@ -2781,34 +2781,34 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 ; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488] ; SSE41-NEXT: movapd %xmm1, %xmm7 @@ -2941,10 +2941,10 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-width"="256" { ; SSE2-SSSE3-LABEL: trunc_ssat_v8i64_v8i8_store: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm6 +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm3 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm6 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm2, 
%xmm1 @@ -2962,7 +2962,7 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 @@ -2972,31 +2972,31 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm6, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm11, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm7 @@ -3006,8 +3006,8 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562067840,18446744071562067840] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] @@ -3015,35 +3015,35 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm8 ; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: 
movdqa %xmm6, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm7 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm10, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 ; SSE2-SSSE3-NEXT: packssdw %xmm8, %xmm7 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] @@ -3051,7 +3051,7 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm7 ; SSE2-SSSE3-NEXT: packsswb %xmm7, %xmm7 ; SSE2-SSSE3-NEXT: movq %xmm7, (%rsi) @@ -3059,57 +3059,57 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [127,127] ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: blendvpd 
%xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: movapd %xmm3, %xmm7 ; SSE41-NEXT: xorpd %xmm1, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] ; SSE41-NEXT: movapd %xmm7, %xmm9 @@ -3119,19 +3119,19 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 -; SSE41-NEXT: movapd %xmm8, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm8, %xmm3 +; SSE41-NEXT: xorpd %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: packssdw %xmm7, %xmm4 -; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: packssdw %xmm7, %xmm3 +; SSE41-NEXT: movapd %xmm4, %xmm7 ; SSE41-NEXT: xorpd %xmm1, %xmm7 ; SSE41-NEXT: movapd %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 @@ -3140,19 +3140,19 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE41-NEXT: pand %xmm8, %xmm0 ; 
SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 ; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: packssdw %xmm7, %xmm5 -; SSE41-NEXT: packssdw %xmm5, %xmm4 -; SSE41-NEXT: packsswb %xmm4, %xmm4 -; SSE41-NEXT: movq %xmm4, (%rsi) +; SSE41-NEXT: packssdw %xmm5, %xmm3 +; SSE41-NEXT: packsswb %xmm3, %xmm3 +; SSE41-NEXT: movq %xmm3, (%rsi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v8i64_v8i8_store: @@ -3241,11 +3241,11 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-SSSE3-LABEL: trunc_ssat_v16i64_v16i8: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm8 +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm12 ; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm11 -; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm7 +; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm9 ; SSE2-SSSE3-NEXT: movdqa 64(%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm4 ; SSE2-SSSE3-NEXT: movdqa 96(%rdi), %xmm3 @@ -3256,8 +3256,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483775,2147483775] +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm14 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 @@ -3270,7 +3270,7 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm14 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 @@ -3283,7 +3283,7 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm14 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 @@ -3292,37 +3292,37 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm4 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSE2-SSSE3-NEXT: 
movdqa %xmm8, %xmm14 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm14 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm15, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm12, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm12, %xmm9 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm9[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm14 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm14 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm12 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm12, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm15, %xmm9 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm9 +; SSE2-SSSE3-NEXT: por %xmm12, %xmm9 ; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm12 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm12 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm12, %xmm14 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 @@ -3331,28 +3331,28 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: pand %xmm12, %xmm11 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm12 ; SSE2-SSSE3-NEXT: por %xmm11, %xmm12 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm11 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm11 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm14 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm15, %xmm11 -; SSE2-SSSE3-NEXT: pand %xmm11, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm11, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm11 -; SSE2-SSSE3-NEXT: por %xmm8, %xmm11 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm7, %xmm11 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm8, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] +; 
SSE2-SSSE3-NEXT: por %xmm7, %xmm10 ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm10 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm10 @@ -3360,10 +3360,10 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm13 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm13 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] @@ -3374,8 +3374,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm14 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] @@ -3387,8 +3387,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: movdqa %xmm12, %xmm10 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm11 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm11 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm11, %xmm13 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] @@ -3396,37 +3396,37 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm12 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm10 ; SSE2-SSSE3-NEXT: por %xmm12, %xmm10 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm11 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm11 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm12 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm12, %xmm13 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm13, %xmm11 -; SSE2-SSSE3-NEXT: pand %xmm11, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm11, %xmm9 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm11 -; SSE2-SSSE3-NEXT: por %xmm7, %xmm11 +; SSE2-SSSE3-NEXT: por %xmm9, %xmm11 ; SSE2-SSSE3-NEXT: packssdw %xmm10, %xmm11 ; SSE2-SSSE3-NEXT: packssdw %xmm11, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] +; SSE2-SSSE3-NEXT: 
pcmpeqd %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm11, %xmm9 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm9 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] @@ -3434,14 +3434,14 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm10, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 @@ -3449,8 +3449,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] @@ -4539,10 +4539,10 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SSE2-SSSE3-NEXT: por %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: movd %xmm3, %r8d ; SSE2-SSSE3-NEXT: movw %r8w, 36(%rdi) -; SSE2-SSSE3-NEXT: movd %xmm2, %r11d -; SSE2-SSSE3-NEXT: movw %r11w, 24(%rdi) -; SSE2-SSSE3-NEXT: movd %xmm1, %r14d -; SSE2-SSSE3-NEXT: movw %r14w, 12(%rdi) +; SSE2-SSSE3-NEXT: movd %xmm2, %ebx +; SSE2-SSSE3-NEXT: movw %bx, 24(%rdi) +; SSE2-SSSE3-NEXT: movd %xmm1, %r11d +; SSE2-SSSE3-NEXT: movw %r11w, 12(%rdi) ; SSE2-SSSE3-NEXT: movd %xmm0, %eax ; SSE2-SSSE3-NEXT: movw %ax, (%rdi) ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] @@ -4565,22 +4565,22 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE2-SSSE3-NEXT: movd %xmm2, %r10d ; SSE2-SSSE3-NEXT: movw %r10w, 27(%rdi) -; SSE2-SSSE3-NEXT: shrl $16, %r11d -; SSE2-SSSE3-NEXT: movb %r11b, 26(%rdi) +; SSE2-SSSE3-NEXT: shrl $16, %ebx +; SSE2-SSSE3-NEXT: movb %bl, 26(%rdi) ; SSE2-SSSE3-NEXT: pshufd 
{{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE2-SSSE3-NEXT: movd %xmm2, %r11d -; SSE2-SSSE3-NEXT: movw %r11w, 21(%rdi) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE2-SSSE3-NEXT: movd %xmm2, %ebx -; SSE2-SSSE3-NEXT: movw %bx, 18(%rdi) +; SSE2-SSSE3-NEXT: movw %bx, 21(%rdi) +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE2-SSSE3-NEXT: movd %xmm2, %ebp +; SSE2-SSSE3-NEXT: movw %bp, 18(%rdi) ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %ebp -; SSE2-SSSE3-NEXT: movw %bp, 15(%rdi) -; SSE2-SSSE3-NEXT: shrl $16, %r14d -; SSE2-SSSE3-NEXT: movb %r14b, 14(%rdi) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; SSE2-SSSE3-NEXT: movd %xmm1, %r14d -; SSE2-SSSE3-NEXT: movw %r14w, 9(%rdi) +; SSE2-SSSE3-NEXT: movw %r14w, 15(%rdi) +; SSE2-SSSE3-NEXT: shrl $16, %r11d +; SSE2-SSSE3-NEXT: movb %r11b, 14(%rdi) +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE2-SSSE3-NEXT: movd %xmm1, %r11d +; SSE2-SSSE3-NEXT: movw %r11w, 9(%rdi) ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-SSSE3-NEXT: movd %xmm1, %r15d ; SSE2-SSSE3-NEXT: movw %r15w, 6(%rdi) @@ -4601,14 +4601,14 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SSE2-SSSE3-NEXT: movb %r9b, 32(%rdi) ; SSE2-SSSE3-NEXT: shrl $16, %r10d ; SSE2-SSSE3-NEXT: movb %r10b, 29(%rdi) -; SSE2-SSSE3-NEXT: shrl $16, %r11d -; SSE2-SSSE3-NEXT: movb %r11b, 23(%rdi) ; SSE2-SSSE3-NEXT: shrl $16, %ebx -; SSE2-SSSE3-NEXT: movb %bl, 20(%rdi) +; SSE2-SSSE3-NEXT: movb %bl, 23(%rdi) ; SSE2-SSSE3-NEXT: shrl $16, %ebp -; SSE2-SSSE3-NEXT: movb %bpl, 17(%rdi) +; SSE2-SSSE3-NEXT: movb %bpl, 20(%rdi) ; SSE2-SSSE3-NEXT: shrl $16, %r14d -; SSE2-SSSE3-NEXT: movb %r14b, 11(%rdi) +; SSE2-SSSE3-NEXT: movb %r14b, 17(%rdi) +; SSE2-SSSE3-NEXT: shrl $16, %r11d +; SSE2-SSSE3-NEXT: movb %r11b, 11(%rdi) ; SSE2-SSSE3-NEXT: shrl $16, %r15d ; SSE2-SSSE3-NEXT: movb %r15b, 8(%rdi) ; SSE2-SSSE3-NEXT: shrl $16, %r12d @@ -4702,45 +4702,45 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [8388607,8388607,8388607,8388607] -; AVX1-NEXT: vpminsd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vpminsd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpminsd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4286578688,4286578688,4286578688,4286578688] ; AVX1-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpmaxsd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmaxsd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpextrd $3, %xmm3, %eax +; AVX1-NEXT: vpextrd $3, %xmm2, %eax ; AVX1-NEXT: movw %ax, 45(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 47(%rdi) -; AVX1-NEXT: vpextrd $2, %xmm3, %eax +; AVX1-NEXT: vpextrd $2, %xmm2, %eax ; AVX1-NEXT: movw %ax, 42(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 44(%rdi) -; AVX1-NEXT: vpextrd $1, %xmm3, %eax +; AVX1-NEXT: vpextrd $1, %xmm2, %eax ; AVX1-NEXT: movw %ax, 39(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 41(%rdi) -; AVX1-NEXT: vmovd %xmm3, %eax +; AVX1-NEXT: vmovd %xmm2, %eax ; AVX1-NEXT: movw %ax, 36(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 38(%rdi) -; AVX1-NEXT: vpextrd $3, %xmm2, %eax +; AVX1-NEXT: vpextrd $3, %xmm4, %eax ; AVX1-NEXT: movw %ax, 33(%rdi) ; 
AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 35(%rdi) -; AVX1-NEXT: vpextrd $2, %xmm2, %eax +; AVX1-NEXT: vpextrd $2, %xmm4, %eax ; AVX1-NEXT: movw %ax, 30(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 32(%rdi) -; AVX1-NEXT: vpextrd $1, %xmm2, %eax +; AVX1-NEXT: vpextrd $1, %xmm4, %eax ; AVX1-NEXT: movw %ax, 27(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 29(%rdi) -; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: vmovd %xmm4, %eax ; AVX1-NEXT: movw %ax, 24(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 26(%rdi) @@ -4865,72 +4865,72 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vpextrd $3, %xmm1, %r15d -; AVX512-NEXT: movw %r15w, 45(%rdi) -; AVX512-NEXT: vpextrd $2, %xmm1, %r14d -; AVX512-NEXT: movw %r14w, 42(%rdi) -; AVX512-NEXT: vpextrd $1, %xmm1, %ebp -; AVX512-NEXT: movw %bp, 39(%rdi) +; AVX512-NEXT: vpextrd $3, %xmm1, %r8d +; AVX512-NEXT: movw %r8w, 45(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm1, %r9d +; AVX512-NEXT: movw %r9w, 42(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm1, %r10d +; AVX512-NEXT: movw %r10w, 39(%rdi) ; AVX512-NEXT: vmovd %xmm1, %r11d ; AVX512-NEXT: movw %r11w, 36(%rdi) ; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512-NEXT: vpextrd $3, %xmm1, %ebx ; AVX512-NEXT: movw %bx, 33(%rdi) -; AVX512-NEXT: vpextrd $2, %xmm1, %r10d -; AVX512-NEXT: movw %r10w, 30(%rdi) -; AVX512-NEXT: vpextrd $1, %xmm1, %r9d -; AVX512-NEXT: movw %r9w, 27(%rdi) -; AVX512-NEXT: vmovd %xmm1, %r8d -; AVX512-NEXT: movw %r8w, 24(%rdi) -; AVX512-NEXT: vpextrd $3, %xmm0, %esi -; AVX512-NEXT: movw %si, 9(%rdi) -; AVX512-NEXT: vpextrd $2, %xmm0, %edx -; AVX512-NEXT: movw %dx, 6(%rdi) -; AVX512-NEXT: vpextrd $1, %xmm0, %ecx -; AVX512-NEXT: movw %cx, 3(%rdi) -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: movw %ax, (%rdi) -; AVX512-NEXT: shrl $16, %r15d -; AVX512-NEXT: movb %r15b, 47(%rdi) -; AVX512-NEXT: shrl $16, %r14d -; AVX512-NEXT: movb %r14b, 44(%rdi) -; AVX512-NEXT: shrl $16, %ebp -; AVX512-NEXT: movb %bpl, 41(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm1, %ebp +; AVX512-NEXT: movw %bp, 30(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm1, %r14d +; AVX512-NEXT: movw %r14w, 27(%rdi) +; AVX512-NEXT: vmovd %xmm1, %r15d +; AVX512-NEXT: movw %r15w, 24(%rdi) +; AVX512-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-NEXT: movw %ax, 9(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-NEXT: movw %cx, 6(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm0, %edx +; AVX512-NEXT: movw %dx, 3(%rdi) +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: movw %si, (%rdi) +; AVX512-NEXT: shrl $16, %r8d +; AVX512-NEXT: movb %r8b, 47(%rdi) +; AVX512-NEXT: shrl $16, %r9d +; AVX512-NEXT: movb %r9b, 44(%rdi) +; AVX512-NEXT: shrl $16, %r10d +; AVX512-NEXT: movb %r10b, 41(%rdi) ; AVX512-NEXT: shrl $16, %r11d ; AVX512-NEXT: movb %r11b, 38(%rdi) ; AVX512-NEXT: shrl $16, %ebx ; AVX512-NEXT: movb %bl, 35(%rdi) -; AVX512-NEXT: shrl $16, %r10d -; AVX512-NEXT: movb %r10b, 32(%rdi) -; AVX512-NEXT: shrl $16, %r9d -; AVX512-NEXT: movb %r9b, 29(%rdi) -; AVX512-NEXT: shrl $16, %r8d -; AVX512-NEXT: movb %r8b, 26(%rdi) +; AVX512-NEXT: shrl $16, %ebp +; AVX512-NEXT: movb %bpl, 32(%rdi) +; AVX512-NEXT: shrl $16, %r14d +; AVX512-NEXT: movb %r14b, 29(%rdi) +; AVX512-NEXT: shrl $16, %r15d +; AVX512-NEXT: movb %r15b, 26(%rdi) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpextrd $3, %xmm0, %r11d -; 
AVX512-NEXT: movw %r11w, 21(%rdi) -; AVX512-NEXT: vpextrd $2, %xmm0, %r10d -; AVX512-NEXT: movw %r10w, 18(%rdi) -; AVX512-NEXT: vpextrd $1, %xmm0, %r9d -; AVX512-NEXT: movw %r9w, 15(%rdi) -; AVX512-NEXT: vmovd %xmm0, %r8d -; AVX512-NEXT: movw %r8w, 12(%rdi) -; AVX512-NEXT: shrl $16, %esi -; AVX512-NEXT: movb %sil, 11(%rdi) -; AVX512-NEXT: shrl $16, %edx -; AVX512-NEXT: movb %dl, 8(%rdi) -; AVX512-NEXT: shrl $16, %ecx -; AVX512-NEXT: movb %cl, 5(%rdi) +; AVX512-NEXT: vpextrd $3, %xmm0, %r8d +; AVX512-NEXT: movw %r8w, 21(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm0, %r9d +; AVX512-NEXT: movw %r9w, 18(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm0, %r10d +; AVX512-NEXT: movw %r10w, 15(%rdi) +; AVX512-NEXT: vmovd %xmm0, %r11d +; AVX512-NEXT: movw %r11w, 12(%rdi) ; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: movb %al, 2(%rdi) -; AVX512-NEXT: shrl $16, %r11d -; AVX512-NEXT: movb %r11b, 23(%rdi) -; AVX512-NEXT: shrl $16, %r10d -; AVX512-NEXT: movb %r10b, 20(%rdi) -; AVX512-NEXT: shrl $16, %r9d -; AVX512-NEXT: movb %r9b, 17(%rdi) +; AVX512-NEXT: movb %al, 11(%rdi) +; AVX512-NEXT: shrl $16, %ecx +; AVX512-NEXT: movb %cl, 8(%rdi) +; AVX512-NEXT: shrl $16, %edx +; AVX512-NEXT: movb %dl, 5(%rdi) +; AVX512-NEXT: shrl $16, %esi +; AVX512-NEXT: movb %sil, 2(%rdi) ; AVX512-NEXT: shrl $16, %r8d -; AVX512-NEXT: movb %r8b, 14(%rdi) +; AVX512-NEXT: movb %r8b, 23(%rdi) +; AVX512-NEXT: shrl $16, %r9d +; AVX512-NEXT: movb %r9b, 20(%rdi) +; AVX512-NEXT: shrl $16, %r10d +; AVX512-NEXT: movb %r10b, 17(%rdi) +; AVX512-NEXT: shrl $16, %r11d +; AVX512-NEXT: movb %r11b, 14(%rdi) ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r14 ; AVX512-NEXT: popq %r15 @@ -4947,72 +4947,72 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; SKX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; SKX-NEXT: vpextrd $3, %xmm1, %r15d -; SKX-NEXT: movw %r15w, 45(%rdi) -; SKX-NEXT: vpextrd $2, %xmm1, %r14d -; SKX-NEXT: movw %r14w, 42(%rdi) -; SKX-NEXT: vpextrd $1, %xmm1, %ebp -; SKX-NEXT: movw %bp, 39(%rdi) +; SKX-NEXT: vpextrd $3, %xmm1, %r8d +; SKX-NEXT: movw %r8w, 45(%rdi) +; SKX-NEXT: vpextrd $2, %xmm1, %r9d +; SKX-NEXT: movw %r9w, 42(%rdi) +; SKX-NEXT: vpextrd $1, %xmm1, %r10d +; SKX-NEXT: movw %r10w, 39(%rdi) ; SKX-NEXT: vmovd %xmm1, %r11d ; SKX-NEXT: movw %r11w, 36(%rdi) ; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; SKX-NEXT: vpextrd $3, %xmm1, %ebx ; SKX-NEXT: movw %bx, 33(%rdi) -; SKX-NEXT: vpextrd $2, %xmm1, %r10d -; SKX-NEXT: movw %r10w, 30(%rdi) -; SKX-NEXT: vpextrd $1, %xmm1, %r9d -; SKX-NEXT: movw %r9w, 27(%rdi) -; SKX-NEXT: vmovd %xmm1, %r8d -; SKX-NEXT: vpextrd $3, %xmm0, %edx -; SKX-NEXT: movw %r8w, 24(%rdi) -; SKX-NEXT: movw %dx, 9(%rdi) -; SKX-NEXT: vpextrd $2, %xmm0, %esi -; SKX-NEXT: vpextrd $1, %xmm0, %eax -; SKX-NEXT: movw %si, 6(%rdi) -; SKX-NEXT: movw %ax, 3(%rdi) -; SKX-NEXT: vmovd %xmm0, %ecx -; SKX-NEXT: movw %cx, (%rdi) -; SKX-NEXT: shrl $16, %r15d -; SKX-NEXT: movb %r15b, 47(%rdi) -; SKX-NEXT: shrl $16, %r14d -; SKX-NEXT: movb %r14b, 44(%rdi) -; SKX-NEXT: shrl $16, %ebp -; SKX-NEXT: movb %bpl, 41(%rdi) +; SKX-NEXT: vpextrd $2, %xmm1, %ebp +; SKX-NEXT: movw %bp, 30(%rdi) +; SKX-NEXT: vpextrd $1, %xmm1, %r14d +; SKX-NEXT: movw %r14w, 27(%rdi) +; SKX-NEXT: vmovd %xmm1, %r15d +; SKX-NEXT: vpextrd $3, %xmm0, %eax +; SKX-NEXT: movw %r15w, 24(%rdi) +; SKX-NEXT: movw %ax, 9(%rdi) +; SKX-NEXT: vpextrd $2, %xmm0, %edx +; SKX-NEXT: vpextrd $1, %xmm0, %ecx +; SKX-NEXT: movw %dx, 
6(%rdi) +; SKX-NEXT: movw %cx, 3(%rdi) +; SKX-NEXT: vmovd %xmm0, %esi +; SKX-NEXT: movw %si, (%rdi) +; SKX-NEXT: shrl $16, %r8d +; SKX-NEXT: movb %r8b, 47(%rdi) +; SKX-NEXT: shrl $16, %r9d +; SKX-NEXT: movb %r9b, 44(%rdi) +; SKX-NEXT: shrl $16, %r10d +; SKX-NEXT: movb %r10b, 41(%rdi) ; SKX-NEXT: shrl $16, %r11d ; SKX-NEXT: movb %r11b, 38(%rdi) ; SKX-NEXT: shrl $16, %ebx ; SKX-NEXT: movb %bl, 35(%rdi) -; SKX-NEXT: shrl $16, %r10d -; SKX-NEXT: movb %r10b, 32(%rdi) -; SKX-NEXT: shrl $16, %r9d -; SKX-NEXT: movb %r9b, 29(%rdi) -; SKX-NEXT: shrl $16, %r8d -; SKX-NEXT: movb %r8b, 26(%rdi) +; SKX-NEXT: shrl $16, %ebp +; SKX-NEXT: movb %bpl, 32(%rdi) +; SKX-NEXT: shrl $16, %r14d +; SKX-NEXT: movb %r14b, 29(%rdi) +; SKX-NEXT: shrl $16, %r15d +; SKX-NEXT: movb %r15b, 26(%rdi) ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrd $3, %xmm0, %r11d -; SKX-NEXT: movw %r11w, 21(%rdi) -; SKX-NEXT: vpextrd $2, %xmm0, %r10d -; SKX-NEXT: movw %r10w, 18(%rdi) -; SKX-NEXT: vpextrd $1, %xmm0, %r9d -; SKX-NEXT: movw %r9w, 15(%rdi) -; SKX-NEXT: vmovd %xmm0, %r8d -; SKX-NEXT: movw %r8w, 12(%rdi) -; SKX-NEXT: shrl $16, %edx -; SKX-NEXT: movb %dl, 11(%rdi) -; SKX-NEXT: shrl $16, %esi -; SKX-NEXT: movb %sil, 8(%rdi) +; SKX-NEXT: vpextrd $3, %xmm0, %r8d +; SKX-NEXT: movw %r8w, 21(%rdi) +; SKX-NEXT: vpextrd $2, %xmm0, %r9d +; SKX-NEXT: movw %r9w, 18(%rdi) +; SKX-NEXT: vpextrd $1, %xmm0, %r10d +; SKX-NEXT: movw %r10w, 15(%rdi) +; SKX-NEXT: vmovd %xmm0, %r11d +; SKX-NEXT: movw %r11w, 12(%rdi) ; SKX-NEXT: shrl $16, %eax -; SKX-NEXT: movb %al, 5(%rdi) +; SKX-NEXT: movb %al, 11(%rdi) +; SKX-NEXT: shrl $16, %edx +; SKX-NEXT: movb %dl, 8(%rdi) ; SKX-NEXT: shrl $16, %ecx -; SKX-NEXT: movb %cl, 2(%rdi) -; SKX-NEXT: shrl $16, %r11d -; SKX-NEXT: movb %r11b, 23(%rdi) -; SKX-NEXT: shrl $16, %r10d -; SKX-NEXT: movb %r10b, 20(%rdi) -; SKX-NEXT: shrl $16, %r9d -; SKX-NEXT: movb %r9b, 17(%rdi) +; SKX-NEXT: movb %cl, 5(%rdi) +; SKX-NEXT: shrl $16, %esi +; SKX-NEXT: movb %sil, 2(%rdi) ; SKX-NEXT: shrl $16, %r8d -; SKX-NEXT: movb %r8b, 14(%rdi) +; SKX-NEXT: movb %r8b, 23(%rdi) +; SKX-NEXT: shrl $16, %r9d +; SKX-NEXT: movb %r9b, 20(%rdi) +; SKX-NEXT: shrl $16, %r10d +; SKX-NEXT: movb %r10b, 17(%rdi) +; SKX-NEXT: shrl $16, %r11d +; SKX-NEXT: movb %r11b, 14(%rdi) ; SKX-NEXT: popq %rbx ; SKX-NEXT: popq %r14 ; SKX-NEXT: popq %r15 diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index f40a7e39b98699..a86646307df5f2 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -402,15 +402,15 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; SSE41-LABEL: trunc_usat_v8i64_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm4 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm5 ; SSE41-NEXT: movdqa 32(%rdi), %xmm8 ; SSE41-NEXT: movdqa 48(%rdi), %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] @@ -421,7 +421,7 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr 
%p0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 ; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm7, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -430,21 +430,21 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8 ; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm5, %xmm2 +; SSE41-NEXT: pand %xmm7, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[0,2] @@ -787,8 +787,8 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; ; SSE41-LABEL: trunc_usat_v4i64_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -800,18 +800,18 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: movapd %xmm3, %xmm7 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: pxor %xmm2, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pand %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm7, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm7, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i16: @@ -1030,20 +1030,20 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm6 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, 
%xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm7 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 @@ -1054,10 +1054,10 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 @@ -1065,77 +1065,77 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE2-SSSE3-NEXT: pslld $16, %xmm5 -; SSE2-SSSE3-NEXT: psrad $16, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm4 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[0,2] +; SSE2-SSSE3-NEXT: pslld $16, %xmm4 +; SSE2-SSSE3-NEXT: psrad $16, %xmm4 ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm8 ; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: 
movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 +; SSE41-NEXT: movapd %xmm4, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm4, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 ; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movapd %xmm4, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm7, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: packusdw %xmm8, %xmm4 +; SSE41-NEXT: packusdw %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1496,93 +1496,93 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) { ; SSE2-LABEL: trunc_usat_v16i32_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm3 ; SSE2-NEXT: movdqa 32(%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 +; SSE2-NEXT: movdqa 48(%rdi), %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm2, 
%xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: por %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm4 ; SSE2-NEXT: pxor %xmm6, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pxor %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pslld $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: packssdw %xmm7, %xmm0 +; SSE2-NEXT: pslld $16, %xmm8 +; SSE2-NEXT: psrad $16, %xmm8 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packssdw %xmm8, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v16i32_v16i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm5 +; SSSE3-NEXT: movdqa (%rdi), %xmm2 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm3 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm0 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm8 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm6, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm7, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 ; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm7, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 ; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm8, %xmm0 -; SSSE3-NEXT: pxor %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm8 -; SSSE3-NEXT: pxor %xmm7, %xmm4 -; SSSE3-NEXT: por %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm4 ; SSSE3-NEXT: pxor %xmm6, %xmm8 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm7, %xmm0 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm2, %xmm7 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: por %xmm7, 
%xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm3, %xmm2 -; SSSE3-NEXT: pshufb %xmm3, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm3, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSSE3-NEXT: por %xmm4, %xmm8 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pxor %xmm7, %xmm6 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm7 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSSE3-NEXT: pshufb %xmm2, %xmm8 +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v16i32_v16i16: @@ -1866,8 +1866,8 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; ; SSE41-LABEL: trunc_usat_v4i64_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -1879,19 +1879,19 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: movapd %xmm3, %xmm7 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: pxor %xmm2, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pand %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm7, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm7, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: packuswb %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i8: @@ -2171,52 +2171,52 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm8 ; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 ; 
SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 +; SSE41-NEXT: movapd %xmm4, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm4, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 ; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movapd %xmm4, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm7, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: packusdw %xmm8, %xmm4 +; SSE41-NEXT: packusdw %xmm4, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -2295,13 +2295,13 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; SSE2-SSSE3-LABEL: trunc_usat_v8i64_v8i8_store: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm6 -; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm6 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm0 ; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm7 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] @@ -2310,32 +2310,32 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; 
SSE2-SSSE3-NEXT: pxor %xmm2, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm6 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] @@ -2346,64 +2346,64 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm5 -; SSE2-SSSE3-NEXT: movq %xmm5, (%rsi) +; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm6 +; SSE2-SSSE3-NEXT: movq %xmm6, (%rsi) ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm6 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm8 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm6, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, 
%xmm7, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movapd %xmm3, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: packusdw %xmm9, %xmm7 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: packusdw %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm6, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm7 -; SSE41-NEXT: packuswb %xmm7, %xmm7 -; SSE41-NEXT: movq %xmm7, (%rsi) +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm7, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm8 +; SSE41-NEXT: packuswb %xmm8, %xmm8 +; SSE41-NEXT: movq %xmm8, (%rsi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8_store: @@ -2484,7 +2484,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE2-SSSE3-LABEL: trunc_usat_v16i64_v16i8: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa 96(%rdi), %xmm1 -; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm4 +; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm3 ; SSE2-SSSE3-NEXT: movdqa 64(%rdi), %xmm6 ; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm10 @@ -2492,49 +2492,49 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm8 ; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm11 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm11 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm13 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm12, %xmm13 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm11 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm11 ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm11 ; 
SSE2-SSSE3-NEXT: pand %xmm11, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm11 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm11 ; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm13 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm12, %xmm13 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm10 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm10, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm11, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm10 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm10 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm12 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm10 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm10 ; SSE2-SSSE3-NEXT: pand %xmm12, %xmm10 ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm9 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm10 ; SSE2-SSSE3-NEXT: por %xmm9, %xmm10 ; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm9 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm9 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm12 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm9 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 ; SSE2-SSSE3-NEXT: pand %xmm12, %xmm9 ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm9 @@ -2542,51 +2542,51 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE2-SSSE3-NEXT: packuswb %xmm10, %xmm9 ; SSE2-SSSE3-NEXT: packuswb %xmm9, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm8 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm8 ; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm7 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm7 ; SSE2-SSSE3-NEXT: pand %xmm7, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm7 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 ; SSE2-SSSE3-NEXT: packuswb %xmm8, %xmm7 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 +; 
SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: retq ; @@ -2601,11 +2601,11 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: movdqa 32(%rdi), %xmm10 ; SSE41-NEXT: movdqa 48(%rdi), %xmm11 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm13 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm7, %xmm13 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 ; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] @@ -2615,8 +2615,8 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: movapd %xmm4, %xmm13 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm13 ; SSE41-NEXT: movdqa %xmm12, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -2626,8 +2626,8 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm2 ; SSE41-NEXT: packusdw %xmm13, %xmm2 ; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm12 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm12 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 ; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -2636,8 +2636,8 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: movapd %xmm4, %xmm12 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 ; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm11 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm11 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 ; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -2648,8 +2648,8 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: packusdw %xmm12, %xmm11 ; SSE41-NEXT: packusdw %xmm11, %xmm2 ; 
SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm10 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -2658,8 +2658,8 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: movapd %xmm4, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -2669,8 +2669,8 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 ; SSE41-NEXT: packusdw %xmm10, %xmm9 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -2678,11 +2678,11 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm8 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pxor %xmm3, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm6, %xmm1 +; SSE41-NEXT: pand %xmm7, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 ; SSE41-NEXT: packusdw %xmm8, %xmm4 @@ -3150,42 +3150,42 @@ define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) { ; SSE2-SSSE3-LABEL: trunc_usat_v16i32_v16i8: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm6 +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm1 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm8 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 +; 
SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm8, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm2 -; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm7 +; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v16i32_v16i8: @@ -3250,43 +3250,43 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) { define void @trunc_usat_v16i32_v16i8_store(ptr %p0, ptr %p1) { ; SSE2-SSSE3-LABEL: trunc_usat_v16i32_v16i8_store: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm6 -; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm5 -; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm0 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm4 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: packuswb %xmm8, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm5, (%rsi) +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = 
[2147483648,2147483648,2147483648,2147483648] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: packuswb %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm7 +; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm7 +; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm6, (%rsi) ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v16i32_v16i8_store: @@ -3657,7 +3657,7 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm3 ; SSE2-SSSE3-NEXT: movdqa 96(%rdi), %xmm8 ; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm9 ; SSE2-SSSE3-NEXT: movdqa 64(%rdi), %xmm10 @@ -3666,15 +3666,15 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm11 ; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm11 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm12 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm12 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 ; SSE2-SSSE3-NEXT: pand %xmm12, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm12 ; SSE2-SSSE3-NEXT: por %xmm1, %xmm12 ; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm11 ; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm11 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm10 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 @@ -3682,14 +3682,14 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; SSE2-SSSE3-NEXT: packuswb %xmm12, %xmm1 ; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm10 ; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm11 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm11 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 ; SSE2-SSSE3-NEXT: pand %xmm11, %xmm9 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm11 ; SSE2-SSSE3-NEXT: por %xmm9, %xmm11 ; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm8 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm10 @@ -3698,33 +3698,33 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; SSE2-SSSE3-NEXT: packuswb 
%xmm10, %xmm1 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm8 ; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm9 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm9 ; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm9, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm8 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: packuswb %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: packuswb %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v32i32_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index 3d5947d8e59bd4..9adb9111f02f8d 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -637,22 +637,22 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-SSE-NEXT: paddd %xmm0, %xmm1 ; X86-SSE-NEXT: pandn %xmm1, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: pand %xmm2, %xmm3 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-SSE-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE-NEXT: pshufb %xmm3, %xmm4 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pand %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pshufb %xmm2, %xmm4 ; X86-SSE-NEXT: psrlw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm2, %xmm0 -; X86-SSE-NEXT: pshufb %xmm0, %xmm1 -; X86-SSE-NEXT: paddb %xmm4, %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; X86-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: psadbw %xmm2, %xmm1 -; X86-SSE-NEXT: psadbw %xmm2, %xmm0 -; X86-SSE-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: pshufb %xmm0, %xmm3 +; X86-SSE-NEXT: paddb %xmm4, %xmm3 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero +; X86-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X86-SSE-NEXT: psadbw %xmm1, %xmm3 +; X86-SSE-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE-NEXT: packuswb %xmm3, %xmm0 ; 
X86-SSE-NEXT: retl %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0) ret <4 x i32> %out @@ -878,22 +878,22 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-SSE-NEXT: paddd %xmm0, %xmm1 ; X86-SSE-NEXT: pandn %xmm1, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: pand %xmm2, %xmm3 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-SSE-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE-NEXT: pshufb %xmm3, %xmm4 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pand %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pshufb %xmm2, %xmm4 ; X86-SSE-NEXT: psrlw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm2, %xmm0 -; X86-SSE-NEXT: pshufb %xmm0, %xmm1 -; X86-SSE-NEXT: paddb %xmm4, %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; X86-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: psadbw %xmm2, %xmm1 -; X86-SSE-NEXT: psadbw %xmm2, %xmm0 -; X86-SSE-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: pshufb %xmm0, %xmm3 +; X86-SSE-NEXT: paddb %xmm4, %xmm3 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero +; X86-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X86-SSE-NEXT: psadbw %xmm1, %xmm3 +; X86-SSE-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE-NEXT: packuswb %xmm3, %xmm0 ; X86-SSE-NEXT: retl %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1) ret <4 x i32> %out diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll index a2affbd8728c23..c785db8879d490 100644 --- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll +++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll @@ -39,10 +39,10 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1 ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload -; X86-NEXT: kmovw %k0, %edx +; X86-NEXT: kmovw %k0, %ecx ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload -; X86-NEXT: kmovw %k0, %ecx +; X86-NEXT: kmovw %k0, %edx ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload ; X86-NEXT: kmovw %k0, %edi @@ -50,11 +50,11 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1 ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 # 2-byte Reload ; X86-NEXT: kmovw %k2, %edi -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: kmovw %k1, %edx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl %edx, %eax +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: kmovw %k1, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %edx, %eax ; X86-NEXT: movw %ax, (%esi) ; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll index cb0542ca7cea8b..a83136d9be80c2 100644 --- 
a/llvm/test/CodeGen/X86/vselect-minmax.ll +++ b/llvm/test/CodeGen/X86/vselect-minmax.ll @@ -5029,31 +5029,31 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) { ; ; SSE4-LABEL: test125: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm0, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm10 +; SSE4-NEXT: pxor %xmm9, %xmm10 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE4-NEXT: movdqa %xmm1, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE4-NEXT: movdqa %xmm1, %xmm8 +; SSE4-NEXT: pxor %xmm9, %xmm8 ; SSE4-NEXT: movdqa %xmm5, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm2, %xmm1 -; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: pxor %xmm9, %xmm1 ; SSE4-NEXT: movdqa %xmm6, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm8 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm7, %xmm9 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 @@ -5177,31 +5177,31 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) { ; ; SSE4-LABEL: test126: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm0, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm10 +; SSE4-NEXT: pxor %xmm9, %xmm10 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE4-NEXT: movdqa %xmm1, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE4-NEXT: movdqa %xmm1, %xmm8 +; SSE4-NEXT: pxor %xmm9, %xmm8 ; SSE4-NEXT: movdqa %xmm5, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm2, %xmm1 -; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: pxor %xmm9, %xmm1 ; SSE4-NEXT: movdqa %xmm6, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm8 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm7, %xmm9 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 @@ -5325,30 +5325,30 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) { 
; ; SSE4-LABEL: test127: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm4, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm10 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE4-NEXT: movdqa %xmm5, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE4-NEXT: movdqa %xmm5, %xmm8 +; SSE4-NEXT: pxor %xmm9, %xmm8 ; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm6, %xmm1 -; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: pxor %xmm9, %xmm1 ; SSE4-NEXT: movdqa %xmm2, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm3, %xmm8 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm3, %xmm9 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 @@ -5472,30 +5472,30 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) { ; ; SSE4-LABEL: test128: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm4, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm10 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE4-NEXT: movdqa %xmm5, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE4-NEXT: movdqa %xmm5, %xmm8 +; SSE4-NEXT: pxor %xmm9, %xmm8 ; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm6, %xmm1 -; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: pxor %xmm9, %xmm1 ; SSE4-NEXT: movdqa %xmm2, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm3, %xmm8 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm3, %xmm9 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 @@ -7451,30 +7451,30 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) { ; ; SSE4-LABEL: test156: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = 
[9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm4, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm10 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE4-NEXT: movdqa %xmm5, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE4-NEXT: movdqa %xmm5, %xmm8 +; SSE4-NEXT: pxor %xmm9, %xmm8 ; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm6, %xmm1 -; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: pxor %xmm9, %xmm1 ; SSE4-NEXT: movdqa %xmm2, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm3, %xmm8 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm3, %xmm9 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 @@ -7598,31 +7598,31 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) { ; ; SSE4-LABEL: test159: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm0, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm10 +; SSE4-NEXT: pxor %xmm9, %xmm10 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE4-NEXT: movdqa %xmm1, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE4-NEXT: movdqa %xmm1, %xmm8 +; SSE4-NEXT: pxor %xmm9, %xmm8 ; SSE4-NEXT: movdqa %xmm5, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm2, %xmm1 -; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: pxor %xmm9, %xmm1 ; SSE4-NEXT: movdqa %xmm6, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm8 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm7, %xmm9 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 @@ -7746,31 +7746,31 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) { ; ; SSE4-LABEL: test160: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm0, %xmm10 -; SSE4-NEXT: pxor %xmm8, %xmm10 +; SSE4-NEXT: pxor %xmm9, %xmm10 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pxor %xmm8, 
%xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE4-NEXT: movdqa %xmm1, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE4-NEXT: movdqa %xmm1, %xmm8 +; SSE4-NEXT: pxor %xmm9, %xmm8 ; SSE4-NEXT: movdqa %xmm5, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm2, %xmm1 -; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: pxor %xmm9, %xmm1 ; SSE4-NEXT: movdqa %xmm6, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm8 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm9, %xmm0 +; SSE4-NEXT: pxor %xmm7, %xmm9 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 @@ -10283,44 +10283,44 @@ entry: define <8 x i64> @concat_smin_smax(<4 x i64> %a0, <4 x i64> %a1) { ; SSE2-LABEL: concat_smin_smax: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 ; SSE2-NEXT: pandn %xmm2, %xmm7 ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: por %xmm7, %xmm4 ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: pxor %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm5 ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pandn %xmm1, %xmm8 ; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm6 +; SSE2-NEXT: por 
%xmm6, %xmm2 ; SSE2-NEXT: pand %xmm7, %xmm3 ; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: movdqa %xmm4, %xmm0 diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index d6716d0edff40c..b7ffab81b982bd 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -200,24 +200,24 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi ; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax ; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx -; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: notb %cl -; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: orl %edi, %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al -; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %esi, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %esi, %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %esi, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, 4(%edx) -; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, (%edx) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebx, (%edx) ; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx @@ -229,20 +229,20 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esi), %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esi), %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%ecx), %ecx ; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, %edi ; X86-HAVE-SHLD-NO-BMI2-NEXT: shrl %cl, %edi -; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: xorl %esi, %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: xorl %edx, %edx ; X86-HAVE-SHLD-NO-BMI2-NEXT: testb $32, %cl -; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 4(%eax) -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, 4(%eax) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%eax) ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi ; X86-HAVE-SHLD-NO-BMI2-NEXT: retl @@ 
-282,19 +282,19 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esi), %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %edx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %edx, %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%eax) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, (%eax) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl @@ -332,24 +332,24 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %esi -; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax ; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebx -; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: notb %cl -; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: orl %edi, %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al -; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %esi, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %esi, %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %esi, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, (%edx) -; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, 4(%edx) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebx, 4(%edx) ; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx @@ -463,25 +463,25 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi ; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax ; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx -; 
X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx -; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: notb %cl -; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %edi ; X86-NO-SHLD-NO-BMI2-NEXT: sarl $31, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al -; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %ebx, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %ebx, %esi +; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %edi, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, 4(%edx) -; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, (%edx) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebx, (%edx) ; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx @@ -493,20 +493,20 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esi), %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esi), %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%ecx), %ecx ; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, %edi ; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl %cl, %edi -; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %edx ; X86-HAVE-SHLD-NO-BMI2-NEXT: testb $32, %cl -; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 4(%eax) -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, 4(%eax) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%eax) ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi ; X86-HAVE-SHLD-NO-BMI2-NEXT: retl @@ -517,23 +517,23 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi ; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%edx), %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %dl -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %edx, (%esi), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl +; X86-NO-SHLD-HAVE-BMI2-NEXT: 
shrxl %ecx, (%edx), %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %ebx ; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %bl -; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edi ; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %ebx, %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %edx, %ecx, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: sarl $31, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: testb $32, %dl -; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %esi, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: sarl $31, %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: testb $32, %cl +; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovnel %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovel %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, 4(%eax) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%eax) ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi @@ -546,19 +546,19 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esi), %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %esi, %edi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %edi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %edi, %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, 4(%eax) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, (%eax) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl @@ -1316,24 +1316,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $72, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: movl (%edi), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movl (%edx), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%edi), %ecx +; X86-SSE2-NEXT: movl 4(%edx), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%edi), %esi -; X86-SSE2-NEXT: movl 12(%edi), %ebx -; X86-SSE2-NEXT: movl 16(%edi), %ebp +; X86-SSE2-NEXT: movl 8(%edx), %edi +; X86-SSE2-NEXT: movl 12(%edx), %ebx +; X86-SSE2-NEXT: movl 16(%edx), %ebp ; X86-SSE2-NEXT: movzbl (%eax), %eax -; X86-SSE2-NEXT: movl 20(%edi), %edx -; X86-SSE2-NEXT: movl 
24(%edi), %ecx -; X86-SSE2-NEXT: movl 28(%edi), %edi -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl 20(%edx), %esi +; X86-SSE2-NEXT: movl 24(%edx), %ecx +; X86-SSE2-NEXT: movl 28(%edx), %edx ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -2229,13 +2229,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl 40(%ecx), %ebx ; X86-SSE2-NEXT: movl 52(%ecx), %edi ; X86-SSE2-NEXT: movl 60(%ecx), %esi -; X86-SSE2-NEXT: movl 56(%ecx), %edx +; X86-SSE2-NEXT: movl 56(%ecx), %ecx ; X86-SSE2-NEXT: negl %eax -; X86-SSE2-NEXT: movl 152(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 152(%esp,%eax), %edx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %edx, 56(%eax) +; X86-SSE2-NEXT: movl %ecx, 56(%eax) ; X86-SSE2-NEXT: movl %esi, 60(%eax) -; X86-SSE2-NEXT: movl %ecx, 48(%eax) +; X86-SSE2-NEXT: movl %edx, 48(%eax) ; X86-SSE2-NEXT: movl %edi, 52(%eax) ; X86-SSE2-NEXT: movl %ebx, 40(%eax) ; X86-SSE2-NEXT: movl %ebp, 44(%eax) @@ -2367,25 +2367,24 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: ashr_64bytes: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pushq %r14 ; X64-SSE2-NEXT: pushq %rbx -; X64-SSE2-NEXT: movq (%rdi), %rcx -; X64-SSE2-NEXT: movq 8(%rdi), %r8 -; X64-SSE2-NEXT: movq 16(%rdi), %r9 -; X64-SSE2-NEXT: movq 24(%rdi), %r10 -; X64-SSE2-NEXT: movq 32(%rdi), %r11 -; X64-SSE2-NEXT: movq 40(%rdi), %rbx -; X64-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-SSE2-NEXT: movq (%rdi), %rax +; X64-SSE2-NEXT: movq 8(%rdi), %rcx +; X64-SSE2-NEXT: movq 16(%rdi), %r8 +; X64-SSE2-NEXT: movq 24(%rdi), %r9 +; X64-SSE2-NEXT: movq 32(%rdi), %r10 +; X64-SSE2-NEXT: movq 40(%rdi), %r11 +; X64-SSE2-NEXT: movq 48(%rdi), %rbx ; X64-SSE2-NEXT: movq 56(%rdi), %rdi -; X64-SSE2-NEXT: movl (%rsi), %eax +; X64-SSE2-NEXT: movl (%rsi), %esi ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: sarq $63, %rdi ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -2395,25 +2394,24 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: andl $63, %eax -; X64-SSE2-NEXT: movq -128(%rsp,%rax), %rcx -; X64-SSE2-NEXT: movq -120(%rsp,%rax), %rsi -; X64-SSE2-NEXT: movq -104(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -112(%rsp,%rax), %r8 -; X64-SSE2-NEXT: movq -88(%rsp,%rax), %r9 -; X64-SSE2-NEXT: movq -96(%rsp,%rax), %r10 -; X64-SSE2-NEXT: movq -72(%rsp,%rax), %r11 -; X64-SSE2-NEXT: 
movq -80(%rsp,%rax), %rax -; X64-SSE2-NEXT: movq %rax, 48(%rdx) +; X64-SSE2-NEXT: andl $63, %esi +; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax +; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx +; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi +; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8 +; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9 +; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10 +; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11 +; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: movq %rsi, 48(%rdx) ; X64-SSE2-NEXT: movq %r11, 56(%rdx) ; X64-SSE2-NEXT: movq %r10, 32(%rdx) ; X64-SSE2-NEXT: movq %r9, 40(%rdx) ; X64-SSE2-NEXT: movq %r8, 16(%rdx) ; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rcx, (%rdx) -; X64-SSE2-NEXT: movq %rsi, 8(%rdx) +; X64-SSE2-NEXT: movq %rax, (%rdx) +; X64-SSE2-NEXT: movq %rcx, 8(%rdx) ; X64-SSE2-NEXT: popq %rbx -; X64-SSE2-NEXT: popq %r14 ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: ashr_64bytes: @@ -2772,5 +2770,5 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7: {{.*}} ; FALLBACK8: {{.*}} ; FALLBACK9: {{.*}} -; X86: {{.*}} ; X64: {{.*}} +; X86: {{.*}} diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 24475360cbbc46..723b82b98c073a 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -159,23 +159,23 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al -; X86-NO-BMI2-NO-SHLD-NEXT: cmovnel %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: cmovnel %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %esi, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -187,19 +187,19 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: xorl %esi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorl %edx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovnel %edi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovnel %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl @@ -238,18 +238,18 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorl %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl @@ -284,23 +284,23 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al -; X86-NO-BMI2-NO-SHLD-NEXT: cmovnel %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: cmovnel %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %esi, 
%ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -408,24 +408,24 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al -; X86-NO-BMI2-NO-SHLD-NEXT: cmovnel %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: cmovnel %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %edi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -437,19 +437,19 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovnel %edi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovnel %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl @@ -460,22 +460,22 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovnel %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovnel %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -488,18 +488,18 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl @@ -658,15 +658,15 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -675,28 +675,28 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -788,26 +788,26 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ebx, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -905,9 +905,9 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ah +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -915,12 +915,12 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %al ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ah +; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %ah +; X86-NO-BMI2-NO-SHLD-NEXT: negb %ah +; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %ah, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl 
24(%esp,%ebp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %esi @@ -974,46 +974,46 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %ah, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah +; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah +; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %ah +; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %ah, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx) +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1033,45 +1033,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %bl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 28(%esp,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1106,27 +1106,27 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ebx, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1224,22 +1224,22 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ah +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %al ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ah +; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %ah +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ah, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -1289,46 +1289,46 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %ah, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah +; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: 
movl %esi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -1349,22 +1349,22 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp @@ -1422,26 +1422,26 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ebx, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1475,38 +1475,38 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rdi), %r11 +; 
X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; @@ -1642,28 +1642,28 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1672,80 +1672,79 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl ; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 
%edx ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ecx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) @@ -1756,7 +1755,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl 
%ecx, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) @@ -1775,24 +1774,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1833,15 +1832,15 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
%eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1849,14 +1848,14 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1882,30 +1881,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1914,68 +1913,65 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 20(%esp,%edi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 20(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %ecx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 24(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%edi) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1991,28 +1987,28 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2031,11 +2027,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi @@ -2047,23 +2043,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx @@ -2072,7 +2068,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -2107,40 +2103,40 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil 
; X64-NO-BMI2-NO-SHLD-NEXT: negb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %sil, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r10), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r10), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %sil, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r10), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rdi), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r10), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rdi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; @@ -2205,29 +2201,29 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: 
movq -16(%rsp,%rcx), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes: @@ -2279,26 +2275,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -2315,81 +2311,79 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; 
X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah ; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: 
shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) @@ -2399,7 +2393,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -2412,30 +2406,30 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $88, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2449,55 +2443,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %al, %ah +; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %ah +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%esi) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%esi) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%esi) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi @@ -2508,7 +2503,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx) -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $88, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2523,30 +2518,30 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2557,61 +2552,57 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, (%esp), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 84(%esp,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl 
%edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 84(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload @@ -2633,26 +2624,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $84, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -2670,58 +2661,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %ebx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%esi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $84, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2754,38 +2743,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rdi), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, 
%rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; @@ -2923,112 +2912,113 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 
{{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al +; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl ; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx ; 
X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ecx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) @@ -3039,7 +3029,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 
12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) @@ -3057,38 +3047,40 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl @@ -3117,15 +3109,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -3133,14 +3125,14 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3165,104 +3157,99 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl 
$84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ecx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), 
%eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 20(%esp,%edi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 20(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 24(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -3278,37 +3265,37 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al @@ -3319,11 +3306,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi @@ -3335,23 +3322,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx @@ -3360,7 +3347,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -3435,14 +3422,14 @@ define void 
@lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 @@ -3450,33 +3437,33 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r13,%r13), %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 40(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) @@ -3536,31 +3523,31 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10 
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r15,%r15), %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r14, %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp @@ -3569,12 +3556,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12 @@ -3621,55 +3608,55 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, 
-128(%rsp,%rax), %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r13d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r13d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r13d ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -3716,42 +3703,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r12, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r10, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r15, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rdi,%rdi), %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r12, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r13, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r13, %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r13, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed 
$rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 @@ -3838,18 +3825,18 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx @@ -3981,26 +3968,25 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx -; 
X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4189,11 +4175,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -4210,11 +4196,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -4230,7 +4216,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4277,7 +4263,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi @@ -4292,7 +4278,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -4334,20 +4320,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx @@ -4356,16 +4342,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx @@ -4374,7 +4360,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx @@ -4383,74 +4369,77 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl 
%edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 40(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 32(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) @@ -4607,16 +4596,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %ecx @@ -4633,13 +4622,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, 
%edi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4722,18 +4711,18 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi ; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r14), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -4785,15 +4774,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 48(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 56(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 @@ -4856,18 +4845,18 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r10), %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r10), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r10), %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq 
%cl, %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r10), %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r10), %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r15, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r10), %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r10), %r11 @@ -4878,16 +4867,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r14, %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 40(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, 24(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) @@ -4940,56 +4929,56 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r8d -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r11), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r11), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r11), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r11), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %bpl -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r10, %r10 -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r8d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r8d -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r11), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rbx, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%r11), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r11), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %r11, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r13, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -5038,45 +5027,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %rax 
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r8, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %rbx, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r12, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r11, %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %r12, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %r13, %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %rbp, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r14, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r14, %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 @@ -5094,9 +5083,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax @@ -5143,10 +5132,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax @@ -5171,163 +5160,158 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch ; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, (%esp) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; 
X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; 
X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%ecx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: negl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%edx), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl 
%cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx @@ -5384,9 +5368,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx @@ -5417,8 +5401,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -5429,6 +5411,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi @@ -5520,9 +5504,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp @@ -5545,16 +5528,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind 
{ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 52(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5566,10 +5549,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 48(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5593,47 +5576,47 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $216, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $220, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5649,13 +5632,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5673,137 +5655,137 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %esi 
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 216(%esp,%ebx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, (%esp), %ebx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 212(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 60(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 36(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 28(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
%eax, 20(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) @@ -5821,7 +5803,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $216, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $220, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -5834,7 +5816,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -5843,11 +5825,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %eax @@ -5855,9 +5836,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx @@ -5868,10 +5850,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5882,16 +5861,19 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -5914,114 +5896,111 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, 
%esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 184(%esp,%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 60(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 44(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -6032,7 +6011,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -6105,14 +6084,14 @@ define void 
@ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 @@ -6120,33 +6099,33 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r13,%r13), %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 40(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) @@ -6207,31 +6186,31 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10 
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r15,%r15), %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r14, %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp @@ -6240,12 +6219,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12 @@ -6289,59 +6268,59 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rax), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %rsi, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %r13d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r13d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r13d +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r9, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rdi, %rax, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r15, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq 
%rbp, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -6389,42 +6368,42 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r12, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r10, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r15, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rdi,%rdi), %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r12, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r13, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq 
%rcx, %r13, %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r13, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 @@ -6512,18 +6491,18 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx @@ -6655,26 +6634,25 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -6864,11 +6842,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -6885,11 +6863,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -6905,7 +6883,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%edx) ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -6932,201 +6910,202 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ecx), %ebp 
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ecx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ecx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 76(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 
76(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 60(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 40(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 32(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) @@ -7159,7 +7138,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -7236,89 +7215,91 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%edx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 40(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 32(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%edx) @@ -7346,7 +7327,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -7360,9 +7341,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} -; X86: {{.*}} -; X86-NO-SHLD: {{.*}} -; X86-SHLD: {{.*}} ; X64: {{.*}} ; X64-NO-SHLD: {{.*}} ; X64-SHLD: {{.*}} +; X86: {{.*}} +; X86-NO-SHLD: {{.*}} +; X86-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 691ca40191d4ba..327d51a927dfc8 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -28,14 +28,14 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64 ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzbl (%eax), %eax +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzbl (%edx), %edx ; X86-NO-BMI2-NEXT: shll $3, %ecx ; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: @@ -81,14 +81,14 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzwl (%eax), %eax +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzwl (%edx), %edx ; X86-NO-BMI2-NEXT: shll $3, %ecx ; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: @@ -219,21 +219,21 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 ; ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: -; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, 
%ebx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half: diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 8c7535f6169138..9b032c90513e4f 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -30,14 +30,14 @@ define void @load_1byte_chunk_of_2byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzwl (%eax), %eax +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzwl (%edx), %edx ; X86-NO-BMI2-NEXT: shll $3, %ecx ; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca: @@ -83,14 +83,14 @@ define void @load_1byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movl (%eax), %eax +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movl (%edx), %edx ; X86-NO-BMI2-NEXT: shll $3, %ecx ; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca: @@ -213,21 +213,21 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: -; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: 
cmovel %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca: @@ -316,20 +316,20 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al -; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movw %si, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movw %di, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -437,20 +437,20 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al -; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll index da468b6d809e89..19e0f5d5dc6ddc 100644 --- a/llvm/test/CodeGen/X86/widen_bitcnt.ll +++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll @@ -101,36 +101,36 @@ define <4 x i32> @widen_ctpop_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { define <8 x i32> @widen_ctpop_v4i32_v8i32(<4 x i32> %a0, <4 x i32> 
%a1) { ; SSE42-LABEL: widen_ctpop_v4i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm2, %xmm5 -; SSE42-NEXT: pshufb %xmm4, %xmm5 +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pand %xmm2, %xmm3 +; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE42-NEXT: movdqa %xmm4, %xmm5 +; SSE42-NEXT: pshufb %xmm3, %xmm5 ; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm3, %xmm0 -; SSE42-NEXT: movdqa %xmm2, %xmm4 -; SSE42-NEXT: pshufb %xmm0, %xmm4 -; SSE42-NEXT: paddb %xmm5, %xmm4 +; SSE42-NEXT: pand %xmm2, %xmm0 +; SSE42-NEXT: movdqa %xmm4, %xmm3 +; SSE42-NEXT: pshufb %xmm0, %xmm3 +; SSE42-NEXT: paddb %xmm5, %xmm3 ; SSE42-NEXT: pxor %xmm5, %xmm5 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE42-NEXT: psadbw %xmm5, %xmm4 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE42-NEXT: psadbw %xmm5, %xmm3 ; SSE42-NEXT: psadbw %xmm5, %xmm0 -; SSE42-NEXT: packuswb %xmm4, %xmm0 -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pshufb %xmm4, %xmm6 +; SSE42-NEXT: packuswb %xmm3, %xmm0 +; SSE42-NEXT: movdqa %xmm1, %xmm3 +; SSE42-NEXT: pand %xmm2, %xmm3 +; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pshufb %xmm3, %xmm6 ; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm3, %xmm1 -; SSE42-NEXT: pshufb %xmm1, %xmm2 -; SSE42-NEXT: paddb %xmm6, %xmm2 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE42-NEXT: psadbw %xmm5, %xmm2 +; SSE42-NEXT: pand %xmm2, %xmm1 +; SSE42-NEXT: pshufb %xmm1, %xmm4 +; SSE42-NEXT: paddb %xmm6, %xmm4 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE42-NEXT: psadbw %xmm5, %xmm4 ; SSE42-NEXT: psadbw %xmm5, %xmm1 -; SSE42-NEXT: packuswb %xmm2, %xmm1 +; SSE42-NEXT: packuswb %xmm4, %xmm1 ; SSE42-NEXT: retq ; ; AVX2-LABEL: widen_ctpop_v4i32_v8i32: @@ -190,52 +190,52 @@ define <8 x i32> @widen_ctpop_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) { ; SSE42-LABEL: widen_ctpop_v2i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm4, %xmm7 +; SSE42-NEXT: pand %xmm4, %xmm6 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE42-NEXT: movdqa %xmm5, %xmm7 ; SSE42-NEXT: pshufb %xmm6, %xmm7 ; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pand %xmm4, %xmm0 +; SSE42-NEXT: movdqa %xmm5, %xmm6 ; SSE42-NEXT: pshufb %xmm0, %xmm6 ; SSE42-NEXT: paddb %xmm7, %xmm6 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = 
xmm6[0],zero,xmm6[1],zero ; SSE42-NEXT: pxor %xmm6, %xmm6 ; SSE42-NEXT: psadbw %xmm6, %xmm0 ; SSE42-NEXT: movdqa %xmm1, %xmm7 -; SSE42-NEXT: pand %xmm5, %xmm7 -; SSE42-NEXT: movdqa %xmm4, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm7 +; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm7, %xmm8 ; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: movdqa %xmm4, %xmm7 +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: movdqa %xmm5, %xmm7 ; SSE42-NEXT: pshufb %xmm1, %xmm7 ; SSE42-NEXT: paddb %xmm8, %xmm7 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm7[0],zero,xmm7[1],zero ; SSE42-NEXT: psadbw %xmm6, %xmm1 ; SSE42-NEXT: packuswb %xmm1, %xmm0 ; SSE42-NEXT: movdqa %xmm2, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: movdqa %xmm4, %xmm7 +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: movdqa %xmm5, %xmm7 ; SSE42-NEXT: pshufb %xmm1, %xmm7 ; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm4, %xmm1 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: movdqa %xmm5, %xmm1 ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: paddb %xmm7, %xmm1 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; SSE42-NEXT: psadbw %xmm6, %xmm1 ; SSE42-NEXT: movdqa %xmm3, %xmm2 -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm4, %xmm7 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: movdqa %xmm5, %xmm7 ; SSE42-NEXT: pshufb %xmm2, %xmm7 ; SSE42-NEXT: psrlw $4, %xmm3 -; SSE42-NEXT: pand %xmm5, %xmm3 -; SSE42-NEXT: pshufb %xmm3, %xmm4 -; SSE42-NEXT: paddb %xmm7, %xmm4 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero +; SSE42-NEXT: pand %xmm4, %xmm3 +; SSE42-NEXT: pshufb %xmm3, %xmm5 +; SSE42-NEXT: paddb %xmm7, %xmm5 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero ; SSE42-NEXT: psadbw %xmm6, %xmm2 ; SSE42-NEXT: packuswb %xmm2, %xmm1 ; SSE42-NEXT: retq @@ -347,51 +347,51 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32 define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_v2i32_v4i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE42-NEXT: movdqa %xmm3, %xmm6 -; SSE42-NEXT: pshufb %xmm0, %xmm6 -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm5 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: pand %xmm4, %xmm5 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: movdqa %xmm3, %xmm7 -; SSE42-NEXT: pshufb %xmm5, %xmm7 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm5 -; SSE42-NEXT: pand %xmm6, %xmm5 -; SSE42-NEXT: paddb %xmm7, %xmm5 -; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm6 -; SSE42-NEXT: psrlw $8, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: psrlw $8, %xmm5 -; SSE42-NEXT: paddw %xmm6, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE42-NEXT: psrld $16, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; SSE42-NEXT: psrld $16, %xmm5 -; SSE42-NEXT: paddd %xmm5, %xmm0 -; SSE42-NEXT: movdqa %xmm3, %xmm6 -; SSE42-NEXT: pshufb %xmm1, %xmm6 -; SSE42-NEXT: movdqa %xmm1, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm5 -; SSE42-NEXT: pand %xmm4, %xmm5 -; SSE42-NEXT: pshufb %xmm5, %xmm3 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm5 -; SSE42-NEXT: pand %xmm6, %xmm5 -; SSE42-NEXT: paddb %xmm3, %xmm5 -; SSE42-NEXT: movdqa %xmm1, %xmm3 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pshufb %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; 
SSE42-NEXT: psrlw $4, %xmm4 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE42-NEXT: pand %xmm5, %xmm4 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: movdqa %xmm2, %xmm7 +; SSE42-NEXT: pshufb %xmm4, %xmm7 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm7, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm3 ; SSE42-NEXT: psrlw $8, %xmm3 -; SSE42-NEXT: pand %xmm5, %xmm3 -; SSE42-NEXT: psrlw $8, %xmm5 -; SSE42-NEXT: paddw %xmm3, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE42-NEXT: pand %xmm4, %xmm3 +; SSE42-NEXT: psrlw $8, %xmm4 +; SSE42-NEXT: paddw %xmm3, %xmm4 +; SSE42-NEXT: pcmpeqw %xmm6, %xmm0 +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pand %xmm4, %xmm0 +; SSE42-NEXT: psrld $16, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pshufb %xmm1, %xmm3 +; SSE42-NEXT: movdqa %xmm1, %xmm4 +; SSE42-NEXT: psrlw $4, %xmm4 +; SSE42-NEXT: pand %xmm5, %xmm4 +; SSE42-NEXT: pshufb %xmm4, %xmm2 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm2, %xmm4 +; SSE42-NEXT: movdqa %xmm1, %xmm2 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm2 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm4 +; SSE42-NEXT: paddw %xmm2, %xmm4 +; SSE42-NEXT: pcmpeqw %xmm6, %xmm1 ; SSE42-NEXT: psrld $16, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: psrld $16, %xmm5 -; SSE42-NEXT: paddd %xmm1, %xmm5 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: psrld $16, %xmm4 +; SSE42-NEXT: paddd %xmm1, %xmm4 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE42-NEXT: retq ; ; AVX2-LABEL: widen_ctlz_v2i32_v4i32: @@ -451,50 +451,50 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_v4i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE42-NEXT: movdqa %xmm3, %xmm6 -; SSE42-NEXT: pshufb %xmm0, %xmm6 -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm5 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: pand %xmm4, %xmm5 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: movdqa %xmm3, %xmm7 -; SSE42-NEXT: pshufb %xmm5, %xmm7 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm5 -; SSE42-NEXT: pand %xmm6, %xmm5 -; SSE42-NEXT: paddb %xmm7, %xmm5 -; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm6 -; SSE42-NEXT: psrlw $8, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: psrlw $8, %xmm5 -; SSE42-NEXT: paddw %xmm6, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE42-NEXT: psrld $16, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; SSE42-NEXT: psrld $16, %xmm5 -; SSE42-NEXT: paddd %xmm5, %xmm0 -; SSE42-NEXT: movdqa %xmm3, %xmm5 -; SSE42-NEXT: pshufb %xmm1, %xmm5 -; SSE42-NEXT: movdqa %xmm1, %xmm6 -; SSE42-NEXT: psrlw $4, %xmm6 -; SSE42-NEXT: pand %xmm4, %xmm6 -; SSE42-NEXT: pshufb %xmm6, %xmm3 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: paddb %xmm3, %xmm6 -; SSE42-NEXT: movdqa %xmm1, %xmm3 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pshufb %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: psrlw $4, %xmm4 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE42-NEXT: pand %xmm5, %xmm4 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: movdqa %xmm2, %xmm7 +; SSE42-NEXT: pshufb %xmm4, %xmm7 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm7, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm3 ; SSE42-NEXT: psrlw $8, %xmm3 -; SSE42-NEXT: pand %xmm6, %xmm3 -; SSE42-NEXT: psrlw $8, %xmm6 -; SSE42-NEXT: paddw %xmm3, %xmm6 -; SSE42-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE42-NEXT: pand %xmm4, %xmm3 +; SSE42-NEXT: psrlw $8, %xmm4 +; SSE42-NEXT: paddw %xmm3, %xmm4 +; SSE42-NEXT: pcmpeqw %xmm6, %xmm0 +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pand %xmm4, %xmm0 +; SSE42-NEXT: psrld $16, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pshufb %xmm1, %xmm3 +; SSE42-NEXT: movdqa %xmm1, %xmm4 +; SSE42-NEXT: psrlw $4, %xmm4 +; SSE42-NEXT: pand %xmm5, %xmm4 +; SSE42-NEXT: pshufb %xmm4, %xmm2 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm2, %xmm4 +; SSE42-NEXT: movdqa %xmm1, %xmm2 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm2 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm4 +; SSE42-NEXT: paddw %xmm2, %xmm4 +; SSE42-NEXT: pcmpeqw %xmm6, %xmm1 ; SSE42-NEXT: psrld $16, %xmm1 -; SSE42-NEXT: pand %xmm6, %xmm1 -; SSE42-NEXT: psrld $16, %xmm6 -; SSE42-NEXT: paddd %xmm6, %xmm1 +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: psrld $16, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm1 ; SSE42-NEXT: retq ; ; AVX2-LABEL: widen_ctlz_v4i32_v8i32: @@ -539,51 +539,51 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; SSE42-LABEL: widen_ctlz_v2i32_v8i32: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE42-NEXT: movdqa %xmm5, %xmm8 -; SSE42-NEXT: pshufb %xmm0, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm7 +; SSE42-NEXT: movdqa %xmm5, %xmm7 +; SSE42-NEXT: pshufb %xmm0, %xmm7 +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: psrlw $4, %xmm8 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: pand %xmm6, %xmm7 +; SSE42-NEXT: pand %xmm6, %xmm8 ; SSE42-NEXT: pxor %xmm4, %xmm4 ; SSE42-NEXT: movdqa %xmm5, %xmm9 -; SSE42-NEXT: pshufb %xmm7, %xmm9 -; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 -; SSE42-NEXT: pand %xmm8, %xmm7 -; SSE42-NEXT: paddb %xmm9, %xmm7 -; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: pshufb %xmm8, %xmm9 ; SSE42-NEXT: pcmpeqb %xmm4, %xmm8 -; SSE42-NEXT: psrlw $8, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 +; SSE42-NEXT: paddb %xmm9, %xmm8 +; SSE42-NEXT: movdqa %xmm0, %xmm7 +; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 ; SSE42-NEXT: psrlw $8, %xmm7 -; SSE42-NEXT: paddw %xmm8, %xmm7 +; SSE42-NEXT: pand %xmm8, %xmm7 +; SSE42-NEXT: psrlw $8, %xmm8 +; SSE42-NEXT: paddw %xmm7, %xmm8 ; SSE42-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE42-NEXT: psrld $16, %xmm0 -; SSE42-NEXT: pand %xmm7, %xmm0 -; SSE42-NEXT: psrld $16, %xmm7 -; SSE42-NEXT: paddd %xmm7, %xmm0 -; SSE42-NEXT: movdqa %xmm5, %xmm8 -; SSE42-NEXT: pshufb %xmm1, %xmm8 -; SSE42-NEXT: movdqa %xmm1, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm7 -; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: movdqa %xmm5, %xmm9 -; SSE42-NEXT: pshufb %xmm7, %xmm9 -; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 -; SSE42-NEXT: pand %xmm8, %xmm7 -; SSE42-NEXT: paddb %xmm9, %xmm7 +; SSE42-NEXT: pand %xmm8, %xmm0 +; SSE42-NEXT: psrld $16, %xmm8 +; SSE42-NEXT: paddd %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm5, 
%xmm7 +; SSE42-NEXT: pshufb %xmm1, %xmm7 ; SSE42-NEXT: movdqa %xmm1, %xmm8 +; SSE42-NEXT: psrlw $4, %xmm8 +; SSE42-NEXT: pand %xmm6, %xmm8 +; SSE42-NEXT: movdqa %xmm5, %xmm9 +; SSE42-NEXT: pshufb %xmm8, %xmm9 ; SSE42-NEXT: pcmpeqb %xmm4, %xmm8 -; SSE42-NEXT: psrlw $8, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 +; SSE42-NEXT: paddb %xmm9, %xmm8 +; SSE42-NEXT: movdqa %xmm1, %xmm7 +; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 ; SSE42-NEXT: psrlw $8, %xmm7 -; SSE42-NEXT: paddw %xmm8, %xmm7 +; SSE42-NEXT: pand %xmm8, %xmm7 +; SSE42-NEXT: psrlw $8, %xmm8 +; SSE42-NEXT: paddw %xmm7, %xmm8 ; SSE42-NEXT: pcmpeqw %xmm4, %xmm1 ; SSE42-NEXT: psrld $16, %xmm1 -; SSE42-NEXT: pand %xmm7, %xmm1 -; SSE42-NEXT: psrld $16, %xmm7 -; SSE42-NEXT: paddd %xmm1, %xmm7 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE42-NEXT: pand %xmm8, %xmm1 +; SSE42-NEXT: psrld $16, %xmm8 +; SSE42-NEXT: paddd %xmm1, %xmm8 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSE42-NEXT: movdqa %xmm5, %xmm7 ; SSE42-NEXT: pshufb %xmm2, %xmm7 ; SSE42-NEXT: movdqa %xmm2, %xmm1 @@ -605,27 +605,27 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; SSE42-NEXT: pand %xmm1, %xmm2 ; SSE42-NEXT: psrld $16, %xmm1 ; SSE42-NEXT: paddd %xmm2, %xmm1 -; SSE42-NEXT: movdqa %xmm5, %xmm7 -; SSE42-NEXT: pshufb %xmm3, %xmm7 +; SSE42-NEXT: movdqa %xmm5, %xmm2 +; SSE42-NEXT: pshufb %xmm3, %xmm2 +; SSE42-NEXT: movdqa %xmm3, %xmm7 +; SSE42-NEXT: psrlw $4, %xmm7 +; SSE42-NEXT: pand %xmm6, %xmm7 +; SSE42-NEXT: pshufb %xmm7, %xmm5 +; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 +; SSE42-NEXT: pand %xmm2, %xmm7 +; SSE42-NEXT: paddb %xmm5, %xmm7 ; SSE42-NEXT: movdqa %xmm3, %xmm2 -; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm6, %xmm2 -; SSE42-NEXT: pshufb %xmm2, %xmm5 ; SSE42-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE42-NEXT: pand %xmm7, %xmm2 -; SSE42-NEXT: paddb %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm3, %xmm5 -; SSE42-NEXT: pcmpeqb %xmm4, %xmm5 -; SSE42-NEXT: psrlw $8, %xmm5 -; SSE42-NEXT: pand %xmm2, %xmm5 ; SSE42-NEXT: psrlw $8, %xmm2 -; SSE42-NEXT: paddw %xmm5, %xmm2 +; SSE42-NEXT: pand %xmm7, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm7 +; SSE42-NEXT: paddw %xmm2, %xmm7 ; SSE42-NEXT: pcmpeqw %xmm4, %xmm3 ; SSE42-NEXT: psrld $16, %xmm3 -; SSE42-NEXT: pand %xmm2, %xmm3 -; SSE42-NEXT: psrld $16, %xmm2 -; SSE42-NEXT: paddd %xmm3, %xmm2 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE42-NEXT: pand %xmm7, %xmm3 +; SSE42-NEXT: psrld $16, %xmm7 +; SSE42-NEXT: paddd %xmm3, %xmm7 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE42-NEXT: retq ; ; AVX2-LABEL: widen_ctlz_v2i32_v8i32: @@ -728,51 +728,51 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_undef_v2i32_v4i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE42-NEXT: movdqa %xmm3, %xmm6 -; SSE42-NEXT: pshufb %xmm0, %xmm6 -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm5 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: pand %xmm4, %xmm5 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: movdqa %xmm3, %xmm7 -; SSE42-NEXT: pshufb %xmm5, %xmm7 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm5 -; SSE42-NEXT: pand %xmm6, %xmm5 -; SSE42-NEXT: paddb %xmm7, %xmm5 -; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm6 -; SSE42-NEXT: psrlw $8, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: psrlw $8, %xmm5 -; 
SSE42-NEXT: paddw %xmm6, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE42-NEXT: psrld $16, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; SSE42-NEXT: psrld $16, %xmm5 -; SSE42-NEXT: paddd %xmm5, %xmm0 -; SSE42-NEXT: movdqa %xmm3, %xmm6 -; SSE42-NEXT: pshufb %xmm1, %xmm6 -; SSE42-NEXT: movdqa %xmm1, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm5 -; SSE42-NEXT: pand %xmm4, %xmm5 -; SSE42-NEXT: pshufb %xmm5, %xmm3 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm5 -; SSE42-NEXT: pand %xmm6, %xmm5 -; SSE42-NEXT: paddb %xmm3, %xmm5 -; SSE42-NEXT: movdqa %xmm1, %xmm3 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pshufb %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: psrlw $4, %xmm4 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE42-NEXT: pand %xmm5, %xmm4 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: movdqa %xmm2, %xmm7 +; SSE42-NEXT: pshufb %xmm4, %xmm7 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm7, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm3 ; SSE42-NEXT: psrlw $8, %xmm3 -; SSE42-NEXT: pand %xmm5, %xmm3 -; SSE42-NEXT: psrlw $8, %xmm5 -; SSE42-NEXT: paddw %xmm3, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE42-NEXT: pand %xmm4, %xmm3 +; SSE42-NEXT: psrlw $8, %xmm4 +; SSE42-NEXT: paddw %xmm3, %xmm4 +; SSE42-NEXT: pcmpeqw %xmm6, %xmm0 +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pand %xmm4, %xmm0 +; SSE42-NEXT: psrld $16, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pshufb %xmm1, %xmm3 +; SSE42-NEXT: movdqa %xmm1, %xmm4 +; SSE42-NEXT: psrlw $4, %xmm4 +; SSE42-NEXT: pand %xmm5, %xmm4 +; SSE42-NEXT: pshufb %xmm4, %xmm2 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm2, %xmm4 +; SSE42-NEXT: movdqa %xmm1, %xmm2 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm2 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm4 +; SSE42-NEXT: paddw %xmm2, %xmm4 +; SSE42-NEXT: pcmpeqw %xmm6, %xmm1 ; SSE42-NEXT: psrld $16, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: psrld $16, %xmm5 -; SSE42-NEXT: paddd %xmm1, %xmm5 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: psrld $16, %xmm4 +; SSE42-NEXT: paddd %xmm1, %xmm4 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE42-NEXT: retq ; ; AVX2-LABEL: widen_ctlz_undef_v2i32_v4i32: @@ -832,50 +832,50 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_undef_v4i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE42-NEXT: movdqa %xmm3, %xmm6 -; SSE42-NEXT: pshufb %xmm0, %xmm6 -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm5 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: pand %xmm4, %xmm5 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: movdqa %xmm3, %xmm7 -; SSE42-NEXT: pshufb %xmm5, %xmm7 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm5 -; SSE42-NEXT: pand %xmm6, %xmm5 -; SSE42-NEXT: paddb %xmm7, %xmm5 -; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm6 -; SSE42-NEXT: psrlw $8, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: psrlw $8, %xmm5 -; SSE42-NEXT: paddw %xmm6, %xmm5 -; 
SSE42-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE42-NEXT: psrld $16, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; SSE42-NEXT: psrld $16, %xmm5 -; SSE42-NEXT: paddd %xmm5, %xmm0 -; SSE42-NEXT: movdqa %xmm3, %xmm5 -; SSE42-NEXT: pshufb %xmm1, %xmm5 -; SSE42-NEXT: movdqa %xmm1, %xmm6 -; SSE42-NEXT: psrlw $4, %xmm6 -; SSE42-NEXT: pand %xmm4, %xmm6 -; SSE42-NEXT: pshufb %xmm6, %xmm3 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: paddb %xmm3, %xmm6 -; SSE42-NEXT: movdqa %xmm1, %xmm3 -; SSE42-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pshufb %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: psrlw $4, %xmm4 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE42-NEXT: pand %xmm5, %xmm4 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: movdqa %xmm2, %xmm7 +; SSE42-NEXT: pshufb %xmm4, %xmm7 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm7, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm3 ; SSE42-NEXT: psrlw $8, %xmm3 -; SSE42-NEXT: pand %xmm6, %xmm3 -; SSE42-NEXT: psrlw $8, %xmm6 -; SSE42-NEXT: paddw %xmm3, %xmm6 -; SSE42-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE42-NEXT: pand %xmm4, %xmm3 +; SSE42-NEXT: psrlw $8, %xmm4 +; SSE42-NEXT: paddw %xmm3, %xmm4 +; SSE42-NEXT: pcmpeqw %xmm6, %xmm0 +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pand %xmm4, %xmm0 +; SSE42-NEXT: psrld $16, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pshufb %xmm1, %xmm3 +; SSE42-NEXT: movdqa %xmm1, %xmm4 +; SSE42-NEXT: psrlw $4, %xmm4 +; SSE42-NEXT: pand %xmm5, %xmm4 +; SSE42-NEXT: pshufb %xmm4, %xmm2 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm2, %xmm4 +; SSE42-NEXT: movdqa %xmm1, %xmm2 +; SSE42-NEXT: pcmpeqb %xmm6, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm2 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm4 +; SSE42-NEXT: paddw %xmm2, %xmm4 +; SSE42-NEXT: pcmpeqw %xmm6, %xmm1 ; SSE42-NEXT: psrld $16, %xmm1 -; SSE42-NEXT: pand %xmm6, %xmm1 -; SSE42-NEXT: psrld $16, %xmm6 -; SSE42-NEXT: paddd %xmm6, %xmm1 +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: psrld $16, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm1 ; SSE42-NEXT: retq ; ; AVX2-LABEL: widen_ctlz_undef_v4i32_v8i32: @@ -920,51 +920,51 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; SSE42-LABEL: widen_ctlz_undef_v2i32_v8i32: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE42-NEXT: movdqa %xmm5, %xmm8 -; SSE42-NEXT: pshufb %xmm0, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm7 +; SSE42-NEXT: movdqa %xmm5, %xmm7 +; SSE42-NEXT: pshufb %xmm0, %xmm7 +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: psrlw $4, %xmm8 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: pand %xmm6, %xmm7 +; SSE42-NEXT: pand %xmm6, %xmm8 ; SSE42-NEXT: pxor %xmm4, %xmm4 ; SSE42-NEXT: movdqa %xmm5, %xmm9 -; SSE42-NEXT: pshufb %xmm7, %xmm9 -; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 -; SSE42-NEXT: pand %xmm8, %xmm7 -; SSE42-NEXT: paddb %xmm9, %xmm7 -; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: pshufb %xmm8, %xmm9 ; SSE42-NEXT: pcmpeqb %xmm4, %xmm8 -; SSE42-NEXT: psrlw $8, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 +; SSE42-NEXT: paddb %xmm9, %xmm8 +; SSE42-NEXT: movdqa %xmm0, %xmm7 +; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 ; 
SSE42-NEXT: psrlw $8, %xmm7 -; SSE42-NEXT: paddw %xmm8, %xmm7 +; SSE42-NEXT: pand %xmm8, %xmm7 +; SSE42-NEXT: psrlw $8, %xmm8 +; SSE42-NEXT: paddw %xmm7, %xmm8 ; SSE42-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE42-NEXT: psrld $16, %xmm0 -; SSE42-NEXT: pand %xmm7, %xmm0 -; SSE42-NEXT: psrld $16, %xmm7 -; SSE42-NEXT: paddd %xmm7, %xmm0 -; SSE42-NEXT: movdqa %xmm5, %xmm8 -; SSE42-NEXT: pshufb %xmm1, %xmm8 -; SSE42-NEXT: movdqa %xmm1, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm7 -; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: movdqa %xmm5, %xmm9 -; SSE42-NEXT: pshufb %xmm7, %xmm9 -; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 -; SSE42-NEXT: pand %xmm8, %xmm7 -; SSE42-NEXT: paddb %xmm9, %xmm7 +; SSE42-NEXT: pand %xmm8, %xmm0 +; SSE42-NEXT: psrld $16, %xmm8 +; SSE42-NEXT: paddd %xmm8, %xmm0 +; SSE42-NEXT: movdqa %xmm5, %xmm7 +; SSE42-NEXT: pshufb %xmm1, %xmm7 ; SSE42-NEXT: movdqa %xmm1, %xmm8 +; SSE42-NEXT: psrlw $4, %xmm8 +; SSE42-NEXT: pand %xmm6, %xmm8 +; SSE42-NEXT: movdqa %xmm5, %xmm9 +; SSE42-NEXT: pshufb %xmm8, %xmm9 ; SSE42-NEXT: pcmpeqb %xmm4, %xmm8 -; SSE42-NEXT: psrlw $8, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 +; SSE42-NEXT: paddb %xmm9, %xmm8 +; SSE42-NEXT: movdqa %xmm1, %xmm7 +; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 ; SSE42-NEXT: psrlw $8, %xmm7 -; SSE42-NEXT: paddw %xmm8, %xmm7 +; SSE42-NEXT: pand %xmm8, %xmm7 +; SSE42-NEXT: psrlw $8, %xmm8 +; SSE42-NEXT: paddw %xmm7, %xmm8 ; SSE42-NEXT: pcmpeqw %xmm4, %xmm1 ; SSE42-NEXT: psrld $16, %xmm1 -; SSE42-NEXT: pand %xmm7, %xmm1 -; SSE42-NEXT: psrld $16, %xmm7 -; SSE42-NEXT: paddd %xmm1, %xmm7 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE42-NEXT: pand %xmm8, %xmm1 +; SSE42-NEXT: psrld $16, %xmm8 +; SSE42-NEXT: paddd %xmm1, %xmm8 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSE42-NEXT: movdqa %xmm5, %xmm7 ; SSE42-NEXT: pshufb %xmm2, %xmm7 ; SSE42-NEXT: movdqa %xmm2, %xmm1 @@ -986,27 +986,27 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; SSE42-NEXT: pand %xmm1, %xmm2 ; SSE42-NEXT: psrld $16, %xmm1 ; SSE42-NEXT: paddd %xmm2, %xmm1 -; SSE42-NEXT: movdqa %xmm5, %xmm7 -; SSE42-NEXT: pshufb %xmm3, %xmm7 +; SSE42-NEXT: movdqa %xmm5, %xmm2 +; SSE42-NEXT: pshufb %xmm3, %xmm2 +; SSE42-NEXT: movdqa %xmm3, %xmm7 +; SSE42-NEXT: psrlw $4, %xmm7 +; SSE42-NEXT: pand %xmm6, %xmm7 +; SSE42-NEXT: pshufb %xmm7, %xmm5 +; SSE42-NEXT: pcmpeqb %xmm4, %xmm7 +; SSE42-NEXT: pand %xmm2, %xmm7 +; SSE42-NEXT: paddb %xmm5, %xmm7 ; SSE42-NEXT: movdqa %xmm3, %xmm2 -; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm6, %xmm2 -; SSE42-NEXT: pshufb %xmm2, %xmm5 ; SSE42-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE42-NEXT: pand %xmm7, %xmm2 -; SSE42-NEXT: paddb %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm3, %xmm5 -; SSE42-NEXT: pcmpeqb %xmm4, %xmm5 -; SSE42-NEXT: psrlw $8, %xmm5 -; SSE42-NEXT: pand %xmm2, %xmm5 ; SSE42-NEXT: psrlw $8, %xmm2 -; SSE42-NEXT: paddw %xmm5, %xmm2 +; SSE42-NEXT: pand %xmm7, %xmm2 +; SSE42-NEXT: psrlw $8, %xmm7 +; SSE42-NEXT: paddw %xmm2, %xmm7 ; SSE42-NEXT: pcmpeqw %xmm4, %xmm3 ; SSE42-NEXT: psrld $16, %xmm3 -; SSE42-NEXT: pand %xmm2, %xmm3 -; SSE42-NEXT: psrld $16, %xmm2 -; SSE42-NEXT: paddd %xmm3, %xmm2 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE42-NEXT: pand %xmm7, %xmm3 +; SSE42-NEXT: psrld $16, %xmm7 +; SSE42-NEXT: paddd %xmm3, %xmm7 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE42-NEXT: retq ; ; AVX2-LABEL: widen_ctlz_undef_v2i32_v8i32: @@ -1205,42 +1205,42 @@ define <4 x i32> @widen_cttz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { define <8 x i32> @widen_cttz_v4i32_v8i32(<4 x i32> 
%a0, <4 x i32> %a1) { ; SSE42-LABEL: widen_cttz_v4i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: paddd %xmm4, %xmm2 -; SSE42-NEXT: pandn %xmm2, %xmm0 +; SSE42-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: paddd %xmm2, %xmm3 +; SSE42-NEXT: pandn %xmm3, %xmm0 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: pand %xmm3, %xmm5 -; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pshufb %xmm5, %xmm6 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE42-NEXT: movdqa %xmm5, %xmm6 +; SSE42-NEXT: pshufb %xmm4, %xmm6 ; SSE42-NEXT: psrlw $4, %xmm0 ; SSE42-NEXT: pand %xmm3, %xmm0 -; SSE42-NEXT: movdqa %xmm2, %xmm5 -; SSE42-NEXT: pshufb %xmm0, %xmm5 -; SSE42-NEXT: paddb %xmm6, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm4 +; SSE42-NEXT: pshufb %xmm0, %xmm4 +; SSE42-NEXT: paddb %xmm6, %xmm4 ; SSE42-NEXT: pxor %xmm6, %xmm6 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE42-NEXT: psadbw %xmm6, %xmm5 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE42-NEXT: psadbw %xmm6, %xmm4 ; SSE42-NEXT: psadbw %xmm6, %xmm0 -; SSE42-NEXT: packuswb %xmm5, %xmm0 -; SSE42-NEXT: paddd %xmm1, %xmm4 -; SSE42-NEXT: pandn %xmm4, %xmm1 -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa %xmm2, %xmm5 -; SSE42-NEXT: pshufb %xmm4, %xmm5 +; SSE42-NEXT: packuswb %xmm4, %xmm0 +; SSE42-NEXT: paddd %xmm1, %xmm2 +; SSE42-NEXT: pandn %xmm2, %xmm1 +; SSE42-NEXT: movdqa %xmm1, %xmm2 +; SSE42-NEXT: pand %xmm3, %xmm2 +; SSE42-NEXT: movdqa %xmm5, %xmm4 +; SSE42-NEXT: pshufb %xmm2, %xmm4 ; SSE42-NEXT: psrlw $4, %xmm1 ; SSE42-NEXT: pand %xmm3, %xmm1 -; SSE42-NEXT: pshufb %xmm1, %xmm2 -; SSE42-NEXT: paddb %xmm5, %xmm2 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE42-NEXT: psadbw %xmm6, %xmm2 +; SSE42-NEXT: pshufb %xmm1, %xmm5 +; SSE42-NEXT: paddb %xmm4, %xmm5 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE42-NEXT: psadbw %xmm6, %xmm5 ; SSE42-NEXT: psadbw %xmm6, %xmm1 -; SSE42-NEXT: packuswb %xmm2, %xmm1 +; SSE42-NEXT: packuswb %xmm5, %xmm1 ; SSE42-NEXT: retq ; ; AVX2-LABEL: widen_cttz_v4i32_v8i32: @@ -1301,15 +1301,15 @@ define <8 x i32> @widen_cttz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; SSE42-NEXT: movdqa %xmm0, %xmm4 ; SSE42-NEXT: paddd %xmm6, %xmm4 ; SSE42-NEXT: pandn %xmm4, %xmm0 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: pand %xmm5, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm4, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm7 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm7, %xmm8 ; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; 
SSE42-NEXT: movdqa %xmm4, %xmm7 +; SSE42-NEXT: pand %xmm4, %xmm0 +; SSE42-NEXT: movdqa %xmm5, %xmm7 ; SSE42-NEXT: pshufb %xmm0, %xmm7 ; SSE42-NEXT: paddb %xmm8, %xmm7 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero @@ -1319,12 +1319,12 @@ define <8 x i32> @widen_cttz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; SSE42-NEXT: paddd %xmm6, %xmm8 ; SSE42-NEXT: pandn %xmm8, %xmm1 ; SSE42-NEXT: movdqa %xmm1, %xmm8 -; SSE42-NEXT: pand %xmm5, %xmm8 -; SSE42-NEXT: movdqa %xmm4, %xmm9 +; SSE42-NEXT: pand %xmm4, %xmm8 +; SSE42-NEXT: movdqa %xmm5, %xmm9 ; SSE42-NEXT: pshufb %xmm8, %xmm9 ; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: movdqa %xmm4, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm1, %xmm8 ; SSE42-NEXT: paddb %xmm9, %xmm8 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero @@ -1334,12 +1334,12 @@ define <8 x i32> @widen_cttz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; SSE42-NEXT: paddd %xmm6, %xmm1 ; SSE42-NEXT: pandn %xmm1, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: movdqa %xmm4, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm1, %xmm8 ; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm4, %xmm1 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: movdqa %xmm5, %xmm1 ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: paddb %xmm8, %xmm1 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero @@ -1347,14 +1347,14 @@ define <8 x i32> @widen_cttz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; SSE42-NEXT: paddd %xmm3, %xmm6 ; SSE42-NEXT: pandn %xmm6, %xmm3 ; SSE42-NEXT: movdqa %xmm3, %xmm2 -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: movdqa %xmm5, %xmm6 ; SSE42-NEXT: pshufb %xmm2, %xmm6 ; SSE42-NEXT: psrlw $4, %xmm3 -; SSE42-NEXT: pand %xmm5, %xmm3 -; SSE42-NEXT: pshufb %xmm3, %xmm4 -; SSE42-NEXT: paddb %xmm6, %xmm4 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero +; SSE42-NEXT: pand %xmm4, %xmm3 +; SSE42-NEXT: pshufb %xmm3, %xmm5 +; SSE42-NEXT: paddb %xmm6, %xmm5 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero ; SSE42-NEXT: psadbw %xmm7, %xmm2 ; SSE42-NEXT: packuswb %xmm2, %xmm1 ; SSE42-NEXT: retq @@ -1554,42 +1554,42 @@ define <4 x i32> @widen_cttz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { define <8 x i32> @widen_cttz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; SSE42-LABEL: widen_cttz_undef_v4i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: paddd %xmm4, %xmm2 -; SSE42-NEXT: pandn %xmm2, %xmm0 +; SSE42-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: paddd %xmm2, %xmm3 +; SSE42-NEXT: pandn %xmm3, %xmm0 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: pand %xmm3, %xmm5 -; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pshufb %xmm5, %xmm6 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE42-NEXT: movdqa %xmm5, %xmm6 +; SSE42-NEXT: pshufb %xmm4, %xmm6 ; SSE42-NEXT: psrlw $4, %xmm0 ; SSE42-NEXT: pand %xmm3, %xmm0 -; SSE42-NEXT: movdqa %xmm2, %xmm5 -; SSE42-NEXT: 
pshufb %xmm0, %xmm5 -; SSE42-NEXT: paddb %xmm6, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm4 +; SSE42-NEXT: pshufb %xmm0, %xmm4 +; SSE42-NEXT: paddb %xmm6, %xmm4 ; SSE42-NEXT: pxor %xmm6, %xmm6 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE42-NEXT: psadbw %xmm6, %xmm5 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE42-NEXT: psadbw %xmm6, %xmm4 ; SSE42-NEXT: psadbw %xmm6, %xmm0 -; SSE42-NEXT: packuswb %xmm5, %xmm0 -; SSE42-NEXT: paddd %xmm1, %xmm4 -; SSE42-NEXT: pandn %xmm4, %xmm1 -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa %xmm2, %xmm5 -; SSE42-NEXT: pshufb %xmm4, %xmm5 +; SSE42-NEXT: packuswb %xmm4, %xmm0 +; SSE42-NEXT: paddd %xmm1, %xmm2 +; SSE42-NEXT: pandn %xmm2, %xmm1 +; SSE42-NEXT: movdqa %xmm1, %xmm2 +; SSE42-NEXT: pand %xmm3, %xmm2 +; SSE42-NEXT: movdqa %xmm5, %xmm4 +; SSE42-NEXT: pshufb %xmm2, %xmm4 ; SSE42-NEXT: psrlw $4, %xmm1 ; SSE42-NEXT: pand %xmm3, %xmm1 -; SSE42-NEXT: pshufb %xmm1, %xmm2 -; SSE42-NEXT: paddb %xmm5, %xmm2 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE42-NEXT: psadbw %xmm6, %xmm2 +; SSE42-NEXT: pshufb %xmm1, %xmm5 +; SSE42-NEXT: paddb %xmm4, %xmm5 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE42-NEXT: psadbw %xmm6, %xmm5 ; SSE42-NEXT: psadbw %xmm6, %xmm1 -; SSE42-NEXT: packuswb %xmm2, %xmm1 +; SSE42-NEXT: packuswb %xmm5, %xmm1 ; SSE42-NEXT: retq ; ; AVX2-LABEL: widen_cttz_undef_v4i32_v8i32: @@ -1650,15 +1650,15 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; SSE42-NEXT: movdqa %xmm0, %xmm4 ; SSE42-NEXT: paddd %xmm6, %xmm4 ; SSE42-NEXT: pandn %xmm4, %xmm0 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: pand %xmm5, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm4, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm7 +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm7, %xmm8 ; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; SSE42-NEXT: movdqa %xmm4, %xmm7 +; SSE42-NEXT: pand %xmm4, %xmm0 +; SSE42-NEXT: movdqa %xmm5, %xmm7 ; SSE42-NEXT: pshufb %xmm0, %xmm7 ; SSE42-NEXT: paddb %xmm8, %xmm7 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero @@ -1668,12 +1668,12 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; SSE42-NEXT: paddd %xmm6, %xmm8 ; SSE42-NEXT: pandn %xmm8, %xmm1 ; SSE42-NEXT: movdqa %xmm1, %xmm8 -; SSE42-NEXT: pand %xmm5, %xmm8 -; SSE42-NEXT: movdqa %xmm4, %xmm9 +; SSE42-NEXT: pand %xmm4, %xmm8 +; SSE42-NEXT: movdqa %xmm5, %xmm9 ; SSE42-NEXT: pshufb %xmm8, %xmm9 ; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: movdqa %xmm4, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm1, %xmm8 ; SSE42-NEXT: paddb %xmm9, %xmm8 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero @@ -1683,12 +1683,12 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x 
i32> %a0, <2 x i32> %a1, <2 ; SSE42-NEXT: paddd %xmm6, %xmm1 ; SSE42-NEXT: pandn %xmm1, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: movdqa %xmm4, %xmm8 +; SSE42-NEXT: pand %xmm4, %xmm1 +; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm1, %xmm8 ; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm4, %xmm1 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: movdqa %xmm5, %xmm1 ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: paddb %xmm8, %xmm1 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero @@ -1696,14 +1696,14 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; SSE42-NEXT: paddd %xmm3, %xmm6 ; SSE42-NEXT: pandn %xmm6, %xmm3 ; SSE42-NEXT: movdqa %xmm3, %xmm2 -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pand %xmm4, %xmm2 +; SSE42-NEXT: movdqa %xmm5, %xmm6 ; SSE42-NEXT: pshufb %xmm2, %xmm6 ; SSE42-NEXT: psrlw $4, %xmm3 -; SSE42-NEXT: pand %xmm5, %xmm3 -; SSE42-NEXT: pshufb %xmm3, %xmm4 -; SSE42-NEXT: paddb %xmm6, %xmm4 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero +; SSE42-NEXT: pand %xmm4, %xmm3 +; SSE42-NEXT: pshufb %xmm3, %xmm5 +; SSE42-NEXT: paddb %xmm6, %xmm5 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero ; SSE42-NEXT: psadbw %xmm7, %xmm2 ; SSE42-NEXT: packuswb %xmm2, %xmm1 ; SSE42-NEXT: retq diff --git a/llvm/test/CodeGen/X86/widen_cast-4.ll b/llvm/test/CodeGen/X86/widen_cast-4.ll index 7dd92ce874aefe..306c4da5609fec 100644 --- a/llvm/test/CodeGen/X86/widen_cast-4.ll +++ b/llvm/test/CodeGen/X86/widen_cast-4.ll @@ -20,19 +20,19 @@ define void @update(ptr %dst_i, ptr %src_i, i32 %n) nounwind { ; WIDE-NEXT: # %bb.2: # %forbody ; WIDE-NEXT: # in Loop: Header=BB0_1 Depth=1 ; WIDE-NEXT: movl (%esp), %eax -; WIDE-NEXT: leal (,%eax,8), %edx -; WIDE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIDE-NEXT: addl %edx, %ecx -; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; WIDE-NEXT: addl {{[0-9]+}}(%esp), %edx +; WIDE-NEXT: leal (,%eax,8), %ecx +; WIDE-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIDE-NEXT: addl %ecx, %edx ; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIDE-NEXT: addl {{[0-9]+}}(%esp), %ecx +; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; WIDE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; WIDE-NEXT: psubb %xmm0, %xmm3 ; WIDE-NEXT: psrlw $2, %xmm3 ; WIDE-NEXT: pand %xmm1, %xmm3 ; WIDE-NEXT: pxor %xmm2, %xmm3 ; WIDE-NEXT: psubb %xmm2, %xmm3 -; WIDE-NEXT: movq %xmm3, (%ecx,%eax,8) +; WIDE-NEXT: movq %xmm3, (%edx,%eax,8) ; WIDE-NEXT: incl (%esp) ; WIDE-NEXT: jmp .LBB0_1 ; WIDE-NEXT: .LBB0_3: # %afterfor diff --git a/llvm/test/CodeGen/X86/x32-va_start.ll b/llvm/test/CodeGen/X86/x32-va_start.ll index e61e5765f124aa..b1f328274af9f8 100644 --- a/llvm/test/CodeGen/X86/x32-va_start.ll +++ b/llvm/test/CodeGen/X86/x32-va_start.ll @@ -100,22 +100,22 @@ define i32 @foo(float %a, ptr nocapture readnone %fmt, ...) 
nounwind { ; 32BITABI-LABEL: foo: ; 32BITABI: # %bb.0: # %entry ; 32BITABI-NEXT: subl $28, %esp -; 32BITABI-NEXT: leal {{[0-9]+}}(%esp), %ecx -; 32BITABI-NEXT: movl %ecx, (%esp) -; 32BITABI-NEXT: cmpl $40, %ecx +; 32BITABI-NEXT: leal {{[0-9]+}}(%esp), %eax +; 32BITABI-NEXT: movl %eax, (%esp) +; 32BITABI-NEXT: cmpl $40, %eax ; 32BITABI-NEXT: ja .LBB0_2 ; 32BITABI-NEXT: # %bb.1: # %vaarg.in_reg -; 32BITABI-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32BITABI-NEXT: addl %ecx, %eax -; 32BITABI-NEXT: addl $8, %ecx -; 32BITABI-NEXT: movl %ecx, (%esp) +; 32BITABI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 32BITABI-NEXT: addl %eax, %ecx +; 32BITABI-NEXT: addl $8, %eax +; 32BITABI-NEXT: movl %eax, (%esp) ; 32BITABI-NEXT: jmp .LBB0_3 ; 32BITABI-NEXT: .LBB0_2: # %vaarg.in_mem -; 32BITABI-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32BITABI-NEXT: leal 8(%eax), %ecx -; 32BITABI-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 32BITABI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 32BITABI-NEXT: leal 8(%ecx), %eax +; 32BITABI-NEXT: movl %eax, {{[0-9]+}}(%esp) ; 32BITABI-NEXT: .LBB0_3: # %vaarg.end -; 32BITABI-NEXT: movl (%eax), %eax +; 32BITABI-NEXT: movl (%ecx), %eax ; 32BITABI-NEXT: addl $28, %esp ; 32BITABI-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/x86-cmov-converter.ll b/llvm/test/CodeGen/X86/x86-cmov-converter.ll index b02da217e76b21..406c41fda2f920 100644 --- a/llvm/test/CodeGen/X86/x86-cmov-converter.ll +++ b/llvm/test/CodeGen/X86/x86-cmov-converter.ll @@ -451,29 +451,29 @@ define i32 @MaxValue(i32 %n, ptr nocapture readonly %a) #0 { ; ; CHECK-FORCEALL-LABEL: MaxValue: ; CHECK-FORCEALL: # %bb.0: # %entry -; CHECK-FORCEALL-NEXT: movl (%rsi), %r8d +; CHECK-FORCEALL-NEXT: movl (%rsi), %ecx ; CHECK-FORCEALL-NEXT: cmpl $2, %edi ; CHECK-FORCEALL-NEXT: jge .LBB4_3 ; CHECK-FORCEALL-NEXT: # %bb.1: -; CHECK-FORCEALL-NEXT: movl %r8d, %eax +; CHECK-FORCEALL-NEXT: movl %ecx, %eax ; CHECK-FORCEALL-NEXT: .LBB4_2: # %for.cond.cleanup ; CHECK-FORCEALL-NEXT: retq ; CHECK-FORCEALL-NEXT: .LBB4_3: # %for.body.preheader -; CHECK-FORCEALL-NEXT: movl %edi, %ecx -; CHECK-FORCEALL-NEXT: movl $1, %edx +; CHECK-FORCEALL-NEXT: movl %edi, %edx +; CHECK-FORCEALL-NEXT: movl $1, %edi ; CHECK-FORCEALL-NEXT: .LBB4_4: # %for.body ; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %eax -; CHECK-FORCEALL-NEXT: cmpl %r8d, %eax +; CHECK-FORCEALL-NEXT: movl (%rsi,%rdi,4), %eax +; CHECK-FORCEALL-NEXT: cmpl %ecx, %eax ; CHECK-FORCEALL-NEXT: jg .LBB4_6 ; CHECK-FORCEALL-NEXT: # %bb.5: # %for.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB4_4 Depth=1 -; CHECK-FORCEALL-NEXT: movl %r8d, %eax +; CHECK-FORCEALL-NEXT: movl %ecx, %eax ; CHECK-FORCEALL-NEXT: .LBB4_6: # %for.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB4_4 Depth=1 -; CHECK-FORCEALL-NEXT: addq $1, %rdx -; CHECK-FORCEALL-NEXT: movl %eax, %r8d -; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx +; CHECK-FORCEALL-NEXT: addq $1, %rdi +; CHECK-FORCEALL-NEXT: movl %eax, %ecx +; CHECK-FORCEALL-NEXT: cmpq %rdi, %rdx ; CHECK-FORCEALL-NEXT: je .LBB4_2 ; CHECK-FORCEALL-NEXT: jmp .LBB4_4 entry: diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index aaba44c8dc1116..6dbf520b78ca58 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1072,109 +1072,101 @@ ret void define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, ptr %p) nounwind { ; AVX1-LABEL: interleaved_store_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $24, 
%rsp -; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %ymm2, %ymm4 -; AVX1-NEXT: vmovdqa %ymm0, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 -; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7 -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm7 -; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm8 -; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm10 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13 +; AVX1-NEXT: vpshufb %xmm12, %xmm13, %xmm7 +; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm7 +; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm8 +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm6 +; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm14 +; AVX1-NEXT: vpshufb %xmm12, %xmm14, %xmm15 +; AVX1-NEXT: vpor %xmm8, %xmm15, %xmm6 +; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm6 +; AVX1-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm15, %xmm0, %xmm15 +; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm11 +; AVX1-NEXT: vpor %xmm0, %xmm11, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm15 -; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm10 -; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm10 -; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm5 -; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm2 -; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm9 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm8 -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] -; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm13, %xmm12, %xmm13 -; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm5, %xmm11, %xmm11 -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm11, %xmm10, %xmm11 -; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm0, %xmm15, %xmm15 -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] -; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX1-NEXT: vpshufb %xmm10, %xmm6, %xmm12 -; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 -; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX1-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 -; AVX1-NEXT: vpshufb %xmm10, %xmm14, %xmm14 -; AVX1-NEXT: vpor %xmm14, %xmm12, %xmm12 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm10 -; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm6 -; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm11 -; AVX1-NEXT: vpshufb %xmm7, 
%xmm2, %xmm2 -; AVX1-NEXT: vmovdqu %xmm6, 80(%rdi) -; AVX1-NEXT: vmovdqu %xmm9, 64(%rdi) -; AVX1-NEXT: vmovdqu %xmm8, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm4, (%rdi) -; AVX1-NEXT: vmovdqu %xmm10, 48(%rdi) -; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, 176(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; AVX1-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm12, %xmm10, %xmm12 +; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm6, %xmm13, %xmm13 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm13, %xmm9, %xmm13 +; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] +; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload +; AVX1-NEXT: # xmm8 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm11, %xmm5 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm9 +; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm9, %xmm2, 
%xmm2 +; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm11 +; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpshufb %xmm9, %xmm12, %xmm12 +; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm9 +; AVX1-NEXT: vmovdqu %xmm7, 80(%rdi) +; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi) +; AVX1-NEXT: vmovdqu %xmm4, 16(%rdi) +; AVX1-NEXT: vmovdqu %xmm2, (%rdi) +; AVX1-NEXT: vmovdqu %xmm11, 48(%rdi) +; AVX1-NEXT: vmovdqu %xmm8, 32(%rdi) +; AVX1-NEXT: vmovdqu %xmm9, 176(%rdi) ; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi) -; AVX1-NEXT: vmovdqu %xmm12, 112(%rdi) +; AVX1-NEXT: vmovdqu %xmm5, 112(%rdi) ; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi) -; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi) -; AVX1-NEXT: vmovdqu %xmm5, 128(%rdi) -; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: vmovdqu %xmm12, 144(%rdi) +; AVX1-NEXT: vmovdqu %xmm6, 128(%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1273,55 +1265,57 @@ ret void define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-LABEL: interleaved_load_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm11 +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovdqu (%rdi), %xmm8 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13 ; AVX1-NEXT: vmovups 64(%rdi), %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 ; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5 -; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2 -; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10 -; AVX1-NEXT: vmovdqu 160(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqu 112(%rdi), %xmm3 +; AVX1-NEXT: vmovdqu 144(%rdi), %xmm7 +; AVX1-NEXT: vmovdqu 160(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] ; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6 -; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7 -; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8 +; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm11 +; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm10 ; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpor %xmm5, %xmm12, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12 -; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm0 +; AVX1-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpor %xmm7, %xmm12, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX1-NEXT: vmovdqa %xmm1, %xmm0 +; AVX1-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm12 -; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm11 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload -; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm13 -; AVX1-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm13 -; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm5 -; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5 -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm13 -; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm10 -; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10 -; AVX1-NEXT: vmovdqu 176(%rdi), %xmm13 -; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX1-NEXT: vpshufb %xmm15, %xmm13, %xmm12 -; AVX1-NEXT: vpor %xmm0, %xmm12, %xmm3 -; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm12 +; AVX1-NEXT: vpor %xmm8, %xmm12, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm8 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm15, %xmm0, %xmm13 +; AVX1-NEXT: vpor %xmm8, %xmm13, %xmm5 +; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm8 +; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm13 +; AVX1-NEXT: vpor %xmm8, %xmm13, %xmm8 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm13 +; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb %xmm15, %xmm13, %xmm7 +; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm5 +; AVX1-NEXT: vmovdqu 176(%rdi), %xmm7 +; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm0 +; AVX1-NEXT: vpshufb %xmm15, %xmm7, %xmm12 +; AVX1-NEXT: vpor %xmm0, %xmm12, %xmm1 +; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm12 ; AVX1-NEXT: vmovdqu 128(%rdi), %xmm14 ; AVX1-NEXT: vpshufb %xmm15, %xmm14, %xmm15 ; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm15 @@ -1329,53 +1323,54 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm12 ; AVX1-NEXT: vpor %xmm6, %xmm12, %xmm12 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm15 -; AVX1-NEXT: vpor %xmm7, %xmm15, %xmm15 -; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 -; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm15 +; AVX1-NEXT: vpor %xmm15, %xmm11, %xmm15 +; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm9[11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm10 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] -; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] +; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 +; AVX1-NEXT: vpshufb %xmm10, %xmm13, %xmm13 +; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm13 +; AVX1-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX1-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm11 -; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX1-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm2 +; AVX1-NEXT: vpshufb %xmm10, %xmm15, %xmm2 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm10, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX1-NEXT: vpaddb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm11, %xmm2 +; AVX1-NEXT: vpshufb %xmm10, %xmm12, %xmm3 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm13, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 @@ -1383,6 +1378,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-NEXT: vpaddb %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1 +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleaved_load_vf64_i8_stride3: @@ -1489,64 +1486,64 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX1-LABEL: interleaved_store_vf64_i8_stride4: ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm12 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm10 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm13 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = 
xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm13 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX1-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm10 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 ; AVX1-NEXT: vmovaps %ymm3, 224(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 192(%rdi) ; AVX1-NEXT: vmovaps %ymm1, 160(%rdi) -; AVX1-NEXT: vmovaps %ymm4, 128(%rdi) +; AVX1-NEXT: vmovaps %ymm6, 128(%rdi) ; AVX1-NEXT: vmovaps %ymm2, 96(%rdi) -; AVX1-NEXT: vmovaps %ymm9, 64(%rdi) -; AVX1-NEXT: vmovaps %ymm6, 32(%rdi) +; AVX1-NEXT: vmovaps %ymm10, 64(%rdi) +; AVX1-NEXT: vmovaps %ymm4, 32(%rdi) ; AVX1-NEXT: vmovaps %ymm8, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll index fab3847b3a2c51..98c0fc82155e00 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -1021,23 +1021,23 @@ define void @infiniteloop3() { ; ENABLE-NEXT: jne LBB12_7 ; ENABLE-NEXT: LBB12_2: ## %loop2a.preheader ; ENABLE-NEXT: xorl %eax, %eax -; ENABLE-NEXT: xorl %ecx, %ecx +; ENABLE-NEXT: xorl %edx, %edx ; ENABLE-NEXT: movq %rax, %rsi ; ENABLE-NEXT: jmp LBB12_4 ; ENABLE-NEXT: .p2align 4, 0x90 ; ENABLE-NEXT: LBB12_3: ## %loop2b ; ENABLE-NEXT: ## in Loop: Header=BB12_4 Depth=1 -; ENABLE-NEXT: movq %rdx, (%rsi) -; ENABLE-NEXT: movq %rdx, %rsi +; ENABLE-NEXT: movq %rcx, (%rsi) +; ENABLE-NEXT: movq %rcx, %rsi ; ENABLE-NEXT: LBB12_4: ## %loop1 ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; ENABLE-NEXT: movq %rcx, %rdx +; ENABLE-NEXT: movq %rdx, %rcx ; ENABLE-NEXT: testq %rax, %rax -; ENABLE-NEXT: movq (%rax), %rcx +; ENABLE-NEXT: movq (%rax), %rdx ; ENABLE-NEXT: jne LBB12_3 ; ENABLE-NEXT: ## %bb.5: ## in Loop: Header=BB12_4 Depth=1 
-; ENABLE-NEXT: movq %rdx, %rax -; ENABLE-NEXT: movq %rdx, %rsi +; ENABLE-NEXT: movq %rcx, %rax +; ENABLE-NEXT: movq %rcx, %rsi ; ENABLE-NEXT: jmp LBB12_4 ; ENABLE-NEXT: LBB12_7: ## %end ; ENABLE-NEXT: retq @@ -1053,23 +1053,23 @@ define void @infiniteloop3() { ; DISABLE-NEXT: jne LBB12_7 ; DISABLE-NEXT: LBB12_2: ## %loop2a.preheader ; DISABLE-NEXT: xorl %eax, %eax -; DISABLE-NEXT: xorl %ecx, %ecx +; DISABLE-NEXT: xorl %edx, %edx ; DISABLE-NEXT: movq %rax, %rsi ; DISABLE-NEXT: jmp LBB12_4 ; DISABLE-NEXT: .p2align 4, 0x90 ; DISABLE-NEXT: LBB12_3: ## %loop2b ; DISABLE-NEXT: ## in Loop: Header=BB12_4 Depth=1 -; DISABLE-NEXT: movq %rdx, (%rsi) -; DISABLE-NEXT: movq %rdx, %rsi +; DISABLE-NEXT: movq %rcx, (%rsi) +; DISABLE-NEXT: movq %rcx, %rsi ; DISABLE-NEXT: LBB12_4: ## %loop1 ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; DISABLE-NEXT: movq %rcx, %rdx +; DISABLE-NEXT: movq %rdx, %rcx ; DISABLE-NEXT: testq %rax, %rax -; DISABLE-NEXT: movq (%rax), %rcx +; DISABLE-NEXT: movq (%rax), %rdx ; DISABLE-NEXT: jne LBB12_3 ; DISABLE-NEXT: ## %bb.5: ## in Loop: Header=BB12_4 Depth=1 -; DISABLE-NEXT: movq %rdx, %rax -; DISABLE-NEXT: movq %rdx, %rsi +; DISABLE-NEXT: movq %rcx, %rax +; DISABLE-NEXT: movq %rcx, %rsi ; DISABLE-NEXT: jmp LBB12_4 ; DISABLE-NEXT: LBB12_7: ## %end ; DISABLE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index 6eb34b4e773e8f..247f193170766f 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -222,39 +222,39 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %eax, %ecx -; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %eax, %ebp +; WIN32-NEXT: addl %esi, %ebp ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: imull %ebx, %edi +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: imull %ebx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: addl %edi, %esi -; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: addl %esi, %edi +; WIN32-NEXT: addl %eax, %edi +; WIN32-NEXT: addl %ecx, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: adcl %ebp, %edi ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %ebp, %ecx -; WIN32-NEXT: adcl $0, %edi +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: adcl $0, %ebp ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %ecx, %ebp -; WIN32-NEXT: adcl %edi, %ebx +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: addl %ecx, %esi +; WIN32-NEXT: adcl %ebp, %ebx ; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) @@ -262,14 +262,14 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx ; 
WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: adcl %edi, %edx +; WIN32-NEXT: movl %esi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %ebp, 4(%eax) +; WIN32-NEXT: movl %esi, 4(%eax) ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al @@ -569,68 +569,70 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: imull %ecx, %edi -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: addl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: imull %ebp, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: addl %edi, %ebx +; WIN32-NEXT: addl %esi, %ebx ; WIN32-NEXT: addl %eax, %ebx -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: adcl %ecx, %ebx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: addl %ecx, %edi +; WIN32-NEXT: adcl $0, %esi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %esi, %ecx -; WIN32-NEXT: adcl $0, %ebp +; WIN32-NEXT: addl %edi, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: adcl %esi, %ebp +; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %ecx, %esi -; WIN32-NEXT: adcl %ebp, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: setb %cl -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %edi, %eax -; WIN32-NEXT: movzbl %cl, %ecx -; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: mull %edx +; WIN32-NEXT: addl 
%ebp, %eax +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; WIN32-NEXT: adcl %ebx, %edx -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: xorl %esi, %edx -; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: xorl %ecx, %edx +; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: orl %edx, %esi +; WIN32-NEXT: orl %edx, %ecx +; WIN32-NEXT: movl %edi, %edx ; WIN32-NEXT: jne LBB12_2 ; WIN32-NEXT: # %bb.1: ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: LBB12_2: -; WIN32-NEXT: movl %ebp, %edx -; WIN32-NEXT: addl $4, %esp +; WIN32-NEXT: addl $8, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -992,58 +994,57 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: imull %ecx, %esi +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: imull %ecx, %edi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: addl %eax, %ecx -; WIN32-NEXT: addl %esi, %ecx -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: movl %eax, %ecx ; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: imull %ebp, %ebx +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: addl %ebx, %edi +; WIN32-NEXT: addl %eax, %edi +; WIN32-NEXT: addl %ecx, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ecx, %esi -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: adcl %esi, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: adcl $0, %edi -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: adcl $0, %ebp +; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: addl %ecx, %ebx -; WIN32-NEXT: adcl %edi, %ebp +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: addl %ecx, %esi +; WIN32-NEXT: adcl %ebp, %ebx ; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: addl %ebx, %eax ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx ; WIN32-NEXT: addl (%esp), %eax # 
4-byte Folded Reload -; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: sarl $31, %ebx -; WIN32-NEXT: xorl %ebx, %edx -; WIN32-NEXT: xorl %eax, %ebx -; WIN32-NEXT: orl %edx, %ebx +; WIN32-NEXT: adcl %edi, %edx +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: xorl %esi, %edx +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: orl %edx, %esi ; WIN32-NEXT: jne LBB18_1 ; WIN32-NEXT: # %bb.3: # %continue ; WIN32-NEXT: movb $1, %al @@ -1698,66 +1699,66 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: subl $20, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %ebx -; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl 4(%eax), %ebp +; WIN32-NEXT: movl (%eax), %edx +; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl 4(%eax), %edi ; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: movl %ecx, %ebx ; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: imull %ebp, %ecx -; WIN32-NEXT: mull %ebx +; WIN32-NEXT: imull %edi, %ecx +; WIN32-NEXT: mull %edx ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: addl %ecx, %ebx -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: movl %edi, %ecx +; WIN32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: movl %ebx, %esi ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: addl %eax, %edi -; WIN32-NEXT: addl %esi, %edi +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: addl %eax, %ebx +; WIN32-NEXT: addl %esi, %ebx ; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload -; WIN32-NEXT: addl %ecx, %ebx +; WIN32-NEXT: addl %ecx, %ebp ; WIN32-NEXT: addl %eax, %ecx ; WIN32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ebx, %edi -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: mull %esi +; WIN32-NEXT: adcl %ebp, %ebx +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; WIN32-NEXT: adcl $0, %ebx -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; WIN32-NEXT: adcl $0, %ebp +; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %esi, %ebp -; WIN32-NEXT: adcl %ebx, %ecx -; WIN32-NEXT: setb %bl +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: addl %ecx, %edi +; WIN32-NEXT: adcl %ebp, %esi +; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: 
addl %ecx, %eax -; WIN32-NEXT: movzbl %bl, %ecx +; WIN32-NEXT: addl %esi, %eax +; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; WIN32-NEXT: adcl %edi, %edx -; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: adcl %ebx, %edx +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %ebp, 4(%eax) +; WIN32-NEXT: movl %edi, 4(%eax) ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al @@ -1805,68 +1806,68 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $12, %esp +; WIN32-NEXT: subl $16, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %ebp -; WIN32-NEXT: movl 4(%eax), %eax +; WIN32-NEXT: movl (%eax), %esi +; WIN32-NEXT: movl 4(%eax), %ebx +; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: imull %ecx, %edi +; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: addl %eax, %ecx -; WIN32-NEXT: addl %esi, %ecx -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %eax, %ebp +; WIN32-NEXT: addl %edi, %ebp +; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: addl %edi, %esi -; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: addl %ebx, %edi +; WIN32-NEXT: addl %eax, %edi +; WIN32-NEXT: addl %ecx, %eax ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %ecx, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: adcl %ebp, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: adcl $0, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull (%esp) # 4-byte Folded Reload -; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: addl %ecx, %ebp -; WIN32-NEXT: adcl %edi, %ebx -; WIN32-NEXT: setb %cl +; WIN32-NEXT: adcl $0, %esi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %ebp, %ecx +; WIN32-NEXT: adcl %esi, %ebx +; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: 
mull (%esp) # 4-byte Folded Reload +; WIN32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: movzbl %cl, %ecx -; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: xorl %ecx, %edx -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: orl %edx, %ecx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: adcl %edi, %edx +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: xorl %esi, %edx +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: orl %edx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %ebp, 4(%eax) +; WIN32-NEXT: movl %ecx, 4(%eax) ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $12, %esp +; WIN32-NEXT: addl $16, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -2221,29 +2222,30 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %ebp +; WIN32-NEXT: movl (%eax), %esi ; WIN32-NEXT: movl 4(%eax), %eax -; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: testl %ebx, %ebx ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %cl ; WIN32-NEXT: andb %dl, %cl -; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: seto %ch +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %esi ; WIN32-NEXT: seto %bl +; WIN32-NEXT: orb %ch, %bl +; WIN32-NEXT: orb %cl, %bl +; WIN32-NEXT: leal (%edi,%eax), %ecx ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: seto %ch -; WIN32-NEXT: orb %bl, %ch -; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: addl %ecx, %edx ; WIN32-NEXT: setb %cl -; WIN32-NEXT: orb %ch, %cl +; WIN32-NEXT: orb %bl, %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl %eax, (%esi) ; WIN32-NEXT: movl %edx, 4(%esi) @@ -2299,9 +2301,9 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl (%ecx), %ebp -; WIN32-NEXT: movl 4(%ecx), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl (%edx), %ebp +; WIN32-NEXT: movl 4(%edx), %esi ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %esi, %esi diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll index 080b3dd75ee9a9..6f98d626985aa5 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll @@ -220,18 +220,18 @@ declare i32 @__FrameHandler(...) 
define void @test5(ptr %s, i32 %n) { ; CHECK-LABEL: test5: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov w9, wzr -; CHECK-NEXT: add x8, x8, #19, lsl #12 // =77824 -; CHECK-NEXT: add x8, x8, #2176 -; CHECK-NEXT: cmp w9, w1 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: add x9, x9, #19, lsl #12 // =77824 +; CHECK-NEXT: add x9, x9, #2176 +; CHECK-NEXT: cmp w8, w1 ; CHECK-NEXT: b.ge .LBB4_2 ; CHECK-NEXT: .LBB4_1: // %while_body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str w9, [x8, #4] -; CHECK-NEXT: add w9, w9, #1 -; CHECK-NEXT: str w9, [x8] -; CHECK-NEXT: cmp w9, w1 +; CHECK-NEXT: str w8, [x9, #4] +; CHECK-NEXT: add w8, w8, #1 +; CHECK-NEXT: str w8, [x9] +; CHECK-NEXT: cmp w8, w1 ; CHECK-NEXT: b.lt .LBB4_1 ; CHECK-NEXT: .LBB4_2: // %while_end ; CHECK-NEXT: ret diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll index 07debadf34f52d..78ee680ae161ad 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll @@ -408,9 +408,9 @@ declare <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr, i32) nounwind readonly define void @testReuse(ptr %src, i32 %stride) nounwind ssp { ; A9-LABEL: testReuse: ; A9: @ %bb.0: @ %entry +; A9-NEXT: sub.w r2, r1, r1, lsl #2 ; A9-NEXT: sub.w r12, r0, r1, lsl #2 -; A9-NEXT: sub.w r0, r1, r1, lsl #2 -; A9-NEXT: lsls r2, r0, #1 +; A9-NEXT: lsls r2, r2, #1 ; A9-NEXT: movs r3, #0 ; A9-NEXT: .LBB5_1: @ %for.body ; A9-NEXT: @ =>This Inner Loop Header: Depth=1